diff --git a/python/cuvs_bench/cuvs_bench/_bin_format.py b/python/cuvs_bench/cuvs_bench/_bin_format.py new file mode 100644 index 0000000000..f92422474c --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/_bin_format.py @@ -0,0 +1,152 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +""" +On-disk header helpers for the cuvs-bench binary file format. + +cuvs-bench inherits the big-ann-benchmarks binary layout: a small header +listing ``n_rows`` and ``n_cols`` followed by a dense ``n_rows * n_cols`` +array of the dtype implied by the file extension. Two layouts are supported: + +- **Legacy**: ``[uint32 n_rows, uint32 n_cols, data ...]`` (8-byte header). + This is what every existing ``.fbin`` / ``.ibin`` / ``.u8bin`` / ``.i8bin`` + / ``.f16bin`` / ``.hbin`` / ``.u64bin`` file on disk uses today. + +- **Extended**: ``[uint64 n_rows, uint64 n_cols, data ...]`` (16-byte header). + For datasets whose ``n_rows`` or ``n_cols`` exceeds ``UINT32_MAX`` (~4.29B). + +Detection is **size-based**: a well-formed cuvs-bench binary is exactly +``header_bytes + n_rows * n_cols * itemsize`` bytes long. :func:`read_bin_header` reads the first 16 bytes +of the file and: + +1. Tries the legacy layout (first 8 bytes as two ``uint32``s, 8-byte + header). The layout is accepted if ``8 + n_rows * n_cols * itemsize`` + matches the on-disk file size. +2. Otherwise tries the extended layout (first 16 bytes as two + ``uint64``s, 16-byte header). Accepted if + ``16 + n_rows * n_cols * itemsize`` matches the file size instead. +3. If neither layout matches, raises ``ValueError`` -- the file is + truncated, padded, or has a mismatched dtype extension. +""" + +from __future__ import annotations + +import os +import struct +from typing import BinaryIO, Tuple + +import numpy as np + +UINT32_MAX = (1 << 32) - 1 + +LEGACY_HEADER_BYTES = 8 +EXTENDED_HEADER_BYTES = 16 + + +def read_bin_header(path: str, itemsize: int) -> Tuple[int, int, int]: + """Read the header of a cuvs-bench binary file. + + Auto-detects the on-disk layout from the file size by checking which + of the two layouts (legacy 8-byte uint32 header, extended 16-byte uint64 + header) makes ``file_size == header_bytes + n_rows * n_cols * itemsize`` + balance. + + Parameters + ---------- + path : str + Path to the binary file. + itemsize : int + Per-element size in bytes (e.g. ``4`` for ``float32``, ``1`` for + ``int8``) used for the size-equation check. + + Returns + ------- + (n_rows, n_cols, header_bytes) : Tuple[int, int, int] + Row count, column count, and the number of bytes the header + occupies on disk (``8`` for legacy, ``16`` for extended). + + Raises + ------ + ValueError + If neither the legacy nor the extended interpretation matches. + FileNotFoundError + If ``path`` does not exist. + """ + if itemsize < 1: + raise ValueError( + f"itemsize must be a positive integer, got {itemsize!r}" + ) + file_size = os.path.getsize(path) + with open(path, "rb") as f: + head = f.read(EXTENDED_HEADER_BYTES) + + if len(head) < LEGACY_HEADER_BYTES: + raise ValueError( + f"File too small to contain a valid header (expected at least " + f"{LEGACY_HEADER_BYTES} bytes, got {len(head)}): {path}" + ) + + n_rows_32, n_cols_32 = struct.unpack(" int: + """Write the canonical cuvs-bench binary header at the current position. + + The legacy 8-byte uint32 layout is used whenever both ``n_rows`` and + ``n_cols`` fit in a ``uint32``. The 16-byte uint64 layout is used + otherwise, or when explicitly requested via ``size_dtype=np.uint64``. + + Parameters + ---------- + f : BinaryIO + Open binary file handle, positioned where the header should go. + n_rows, n_cols : int + Header values to write. Must be non-negative. + size_dtype : numpy dtype + ``np.uint32`` for the legacy 8-byte header (default), or + ``np.uint64`` to force the extended 16-byte header. + + Returns + ------- + int + Number of bytes written (``8`` for legacy, ``16`` for extended). + """ + if n_rows < 0 or n_cols < 0: + raise ValueError( + f"n_rows and n_cols must be non-negative, got ({n_rows}, {n_cols})" + ) + use_uint64 = ( + np.dtype(size_dtype) == np.uint64 + or n_rows > UINT32_MAX + or n_cols > UINT32_MAX + ) + if use_uint64: + f.write(struct.pack(" np.ndarray: """ Read a binary vector file into a numpy array. - Supports the standard big-ann-bench binary format used by cuvs-bench - datasets: a 4-byte uint32 ``n_rows``, a 4-byte uint32 ``n_cols``, - followed by ``n_rows * n_cols`` elements of the dtype inferred from - the file extension via ``dtype_from_filename``. + Supports the cuvs-bench binary format with either the legacy 8-byte + ``[uint32 n_rows, uint32 n_cols]`` header or the extended 16-byte + ``[uint64 n_rows, uint64 n_cols]`` header used for datasets with more + than ``UINT32_MAX`` rows or columns. The layout is auto-detected from + the file size by :func:`cuvs_bench._bin_format.read_bin_header`. Parameters ---------- path : str Path to the binary file. The dtype is inferred from the extension: ``.fbin`` (float32), ``.f16bin`` (float16), ``.u8bin`` (uint8), - ``.i8bin`` (int8), ``.ibin`` (int32). + ``.i8bin`` (int8), ``.ibin`` (int32), ``.u64bin`` (uint64). subset_size : Optional[int] If provided, only the first ``subset_size`` rows are loaded. @@ -93,27 +98,24 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray: or the file is truncated. """ dtype = dtype_from_filename(path) - if subset_size is not None and subset_size < 1: + itemsize = np.dtype(dtype).itemsize + if subset_size is not None and ( + isinstance(subset_size, float) or subset_size < 1 + ): raise ValueError( f"subset_size must be a positive integer, got {subset_size}" ) + n_rows, n_cols, header_bytes = read_bin_header(path, itemsize) + if subset_size is not None: + n_rows = min(n_rows, subset_size) with open(path, "rb") as f: - header = f.read(8) - if len(header) < 8: - raise ValueError( - f"File too small to contain a valid header (expected 8 bytes, " - f"got {len(header)}): {path}" - ) - n_rows = int(np.frombuffer(header[:4], dtype=np.uint32)[0]) - n_cols = int(np.frombuffer(header[4:], dtype=np.uint32)[0]) - if subset_size is not None: - n_rows = min(n_rows, subset_size) - expected_bytes = n_rows * n_cols * np.dtype(dtype).itemsize + f.seek(header_bytes) + expected_bytes = n_rows * n_cols * itemsize raw = f.read(expected_bytes) if len(raw) < expected_bytes: raise ValueError( f"File is truncated: expected {expected_bytes} bytes of data " - f"({n_rows} rows x {n_cols} cols x {np.dtype(dtype).itemsize} bytes), " + f"({n_rows} rows x {n_cols} cols x {itemsize} bytes), " f"got {len(raw)}: {path}" ) data = np.frombuffer(raw, dtype=dtype) diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py index 43c03f4322..4315c8e3ac 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py @@ -9,7 +9,14 @@ import sys import warnings -from .utils import memmap_bin_file, suffix_from_dtype, write_bin +from .utils import ( + groundtruth_neighbors_filename, + memmap_bin_file, + offset_neighbor_indices, + suffix_from_dtype, + write_bin, + write_groundtruth_neighbors, +) def import_with_fallback(primary_lib, secondary_lib=None, alias=None): @@ -193,7 +200,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): D, Ind = cpu_search(X, queries, k, metric=metric) D, Ind = xp.asarray(D), xp.asarray(Ind) - Ind += i # shift neighbor index by offset i + Ind = offset_neighbor_indices(Ind, i, n_samples) if distances is None: distances = D @@ -358,9 +365,11 @@ def main(): print("Calculating true nearest neighbors") distances, indices = calc_truth(dataset, queries, args.k, args.metric) - write_bin( - os.path.join(args.output, "groundtruth.neighbors.ibin"), - indices.astype(xp.uint32), + n_base = dataset.shape[0] + write_groundtruth_neighbors( + os.path.join(args.output, groundtruth_neighbors_filename(n_base)), + indices, + n_base, ) write_bin( os.path.join(args.output, "groundtruth.distances.fbin"), diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py index 72fb5b4a07..d3ee4b3479 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # @@ -7,6 +7,8 @@ import numpy as np +from cuvs_bench._bin_format import read_bin_header, write_bin_header + def dtype_from_filename(filename): ext = os.path.splitext(filename)[1] @@ -16,6 +18,8 @@ def dtype_from_filename(filename): return np.float16 elif ext == ".ibin": return np.int32 + elif ext == ".u64bin": + return np.uint64 elif ext == ".u8bin": return np.ubyte elif ext == ".i8bin": @@ -31,6 +35,8 @@ def suffix_from_dtype(dtype): return ".hbin" elif dtype == np.int32: return ".ibin" + elif dtype == np.uint64: + return ".u64bin" elif dtype == np.ubyte: return ".u8bin" elif dtype == np.byte: @@ -39,30 +45,102 @@ def suffix_from_dtype(dtype): raise RuntimeError("Not supported dtype extension" + dtype) +def neighbor_index_dtype(n_base: int) -> np.dtype: + """Return the dtype used to store neighbor row IDs for a base set size.""" + if n_base > np.iinfo(np.int32).max: + return np.uint64 + return np.int32 + + +def neighbor_index_accumulator_dtype(n_base: int) -> np.dtype: + """Return the in-memory dtype for neighbor IDs during GT computation. + + cuVS brute-force search returns ``int64`` neighbors. Use ``int64`` for + large bases so batch offsets up to multi-billion row counts do not + overflow; cast to :func:`neighbor_index_dtype` only when writing files. + """ + if n_base > np.iinfo(np.int32).max: + return np.int64 + return np.int32 + + +def groundtruth_neighbors_filename(n_base: int) -> str: + """Return the ground-truth neighbors filename for a base set size.""" + if n_base > np.iinfo(np.int32).max: + return "groundtruth.neighbors.u64bin" + return "groundtruth.neighbors.ibin" + + +def offset_neighbor_indices(indices, batch_offset: int, n_base: int): + """Shift local neighbor IDs by a batch offset without integer overflow.""" + dtype = neighbor_index_accumulator_dtype(n_base) + return indices.astype(dtype) + batch_offset + + +def write_groundtruth_neighbors(path, indices, n_base: int): + """Write a ground-truth neighbor matrix using the correct on-disk dtype.""" + storage_dtype = neighbor_index_dtype(n_base) + data = np.asarray(indices, dtype=storage_dtype) + write_bin(path, data) + + def memmap_bin_file( bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32 ): - extent_itemsize = np.dtype(size_dtype).itemsize - offset = int(extent_itemsize) * 2 + """Memory-map a cuvs-bench binary file. + + Supports both the legacy 8-byte ``[uint32 n_rows, uint32 n_cols]`` and + the extended 16-byte ``[uint64 n_rows, uint64 n_cols]`` headers. In read + mode the layout is auto-detected from the file size; in write mode the + legacy layout is used unless ``size_dtype=np.uint64`` or one of the shape + dimensions exceeds ``UINT32_MAX``. + + Parameters + ---------- + bin_file : str or None + Path to the binary file. ``None`` short-circuits and returns ``None`` + (preserves the historical "skip optional file" behavior). + dtype : numpy dtype or None + Element dtype. If ``None``, inferred from the file extension via + :func:`dtype_from_filename`. + shape : tuple or None + Read mode: optionally override ``(n_rows, n_cols)`` from the header; + any ``None`` entries are filled in from the header value. Write mode: + required ``(n_rows, n_cols)`` of the file to create. + mode : str + Standard ``np.memmap`` mode string (``"r"``, ``"r+"``, ``"w+"``). + size_dtype : numpy dtype + Write mode only: ``np.uint32`` for the legacy 8-byte header (default), + or ``np.uint64`` to force the extended 16-byte header. Ignored in read + mode (auto-detected). + """ if bin_file is None: return None if dtype is None: dtype = dtype_from_filename(bin_file) + itemsize = np.dtype(dtype).itemsize + + if shape is not None and len(shape) != 2: + raise ValueError( + f"shape must have exactly 2 dimensions (n_rows, n_cols), got {shape!r}" + ) if mode[0] == "r": - a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,)) + n_rows, n_cols, header_bytes = read_bin_header(bin_file, itemsize) if shape is None: - shape = (a[0], a[1]) + final_shape = (n_rows, n_cols) else: - shape = tuple( - [ - aval if sval is None else sval - for aval, sval in zip(a, shape) - ] + header_dims = (n_rows, n_cols) + final_shape = tuple( + aval if sval is None else sval + for aval, sval in zip(header_dims, shape) ) - return np.memmap( - bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape + bin_file, + mode=mode, + dtype=dtype, + offset=header_bytes, + shape=final_shape, ) elif mode[0] == "w": if shape is None: @@ -72,19 +150,22 @@ def memmap_bin_file( dirname = os.path.dirname(bin_file) if len(dirname) > 0: os.makedirs(dirname, exist_ok=True) - a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,)) - a[0] = shape[0] - a[1] = shape[1] - a.flush() - del a - fp = np.memmap( - bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape + with open(bin_file, "wb") as f: + header_bytes = write_bin_header( + f, shape[0], shape[1], size_dtype=size_dtype + ) + return np.memmap( + bin_file, + mode="r+", + dtype=dtype, + offset=header_bytes, + shape=shape, ) - return fp def write_bin(fname, data): + """Write a 2-D numpy array to a cuvs-bench binary file.""" print("writing", fname, data.shape, data.dtype, "...") with open(fname, "wb") as f: - np.asarray(data.shape, dtype=np.uint32).tofile(f) + write_bin_header(f, data.shape[0], data.shape[1]) data.tofile(f) diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py index 5647ece771..0fba915cae 100644 --- a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py +++ b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 @@ -9,21 +9,27 @@ import numpy as np +from cuvs_bench._bin_format import read_bin_header, write_bin_header + def read_fbin(fname): - shape = np.fromfile(fname, dtype=np.uint32, count=2) - if float(shape[0]) * shape[1] * 4 > 2_000_000_000: - data = np.memmap(fname, dtype=np.float32, offset=8, mode="r").reshape( - shape - ) + itemsize = np.dtype(np.float32).itemsize + n_rows, n_cols, header_bytes = read_bin_header(fname, itemsize) + shape = (n_rows, n_cols) + if float(n_rows) * n_cols * itemsize > 2_000_000_000: + data = np.memmap( + fname, dtype=np.float32, offset=header_bytes, mode="r" + ).reshape(shape) else: - data = np.fromfile(fname, dtype=np.float32, offset=8).reshape(shape) + data = np.fromfile( + fname, dtype=np.float32, offset=header_bytes + ).reshape(shape) return data def write_bin(fname, data): with open(fname, "wb") as f: - np.asarray(data.shape, dtype=np.uint32).tofile(f) + write_bin_header(f, data.shape[0], data.shape[1]) data.tofile(f) diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py index af6a7aac31..2998a5dcb2 100644 --- a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py +++ b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 @@ -8,6 +8,8 @@ import h5py import numpy as np +from cuvs_bench._bin_format import write_bin_header + def normalize(x): norm = np.linalg.norm(x, axis=1) @@ -16,7 +18,7 @@ def normalize(x): def write_bin(fname, data): with open(fname, "wb") as f: - np.asarray(data.shape, dtype=np.uint32).tofile(f) + write_bin_header(f, data.shape[0], data.shape[1]) data.tofile(f) diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py index db1fd7b137..01dd803a07 100644 --- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py +++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py @@ -6,10 +6,19 @@ Unit tests for shared backend utilities and Dataset transparent loading. """ +import struct + import numpy as np import pytest import yaml +from cuvs_bench._bin_format import ( + EXTENDED_HEADER_BYTES, + LEGACY_HEADER_BYTES, + UINT32_MAX, + read_bin_header, + write_bin_header, +) from cuvs_bench.backends import Dataset from cuvs_bench.backends._utils import ( compute_recall, @@ -17,13 +26,23 @@ expand_param_grid, load_vectors, ) +from cuvs_bench.generate_groundtruth.utils import ( + groundtruth_neighbors_filename, + memmap_bin_file, + neighbor_index_accumulator_dtype, + neighbor_index_dtype, + offset_neighbor_indices, + write_groundtruth_neighbors, +) from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader -def _write_test_bin(path, data): - """Write a numpy array in big-ann-bench binary format.""" +def _write_test_bin(path, data, *, size_dtype=np.uint32): + """Write a numpy array in cuvs-bench binary format.""" with open(path, "wb") as f: - np.asarray(data.shape, dtype=np.uint32).tofile(f) + write_bin_header( + f, data.shape[0], data.shape[1], size_dtype=size_dtype + ) data.tofile(f) @@ -42,6 +61,10 @@ def test_ibin(self): """Test .ibin maps to int32.""" assert dtype_from_filename("groundtruth.ibin") == np.int32 + def test_u64bin(self): + """Test .u64bin maps to uint64.""" + assert dtype_from_filename("groundtruth.neighbors.u64bin") == np.uint64 + def test_u8bin(self): """Test .u8bin maps to uint8.""" assert dtype_from_filename("vectors.u8bin") == np.ubyte @@ -170,7 +193,7 @@ def test_truncated_data(self, tmp_path): np.array([10, 4], dtype=np.uint32).tofile(f) np.random.rand(5, 4).astype(np.float32).tofile(f) - with pytest.raises(ValueError, match="File is truncated"): + with pytest.raises(ValueError, match="does not match either"): load_vectors(path) def test_file_not_found(self): @@ -178,6 +201,322 @@ def test_file_not_found(self): with pytest.raises(FileNotFoundError): load_vectors("/nonexistent/path/vectors.fbin") + def test_load_uint64_header(self, tmp_path): + """``load_vectors`` reads files written with the extended uint64 header.""" + data = np.random.rand(40, 16).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data, size_dtype=np.uint64) + + # Sanity check: file really uses the extended layout. + assert ( + tmp_path.joinpath("test.fbin").stat().st_size + == EXTENDED_HEADER_BYTES + data.nbytes + ) + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + def test_load_uint64_header_with_subset(self, tmp_path): + """``subset_size`` works regardless of which header layout was used.""" + data = np.random.rand(50, 8).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data, size_dtype=np.uint64) + + loaded = load_vectors(path, subset_size=12) + assert loaded.shape == (12, 8) + np.testing.assert_array_equal(loaded, data[:12]) + + @pytest.mark.parametrize( + "ext, dtype, size_dtype", + [ + (".fbin", np.float32, np.uint32), + (".fbin", np.float32, np.uint64), + (".f16bin", np.float16, np.uint32), + (".f16bin", np.float16, np.uint64), + (".ibin", np.int32, np.uint32), + (".ibin", np.int32, np.uint64), + (".u64bin", np.uint64, np.uint32), + (".u64bin", np.uint64, np.uint64), + (".u8bin", np.uint8, np.uint32), + (".u8bin", np.uint8, np.uint64), + (".i8bin", np.int8, np.uint32), + (".i8bin", np.int8, np.uint64), + ], + ) + def test_load_roundtrip_all_dtypes(self, tmp_path, ext, dtype, size_dtype): + """Round-trip every supported dtype through both header layouts.""" + if np.issubdtype(dtype, np.integer): + info = np.iinfo(dtype) + data = np.random.randint( + info.min, info.max, size=(25, 7), dtype=dtype + ) + if dtype == np.uint64: + data[0, 0] = np.iinfo(np.int32).max + 42 + else: + data = np.random.rand(25, 7).astype(dtype) + path = str(tmp_path / f"test{ext}") + _write_test_bin(path, data, size_dtype=size_dtype) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + +class TestGroundtruthNeighborFormat: + """Tests for large-base ground-truth neighbor index format selection.""" + + def test_neighbor_index_dtype_small_base(self): + assert neighbor_index_dtype(1_000_000) == np.int32 + + def test_neighbor_index_dtype_large_base(self): + assert neighbor_index_dtype(np.iinfo(np.int32).max + 1) == np.uint64 + + def test_neighbor_index_accumulator_dtype_large_base(self): + assert ( + neighbor_index_accumulator_dtype(np.iinfo(np.int32).max + 1) + == np.int64 + ) + + def test_groundtruth_neighbors_filename_small_base(self): + assert ( + groundtruth_neighbors_filename(1_000_000) + == "groundtruth.neighbors.ibin" + ) + + def test_groundtruth_neighbors_filename_large_base(self): + assert ( + groundtruth_neighbors_filename(np.iinfo(np.int32).max + 1) + == "groundtruth.neighbors.u64bin" + ) + + def test_load_u64bin_preserves_large_indices(self, tmp_path): + """uint64 GT files preserve neighbor IDs above INT32_MAX.""" + large_id = np.iinfo(np.int32).max + 12345 + indices = np.array([[large_id, 0, 1]], dtype=np.uint64) + path = str(tmp_path / "gt.u64bin") + _write_test_bin(path, indices) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, indices) + + def test_offset_neighbor_indices_small_base(self): + local = np.array([[0, 1, 2]], dtype=np.uint32) + offset = offset_neighbor_indices(local, 1000, 1_000_000) + assert offset.dtype == np.int32 + np.testing.assert_array_equal(offset, [[1000, 1001, 1002]]) + + def test_offset_neighbor_indices_large_batch_offset(self): + """Search-local IDs must not wrap when batch offset exceeds INT32_MAX.""" + batch_offset = np.iinfo(np.int32).max + 1 + n_base = batch_offset + 10 + local = np.array([[0, 1, 2]], dtype=np.int64) + offset = offset_neighbor_indices(local, batch_offset, n_base) + assert offset.dtype == np.int64 + np.testing.assert_array_equal( + offset, + [[batch_offset, batch_offset + 1, batch_offset + 2]], + ) + + def test_write_groundtruth_neighbors_round_trip(self, tmp_path): + """GT write/load preserves neighbor IDs above INT32_MAX.""" + n_base = np.iinfo(np.int32).max + 1 + large_id = n_base + 999 + indices = np.array([[large_id, large_id - 1, 0]], dtype=np.int64) + path = str(tmp_path / groundtruth_neighbors_filename(n_base)) + write_groundtruth_neighbors(path, indices, n_base) + + loaded = load_vectors(path) + assert loaded.dtype == np.uint64 + np.testing.assert_array_equal(loaded, indices.astype(np.uint64)) + + def test_dataset_lazy_load_u64bin_groundtruth(self, tmp_path): + """Dataset loads .u64bin ground truth with IDs above INT32_MAX.""" + large_id = np.iinfo(np.int32).max + 12345 + gt = np.array([[large_id, 0, 1]], dtype=np.uint64) + path = str(tmp_path / "groundtruth.neighbors.u64bin") + _write_test_bin(path, gt) + + dataset = Dataset(name="test", groundtruth_neighbors_file=path) + np.testing.assert_array_equal(dataset.groundtruth_neighbors, gt) + + +class TestBinHeaderHelpers: + """Tests for ``cuvs_bench._bin_format.read_bin_header`` / ``write_bin_header``.""" + + def test_write_legacy_returns_8_bytes(self, tmp_path): + """Small shapes should write the 8-byte uint32 header by default.""" + path = tmp_path / "h.bin" + with open(path, "wb") as f: + n = write_bin_header(f, 7, 3) + assert n == LEGACY_HEADER_BYTES + assert path.stat().st_size == LEGACY_HEADER_BYTES + + def test_write_size_dtype_uint64_returns_16_bytes(self, tmp_path): + """``size_dtype=np.uint64`` should write the 16-byte uint64 header.""" + path = tmp_path / "h.bin" + with open(path, "wb") as f: + n = write_bin_header(f, 7, 3, size_dtype=np.uint64) + assert n == EXTENDED_HEADER_BYTES + assert path.stat().st_size == EXTENDED_HEADER_BYTES + + def test_write_auto_promotes_to_uint64_when_overflowing(self, tmp_path): + """Shapes that don't fit in uint32 should auto-promote to uint64.""" + path = tmp_path / "h.bin" + with open(path, "wb") as f: + n = write_bin_header(f, UINT32_MAX + 1, 4) + assert n == EXTENDED_HEADER_BYTES + + def test_write_negative_raises(self, tmp_path): + """Negative dimensions are rejected.""" + path = tmp_path / "h.bin" + with open(path, "wb") as f: + with pytest.raises(ValueError, match="non-negative"): + write_bin_header(f, -1, 4) + + def test_read_legacy_round_trip(self, tmp_path): + """Legacy round-trip: write 8-byte header, read it back.""" + path = tmp_path / "x.fbin" + data = np.random.rand(11, 5).astype(np.float32) + with open(path, "wb") as f: + write_bin_header(f, data.shape[0], data.shape[1]) + data.tofile(f) + n_rows, n_cols, hbytes = read_bin_header(str(path), itemsize=4) + assert (n_rows, n_cols, hbytes) == (11, 5, LEGACY_HEADER_BYTES) + + def test_read_extended_round_trip(self, tmp_path): + """Extended round-trip: write 16-byte header, read it back.""" + path = tmp_path / "x.fbin" + data = np.random.rand(11, 5).astype(np.float32) + with open(path, "wb") as f: + write_bin_header( + f, data.shape[0], data.shape[1], size_dtype=np.uint64 + ) + data.tofile(f) + n_rows, n_cols, hbytes = read_bin_header(str(path), itemsize=4) + assert (n_rows, n_cols, hbytes) == (11, 5, EXTENDED_HEADER_BYTES) + + def test_read_synthesized_huge_extended_header(self, tmp_path): + """Extended-header file with >UINT32_MAX rows and positive n_cols. + + We can't materialize the full data section, so write the header and + truncate to the exact file size ``read_bin_header`` expects. + """ + path = tmp_path / "huge.fbin" + n_rows = UINT32_MAX + 17 + n_cols = 4 + itemsize = 4 + expected_size = EXTENDED_HEADER_BYTES + n_rows * n_cols * itemsize + with open(path, "wb") as f: + write_bin_header(f, n_rows, n_cols) + f.truncate(expected_size) + + assert path.stat().st_size == expected_size + got_rows, got_cols, hbytes = read_bin_header( + str(path), itemsize=itemsize + ) + assert got_rows == n_rows + assert got_cols == n_cols + assert hbytes == EXTENDED_HEADER_BYTES + + def test_read_file_too_small_raises(self, tmp_path): + """A file shorter than the legacy header raises a clear error.""" + path = tmp_path / "x.fbin" + path.write_bytes(b"\x00\x00\x00") + with pytest.raises(ValueError, match="File too small"): + read_bin_header(str(path), itemsize=4) + + def test_read_size_mismatch_raises(self, tmp_path): + """Header values that don't balance the file size are rejected.""" + path = tmp_path / "x.fbin" + with open(path, "wb") as f: + f.write(struct.pack("