From c1df1175595edf8cdf187900327bf018765c7a13 Mon Sep 17 00:00:00 2001
From: jinsolp <jinsolp@nvidia.com>
Date: Tue, 26 May 2026 20:24:21 +0000
Subject: [PATCH 1/6] uint64 header support based on size

---
 python/cuvs_bench/cuvs_bench/_bin_format.py   | 144 ++++++++++++++
 .../cuvs_bench/cuvs_bench/backends/_utils.py  |  30 ++-
 .../cuvs_bench/generate_groundtruth/utils.py  |  85 ++++++---
 .../cuvs_bench/get_dataset/fbin_to_f16bin.py  |  26 ++-
 .../cuvs_bench/get_dataset/hdf5_to_fbin.py    |  10 +-
 .../cuvs_bench/cuvs_bench/tests/test_utils.py | 180 +++++++++++++++++-
 6 files changed, 420 insertions(+), 55 deletions(-)
 create mode 100644 python/cuvs_bench/cuvs_bench/_bin_format.py

diff --git a/python/cuvs_bench/cuvs_bench/_bin_format.py b/python/cuvs_bench/cuvs_bench/_bin_format.py
new file mode 100644
index 0000000000..2611020fc5
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/_bin_format.py
@@ -0,0 +1,144 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+"""
+On-disk header helpers for the cuvs-bench binary file format.
+
+cuvs-bench inherits the big-ann-benchmarks binary layout: a small header
+listing ``n_rows`` and ``n_cols`` followed by a dense ``n_rows * n_cols``
+array of the dtype implied by the file extension. Two layouts are supported:
+
+- **Legacy**:  ``[uint32 n_rows, uint32 n_cols, data ...]``  (8-byte header).
+  This is what every existing ``.fbin`` / ``.ibin`` / ``.u8bin`` / ``.i8bin``
+  / ``.f16bin`` / ``.hbin`` file on disk uses today.
+
+- **Extended**: ``[uint64 n_rows, uint64 n_cols, data ...]``  (16-byte header).
+  For datasets whose ``n_rows`` or ``n_cols`` exceeds ``UINT32_MAX`` (~4.29B).
+
+Detection is **size-based**: a well-formed cuvs-bench binary is exactly
+``header_bytes + n_rows * n_cols * itemsize`` bytes long.
+:func:`read_bin_header` reads up to the first 16 bytes of the file and:
+
+1. Tries the legacy layout (first 8 bytes as two ``uint32``s, 8-byte
+   header). The layout is accepted if ``8 + n_rows * n_cols * itemsize``
+   matches the on-disk file size.
+2. Otherwise tries the extended layout (first 16 bytes as two
+   ``uint64``s, 16-byte header). Accepted if
+   ``16 + n_rows * n_cols * itemsize`` matches the file size instead.
+3. If neither layout matches, raises ``ValueError`` -- the file is
+   truncated, padded, or has a mismatched dtype extension.
+"""
+
+from __future__ import annotations
+
+import os
+import struct
+from typing import BinaryIO, Tuple
+
+UINT32_MAX = (1 << 32) - 1
+
+LEGACY_HEADER_BYTES = 8
+EXTENDED_HEADER_BYTES = 16
+
+
+def read_bin_header(path: str, itemsize: int) -> Tuple[int, int, int]:
+    """Read the header of a cuvs-bench binary file.
+
+    Auto-detects the on-disk layout from the file size by checking which
+    of the two layouts (legacy 8-byte uint32 header, extended 16-byte uint64
+    header) makes ``file_size == header_bytes + n_rows * n_cols * itemsize``
+    balance.
+
+    Parameters
+    ----------
+    path : str
+        Path to the binary file.
+    itemsize : int
+        Per-element size in bytes (e.g. ``4`` for ``float32``, ``1`` for
+        ``int8``). Used purely for the size-equation check; the data
+        section that follows the header is never read.
+
+    Returns
+    -------
+    (n_rows, n_cols, header_bytes) : Tuple[int, int, int]
+        Row count, column count, and the number of bytes the header
+        occupies on disk (``8`` for legacy, ``16`` for extended). Callers
+        seeking to the data start should use ``header_bytes`` rather than
+        a hardcoded offset.
+
+    Raises
+    ------
+    ValueError
+        If neither the legacy nor the extended interpretation matches.
+    FileNotFoundError
+        If ``path`` does not exist.
+    """
+    file_size = os.path.getsize(path)
+    with open(path, "rb") as f:
+        head = f.read(EXTENDED_HEADER_BYTES)
+
+    if len(head) < LEGACY_HEADER_BYTES:
+        raise ValueError(
+            f"File too small to contain a valid header (expected at least "
+            f"{LEGACY_HEADER_BYTES} bytes, got {len(head)}): {path}"
+        )
+
+    n_rows_32, n_cols_32 = struct.unpack("<II", head[:LEGACY_HEADER_BYTES])
+    if file_size == LEGACY_HEADER_BYTES + n_rows_32 * n_cols_32 * itemsize:
+        return int(n_rows_32), int(n_cols_32), LEGACY_HEADER_BYTES
+
+    if len(head) == EXTENDED_HEADER_BYTES:
+        n_rows_64, n_cols_64 = struct.unpack("<QQ", head)
+        if (
+            file_size
+            == EXTENDED_HEADER_BYTES + n_rows_64 * n_cols_64 * itemsize
+        ):
+            return int(n_rows_64), int(n_cols_64), EXTENDED_HEADER_BYTES
+
+    raise ValueError(
+        f"File size {file_size:,} bytes does not match either the legacy "
+        f"(8-byte uint32) or extended (16-byte uint64) header layout for "
+        f"itemsize={itemsize}: {path}. The file may be truncated, padded, "
+        f"or have a mismatched dtype extension."
+    )
+
+
+def write_bin_header(
+    f: BinaryIO,
+    n_rows: int,
+    n_cols: int,
+    *,
+    force_uint64: bool = False,
+) -> int:
+    """Write the canonical cuvs-bench binary header at the current position.
+
+    The legacy 8-byte uint32 layout is used whenever both ``n_rows`` and
+    ``n_cols`` fit in a ``uint32``. The 16-byte uint64 layout is used
+    otherwise, or when explicitly requested via ``force_uint64=True``.
+
+    Parameters
+    ----------
+    f : BinaryIO
+        Open binary file handle, positioned where the header should go.
+    n_rows, n_cols : int
+        Header values to write. Must be non-negative.
+    force_uint64 : bool
+        If ``True``, always write the 16-byte uint64 layout regardless of
+        whether the values fit in ``uint32``. Defaults to ``False``.
+
+    Returns
+    -------
+    int
+        Number of bytes written (``8`` for legacy, ``16`` for extended).
+    """
+    if n_rows < 0 or n_cols < 0:
+        raise ValueError(
+            f"n_rows and n_cols must be non-negative, got ({n_rows}, {n_cols})"
+        )
+    if force_uint64 or n_rows > UINT32_MAX or n_cols > UINT32_MAX:
+        f.write(struct.pack("<QQ", int(n_rows), int(n_cols)))
+        return EXTENDED_HEADER_BYTES
+    f.write(struct.pack("<II", int(n_rows), int(n_cols)))
+    return LEGACY_HEADER_BYTES
diff --git a/python/cuvs_bench/cuvs_bench/backends/_utils.py b/python/cuvs_bench/cuvs_bench/backends/_utils.py
index b931f3128b..a85d57c94a 100644
--- a/python/cuvs_bench/cuvs_bench/backends/_utils.py
+++ b/python/cuvs_bench/cuvs_bench/backends/_utils.py
@@ -27,6 +27,8 @@
 
 import numpy as np
 
+from cuvs_bench._bin_format import read_bin_header
+
 
 def dtype_from_filename(filename):
     """Map file extension to numpy dtype.
@@ -65,10 +67,11 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
     """
     Read a binary vector file into a numpy array.
 
-    Supports the standard big-ann-bench binary format used by cuvs-bench
-    datasets: a 4-byte uint32 ``n_rows``, a 4-byte uint32 ``n_cols``,
-    followed by ``n_rows * n_cols`` elements of the dtype inferred from
-    the file extension via ``dtype_from_filename``.
+    Supports the cuvs-bench binary format with either the legacy 8-byte
+    ``[uint32 n_rows, uint32 n_cols]`` header or the extended 16-byte
+    ``[uint64 n_rows, uint64 n_cols]`` header used for datasets with more
+    than ``UINT32_MAX`` rows or columns. The layout is auto-detected from
+    the file size by :func:`cuvs_bench._bin_format.read_bin_header`.
 
     Parameters
     ----------
@@ -93,27 +96,22 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
         or the file is truncated.
     """
     dtype = dtype_from_filename(path)
+    itemsize = np.dtype(dtype).itemsize
     if subset_size is not None and subset_size < 1:
         raise ValueError(
             f"subset_size must be a positive integer, got {subset_size}"
         )
+    n_rows, n_cols, header_bytes = read_bin_header(path, itemsize)
+    if subset_size is not None:
+        n_rows = min(n_rows, subset_size)
     with open(path, "rb") as f:
-        header = f.read(8)
-        if len(header) < 8:
-            raise ValueError(
-                f"File too small to contain a valid header (expected 8 bytes, "
-                f"got {len(header)}): {path}"
-            )
-        n_rows = int(np.frombuffer(header[:4], dtype=np.uint32)[0])
-        n_cols = int(np.frombuffer(header[4:], dtype=np.uint32)[0])
-        if subset_size is not None:
-            n_rows = min(n_rows, subset_size)
-        expected_bytes = n_rows * n_cols * np.dtype(dtype).itemsize
+        f.seek(header_bytes)
+        expected_bytes = n_rows * n_cols * itemsize
         raw = f.read(expected_bytes)
         if len(raw) < expected_bytes:
             raise ValueError(
                 f"File is truncated: expected {expected_bytes} bytes of data "
-                f"({n_rows} rows x {n_cols} cols x {np.dtype(dtype).itemsize} bytes), "
+                f"({n_rows} rows x {n_cols} cols x {itemsize} bytes), "
                 f"got {len(raw)}: {path}"
             )
         data = np.frombuffer(raw, dtype=dtype)
diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
index 72fb5b4a07..11f8ee1762 100644
--- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
+++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
@@ -1,5 +1,5 @@
 #
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 #
 
@@ -7,6 +7,8 @@
 
 import numpy as np
 
+from cuvs_bench._bin_format import read_bin_header, write_bin_header
+
 
 def dtype_from_filename(filename):
     ext = os.path.splitext(filename)[1]
@@ -40,29 +42,56 @@ def suffix_from_dtype(dtype):
 
 
 def memmap_bin_file(
-    bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32
+    bin_file, dtype, shape=None, mode="r", *, force_uint64=False
 ):
-    extent_itemsize = np.dtype(size_dtype).itemsize
-    offset = int(extent_itemsize) * 2
+    """Memory-map a cuvs-bench binary file.
+
+    Supports both the legacy 8-byte ``[uint32 n_rows, uint32 n_cols]`` and
+    the extended 16-byte ``[uint64 n_rows, uint64 n_cols]`` headers. In read
+    mode the layout is auto-detected from the file size; in write mode the
+    legacy layout is used unless ``force_uint64=True`` or one of the shape
+    dimensions exceeds ``UINT32_MAX``.
+
+    Parameters
+    ----------
+    bin_file : str or None
+        Path to the binary file. ``None`` short-circuits and returns ``None``
+        (preserves the historical "skip optional file" behavior).
+    dtype : numpy dtype or None
+        Element dtype. If ``None``, inferred from the file extension via
+        :func:`dtype_from_filename`.
+    shape : tuple or None
+        Read mode: optionally override ``(n_rows, n_cols)`` from the header;
+        any ``None`` entries are filled in from the header value. Write mode:
+        required ``(n_rows, n_cols)`` of the file to create.
+    mode : str
+        Standard ``np.memmap`` mode string (``"r"``, ``"r+"``, ``"w+"``).
+    force_uint64 : bool
+        Write mode only: force the extended uint64 header even when the
+        shape would fit in uint32. Ignored in read mode (auto-detected).
+    """
     if bin_file is None:
         return None
     if dtype is None:
         dtype = dtype_from_filename(bin_file)
+    itemsize = np.dtype(dtype).itemsize
 
     if mode[0] == "r":
-        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
+        n_rows, n_cols, header_bytes = read_bin_header(bin_file, itemsize)
         if shape is None:
-            shape = (a[0], a[1])
+            final_shape = (n_rows, n_cols)
         else:
-            shape = tuple(
-                [
-                    aval if sval is None else sval
-                    for aval, sval in zip(a, shape)
-                ]
+            header_dims = (n_rows, n_cols)
+            final_shape = tuple(
+                aval if sval is None else sval
+                for aval, sval in zip(header_dims, shape)
             )
-
         return np.memmap(
-            bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape
+            bin_file,
+            mode=mode,
+            dtype=dtype,
+            offset=header_bytes,
+            shape=final_shape,
         )
     elif mode[0] == "w":
         if shape is None:
@@ -72,19 +101,29 @@ def memmap_bin_file(
         dirname = os.path.dirname(bin_file)
         if len(dirname) > 0:
             os.makedirs(dirname, exist_ok=True)
-        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
-        a[0] = shape[0]
-        a[1] = shape[1]
-        a.flush()
-        del a
-        fp = np.memmap(
-            bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape
+        with open(bin_file, "wb") as f:
+            header_bytes = write_bin_header(
+                f, shape[0], shape[1], force_uint64=force_uint64
+            )
+        return np.memmap(
+            bin_file,
+            mode="r+",
+            dtype=dtype,
+            offset=header_bytes,
+            shape=shape,
         )
-        return fp
 
 
-def write_bin(fname, data):
+def write_bin(fname, data, *, force_uint64=False):
+    """Write a 2-D numpy array to a cuvs-bench binary file.
+
+    The legacy 8-byte uint32 header is used by default; pass
+    ``force_uint64=True`` (or supply a shape with a dimension exceeding
+    ``UINT32_MAX``) to write the extended 16-byte uint64 header instead.
+    """
     print("writing", fname, data.shape, data.dtype, "...")
     with open(fname, "wb") as f:
-        np.asarray(data.shape, dtype=np.uint32).tofile(f)
+        write_bin_header(
+            f, data.shape[0], data.shape[1], force_uint64=force_uint64
+        )
         data.tofile(f)
diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py
index 5647ece771..19c14e8aa8 100644
--- a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py
+++ b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py
@@ -1,5 +1,5 @@
 #
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 
 
@@ -9,21 +9,29 @@
 
 import numpy as np
 
+from cuvs_bench._bin_format import read_bin_header, write_bin_header
+
 
 def read_fbin(fname):
-    shape = np.fromfile(fname, dtype=np.uint32, count=2)
-    if float(shape[0]) * shape[1] * 4 > 2_000_000_000:
-        data = np.memmap(fname, dtype=np.float32, offset=8, mode="r").reshape(
-            shape
-        )
+    itemsize = np.dtype(np.float32).itemsize
+    n_rows, n_cols, header_bytes = read_bin_header(fname, itemsize)
+    shape = (n_rows, n_cols)
+    if float(n_rows) * n_cols * itemsize > 2_000_000_000:
+        data = np.memmap(
+            fname, dtype=np.float32, offset=header_bytes, mode="r"
+        ).reshape(shape)
     else:
-        data = np.fromfile(fname, dtype=np.float32, offset=8).reshape(shape)
+        data = np.fromfile(
+            fname, dtype=np.float32, offset=header_bytes
+        ).reshape(shape)
     return data
 
 
-def write_bin(fname, data):
+def write_bin(fname, data, *, force_uint64=False):
     with open(fname, "wb") as f:
-        np.asarray(data.shape, dtype=np.uint32).tofile(f)
+        write_bin_header(
+            f, data.shape[0], data.shape[1], force_uint64=force_uint64
+        )
         data.tofile(f)
 
 
diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py
index af6a7aac31..34abacbf6d 100644
--- a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py
+++ b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py
@@ -1,5 +1,5 @@
 #
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 
 
@@ -8,15 +8,19 @@
 import h5py
 import numpy as np
 
+from cuvs_bench._bin_format import write_bin_header
+
 
 def normalize(x):
     norm = np.linalg.norm(x, axis=1)
     return (x.T / norm).T
 
 
-def write_bin(fname, data):
+def write_bin(fname, data, *, force_uint64=False):
     with open(fname, "wb") as f:
-        np.asarray(data.shape, dtype=np.uint32).tofile(f)
+        write_bin_header(
+            f, data.shape[0], data.shape[1], force_uint64=force_uint64
+        )
         data.tofile(f)
 
 
diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
index db1fd7b137..a009684723 100644
--- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py
+++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
@@ -6,10 +6,19 @@
 Unit tests for shared backend utilities and Dataset transparent loading.
 """
 
+import struct
+
 import numpy as np
 import pytest
 import yaml
 
+from cuvs_bench._bin_format import (
+    EXTENDED_HEADER_BYTES,
+    LEGACY_HEADER_BYTES,
+    UINT32_MAX,
+    read_bin_header,
+    write_bin_header,
+)
 from cuvs_bench.backends import Dataset
 from cuvs_bench.backends._utils import (
     compute_recall,
@@ -20,10 +29,12 @@
 from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader
 
 
-def _write_test_bin(path, data):
-    """Write a numpy array in big-ann-bench binary format."""
+def _write_test_bin(path, data, *, force_uint64=False):
+    """Write a numpy array in cuvs-bench binary format."""
     with open(path, "wb") as f:
-        np.asarray(data.shape, dtype=np.uint32).tofile(f)
+        write_bin_header(
+            f, data.shape[0], data.shape[1], force_uint64=force_uint64
+        )
         data.tofile(f)
 
 
@@ -170,7 +181,7 @@ def test_truncated_data(self, tmp_path):
             np.array([10, 4], dtype=np.uint32).tofile(f)
             np.random.rand(5, 4).astype(np.float32).tofile(f)
 
-        with pytest.raises(ValueError, match="File is truncated"):
+        with pytest.raises(ValueError, match="does not match either"):
             load_vectors(path)
 
     def test_file_not_found(self):
@@ -178,6 +189,167 @@ def test_file_not_found(self):
         with pytest.raises(FileNotFoundError):
             load_vectors("/nonexistent/path/vectors.fbin")
 
+    def test_load_uint64_header(self, tmp_path):
+        """``load_vectors`` reads files written with the extended uint64 header."""
+        data = np.random.rand(40, 16).astype(np.float32)
+        path = str(tmp_path / "test.fbin")
+        _write_test_bin(path, data, force_uint64=True)
+
+        # Sanity check: file really uses the extended layout.
+        assert (
+            tmp_path.joinpath("test.fbin").stat().st_size
+            == EXTENDED_HEADER_BYTES + data.nbytes
+        )
+        loaded = load_vectors(path)
+        np.testing.assert_array_equal(loaded, data)
+
+    def test_load_uint64_header_with_subset(self, tmp_path):
+        """``subset_size`` works regardless of which header layout was used."""
+        data = np.random.rand(50, 8).astype(np.float32)
+        path = str(tmp_path / "test.fbin")
+        _write_test_bin(path, data, force_uint64=True)
+
+        loaded = load_vectors(path, subset_size=12)
+        assert loaded.shape == (12, 8)
+        np.testing.assert_array_equal(loaded, data[:12])
+
+    @pytest.mark.parametrize(
+        "ext, dtype, force_uint64",
+        [
+            (".fbin", np.float32, False),
+            (".fbin", np.float32, True),
+            (".ibin", np.int32, False),
+            (".ibin", np.int32, True),
+            (".u8bin", np.uint8, False),
+            (".u8bin", np.uint8, True),
+            (".i8bin", np.int8, False),
+            (".i8bin", np.int8, True),
+        ],
+    )
+    def test_load_roundtrip_all_dtypes(
+        self, tmp_path, ext, dtype, force_uint64
+    ):
+        """Round-trip every supported dtype through both header layouts."""
+        if np.issubdtype(dtype, np.integer):
+            info = np.iinfo(dtype)
+            data = np.random.randint(
+                info.min, info.max, size=(25, 7), dtype=dtype
+            )
+        else:
+            data = np.random.rand(25, 7).astype(dtype)
+        path = str(tmp_path / f"test{ext}")
+        _write_test_bin(path, data, force_uint64=force_uint64)
+
+        loaded = load_vectors(path)
+        np.testing.assert_array_equal(loaded, data)
+
+
+class TestBinHeaderHelpers:
+    """Tests for ``cuvs_bench._bin_format.read_bin_header`` / ``write_bin_header``."""
+
+    def test_write_legacy_returns_8_bytes(self, tmp_path):
+        """Small shapes should write the 8-byte uint32 header by default."""
+        path = tmp_path / "h.bin"
+        with open(path, "wb") as f:
+            n = write_bin_header(f, 7, 3)
+        assert n == LEGACY_HEADER_BYTES
+        assert path.stat().st_size == LEGACY_HEADER_BYTES
+
+    def test_write_force_uint64_returns_16_bytes(self, tmp_path):
+        """``force_uint64=True`` should write the 16-byte uint64 header."""
+        path = tmp_path / "h.bin"
+        with open(path, "wb") as f:
+            n = write_bin_header(f, 7, 3, force_uint64=True)
+        assert n == EXTENDED_HEADER_BYTES
+        assert path.stat().st_size == EXTENDED_HEADER_BYTES
+
+    def test_write_auto_promotes_to_uint64_when_overflowing(self, tmp_path):
+        """Shapes that don't fit in uint32 should auto-promote to uint64."""
+        path = tmp_path / "h.bin"
+        with open(path, "wb") as f:
+            n = write_bin_header(f, UINT32_MAX + 1, 4)
+        assert n == EXTENDED_HEADER_BYTES
+
+    def test_write_negative_raises(self, tmp_path):
+        """Negative dimensions are rejected."""
+        path = tmp_path / "h.bin"
+        with open(path, "wb") as f:
+            with pytest.raises(ValueError, match="non-negative"):
+                write_bin_header(f, -1, 4)
+
+    def test_read_legacy_round_trip(self, tmp_path):
+        """Legacy round-trip: write 8-byte header, read it back."""
+        path = tmp_path / "x.fbin"
+        data = np.random.rand(11, 5).astype(np.float32)
+        with open(path, "wb") as f:
+            write_bin_header(f, data.shape[0], data.shape[1])
+            data.tofile(f)
+        n_rows, n_cols, hbytes = read_bin_header(str(path), itemsize=4)
+        assert (n_rows, n_cols, hbytes) == (11, 5, LEGACY_HEADER_BYTES)
+
+    def test_read_extended_round_trip(self, tmp_path):
+        """Extended round-trip: write 16-byte header, read it back."""
+        path = tmp_path / "x.fbin"
+        data = np.random.rand(11, 5).astype(np.float32)
+        with open(path, "wb") as f:
+            write_bin_header(
+                f, data.shape[0], data.shape[1], force_uint64=True
+            )
+            data.tofile(f)
+        n_rows, n_cols, hbytes = read_bin_header(str(path), itemsize=4)
+        assert (n_rows, n_cols, hbytes) == (11, 5, EXTENDED_HEADER_BYTES)
+
+    def test_read_synthesized_huge_extended_header(self, tmp_path):
+        """A hand-crafted extended-header file with >UINT32_MAX rows reads correctly.
+
+        We can't materialize the data section (>16 GB just for the dummy
+        bytes), so use ``n_cols=0``: the data section is then empty and a
+        header-only file already balances the size equation that
+        ``read_bin_header`` checks.
+        """
+        path = tmp_path / "huge.fbin"
+        n_rows = UINT32_MAX + 17
+        n_cols = 0
+        with open(path, "wb") as f:
+            write_bin_header(f, n_rows, n_cols)
+        assert path.stat().st_size == EXTENDED_HEADER_BYTES
+
+        got_rows, got_cols, hbytes = read_bin_header(str(path), itemsize=4)
+        assert got_rows == n_rows
+        assert got_cols == n_cols
+        assert hbytes == EXTENDED_HEADER_BYTES
+
+    def test_read_file_too_small_raises(self, tmp_path):
+        """A file shorter than the legacy header raises a clear error."""
+        path = tmp_path / "x.fbin"
+        path.write_bytes(b"\x00\x00\x00")
+        with pytest.raises(ValueError, match="File too small"):
+            read_bin_header(str(path), itemsize=4)
+
+    def test_read_size_mismatch_raises(self, tmp_path):
+        """Header values that don't balance the file size are rejected."""
+        path = tmp_path / "x.fbin"
+        with open(path, "wb") as f:
+            f.write(struct.pack("<II", 10, 4))
+            f.write(b"\x00" * (5 * 4 * 4))
+        with pytest.raises(ValueError, match="does not match either"):
+            read_bin_header(str(path), itemsize=4)
+
+    def test_read_dispatch_prefers_legacy(self, tmp_path):
+        """When the legacy interpretation balances, it wins.
+
+        Guards against accidentally treating a small legacy file as
+        extended (which would silently mis-interpret the first 16 bytes
+        as two uint64s).
+        """
+        path = tmp_path / "x.fbin"
+        data = np.arange(12, dtype=np.float32).reshape(3, 4)
+        with open(path, "wb") as f:
+            write_bin_header(f, 3, 4)
+            data.tofile(f)
+        _, _, hbytes = read_bin_header(str(path), itemsize=4)
+        assert hbytes == LEGACY_HEADER_BYTES
+
 
 class TestDatasetLazyLoading:
     """Tests for Dataset transparent vector loading."""

From 1103bcb8459685dc3184e3da0b812b1edb9ebe96 Mon Sep 17 00:00:00 2001
From: jinsolp <jinsolp@nvidia.com>
Date: Thu, 4 Jun 2026 18:02:56 +0000
Subject: [PATCH 2/6] Address CodeRabbit review: size_dtype API, input validation

---
 python/cuvs_bench/cuvs_bench/_bin_format.py   | 23 ++++++++---
 .../cuvs_bench/cuvs_bench/backends/_utils.py  |  4 +-
 .../cuvs_bench/generate_groundtruth/utils.py  | 31 +++++++-------
 .../cuvs_bench/get_dataset/fbin_to_f16bin.py  |  6 +--
 .../cuvs_bench/get_dataset/hdf5_to_fbin.py    |  6 +--
 .../cuvs_bench/cuvs_bench/tests/test_utils.py | 40 +++++++++----------
 6 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/python/cuvs_bench/cuvs_bench/_bin_format.py b/python/cuvs_bench/cuvs_bench/_bin_format.py
index 2611020fc5..8fb0d4d502 100644
--- a/python/cuvs_bench/cuvs_bench/_bin_format.py
+++ b/python/cuvs_bench/cuvs_bench/_bin_format.py
@@ -37,6 +37,8 @@
 import struct
 from typing import BinaryIO, Tuple
 
+import numpy as np
+
 UINT32_MAX = (1 << 32) - 1
 
 LEGACY_HEADER_BYTES = 8
@@ -75,6 +77,10 @@ def read_bin_header(path: str, itemsize: int) -> Tuple[int, int, int]:
     FileNotFoundError
         If ``path`` does not exist.
     """
+    if itemsize < 1:
+        raise ValueError(
+            f"itemsize must be a positive integer, got {itemsize!r}"
+        )
     file_size = os.path.getsize(path)
     with open(path, "rb") as f:
         head = f.read(EXTENDED_HEADER_BYTES)
@@ -110,13 +116,13 @@ def write_bin_header(
     n_rows: int,
     n_cols: int,
     *,
-    force_uint64: bool = False,
+    size_dtype=np.uint32,
 ) -> int:
     """Write the canonical cuvs-bench binary header at the current position.
 
     The legacy 8-byte uint32 layout is used whenever both ``n_rows`` and
     ``n_cols`` fit in a ``uint32``. The 16-byte uint64 layout is used
-    otherwise, or when explicitly requested via ``force_uint64=True``.
+    otherwise, or when explicitly requested via ``size_dtype=np.uint64``.
 
     Parameters
     ----------
@@ -124,9 +130,9 @@ def write_bin_header(
         Open binary file handle, positioned where the header should go.
     n_rows, n_cols : int
         Header values to write. Must be non-negative.
-    force_uint64 : bool
-        If ``True``, always write the 16-byte uint64 layout regardless of
-        whether the values fit in ``uint32``. Defaults to ``False``.
+    size_dtype : numpy dtype
+        ``np.uint32`` for the legacy 8-byte header (default), or
+        ``np.uint64`` to force the extended 16-byte header.
 
     Returns
     -------
@@ -137,7 +143,12 @@ def write_bin_header(
         raise ValueError(
             f"n_rows and n_cols must be non-negative, got ({n_rows}, {n_cols})"
         )
-    if force_uint64 or n_rows > UINT32_MAX or n_cols > UINT32_MAX:
+    use_uint64 = (
+        np.dtype(size_dtype) == np.uint64
+        or n_rows > UINT32_MAX
+        or n_cols > UINT32_MAX
+    )
+    if use_uint64:
         f.write(struct.pack("<QQ", int(n_rows), int(n_cols)))
         return EXTENDED_HEADER_BYTES
     f.write(struct.pack("<II", int(n_rows), int(n_cols)))
diff --git a/python/cuvs_bench/cuvs_bench/backends/_utils.py b/python/cuvs_bench/cuvs_bench/backends/_utils.py
index a85d57c94a..25dce65152 100644
--- a/python/cuvs_bench/cuvs_bench/backends/_utils.py
+++ b/python/cuvs_bench/cuvs_bench/backends/_utils.py
@@ -97,7 +97,9 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
     """
     dtype = dtype_from_filename(path)
     itemsize = np.dtype(dtype).itemsize
-    if subset_size is not None and subset_size < 1:
+    if subset_size is not None and (
+        isinstance(subset_size, float) or subset_size < 1
+    ):
         raise ValueError(
             f"subset_size must be a positive integer, got {subset_size}"
         )
diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
index 11f8ee1762..ad6f567705 100644
--- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
+++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
@@ -42,14 +42,14 @@ def suffix_from_dtype(dtype):
 
 
 def memmap_bin_file(
-    bin_file, dtype, shape=None, mode="r", *, force_uint64=False
+    bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32
 ):
     """Memory-map a cuvs-bench binary file.
 
     Supports both the legacy 8-byte ``[uint32 n_rows, uint32 n_cols]`` and
     the extended 16-byte ``[uint64 n_rows, uint64 n_cols]`` headers. In read
     mode the layout is auto-detected from the file size; in write mode the
-    legacy layout is used unless ``force_uint64=True`` or one of the shape
+    legacy layout is used unless ``size_dtype=np.uint64`` or one of the shape
     dimensions exceeds ``UINT32_MAX``.
 
     Parameters
@@ -66,9 +66,10 @@ def memmap_bin_file(
         required ``(n_rows, n_cols)`` of the file to create.
     mode : str
         Standard ``np.memmap`` mode string (``"r"``, ``"r+"``, ``"w+"``).
-    force_uint64 : bool
-        Write mode only: force the extended uint64 header even when the
-        shape would fit in uint32. Ignored in read mode (auto-detected).
+    size_dtype : numpy dtype
+        Write mode only: ``np.uint32`` for the legacy 8-byte header (default),
+        or ``np.uint64`` to force the extended 16-byte header. Ignored in read
+        mode (auto-detected).
     """
     if bin_file is None:
         return None
@@ -76,6 +77,11 @@ def memmap_bin_file(
         dtype = dtype_from_filename(bin_file)
     itemsize = np.dtype(dtype).itemsize
 
+    if shape is not None and len(shape) != 2:
+        raise ValueError(
+            f"shape must have exactly 2 dimensions (n_rows, n_cols), got {shape!r}"
+        )
+
     if mode[0] == "r":
         n_rows, n_cols, header_bytes = read_bin_header(bin_file, itemsize)
         if shape is None:
@@ -103,7 +109,7 @@ def memmap_bin_file(
             os.makedirs(dirname, exist_ok=True)
         with open(bin_file, "wb") as f:
             header_bytes = write_bin_header(
-                f, shape[0], shape[1], force_uint64=force_uint64
+                f, shape[0], shape[1], size_dtype=size_dtype
             )
         return np.memmap(
             bin_file,
@@ -114,16 +120,9 @@ def memmap_bin_file(
         )
 
 
-def write_bin(fname, data, *, force_uint64=False):
-    """Write a 2-D numpy array to a cuvs-bench binary file.
-
-    The legacy 8-byte uint32 header is used by default; pass
-    ``force_uint64=True`` (or supply a shape with a dimension exceeding
-    ``UINT32_MAX``) to write the extended 16-byte uint64 header instead.
-    """
+def write_bin(fname, data):
+    """Write a 2-D numpy array to a cuvs-bench binary file."""
     print("writing", fname, data.shape, data.dtype, "...")
     with open(fname, "wb") as f:
-        write_bin_header(
-            f, data.shape[0], data.shape[1], force_uint64=force_uint64
-        )
+        write_bin_header(f, data.shape[0], data.shape[1])
         data.tofile(f)
diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py
index 19c14e8aa8..0fba915cae 100644
--- a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py
+++ b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py
@@ -27,11 +27,9 @@ def read_fbin(fname):
     return data
 
 
-def write_bin(fname, data, *, force_uint64=False):
+def write_bin(fname, data):
     with open(fname, "wb") as f:
-        write_bin_header(
-            f, data.shape[0], data.shape[1], force_uint64=force_uint64
-        )
+        write_bin_header(f, data.shape[0], data.shape[1])
         data.tofile(f)
 
 
diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py
index 34abacbf6d..2998a5dcb2 100644
--- a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py
+++ b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py
@@ -16,11 +16,9 @@ def normalize(x):
     return (x.T / norm).T
 
 
-def write_bin(fname, data, *, force_uint64=False):
+def write_bin(fname, data):
     with open(fname, "wb") as f:
-        write_bin_header(
-            f, data.shape[0], data.shape[1], force_uint64=force_uint64
-        )
+        write_bin_header(f, data.shape[0], data.shape[1])
         data.tofile(f)
 
 
diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
index a009684723..d5a3482385 100644
--- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py
+++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
@@ -29,11 +29,11 @@
 from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader
 
 
-def _write_test_bin(path, data, *, force_uint64=False):
+def _write_test_bin(path, data, *, size_dtype=np.uint32):
     """Write a numpy array in cuvs-bench binary format."""
     with open(path, "wb") as f:
         write_bin_header(
-            f, data.shape[0], data.shape[1], force_uint64=force_uint64
+            f, data.shape[0], data.shape[1], size_dtype=size_dtype
         )
         data.tofile(f)
 
@@ -193,7 +193,7 @@ def test_load_uint64_header(self, tmp_path):
         """``load_vectors`` reads files written with the extended uint64 header."""
         data = np.random.rand(40, 16).astype(np.float32)
         path = str(tmp_path / "test.fbin")
-        _write_test_bin(path, data, force_uint64=True)
+        _write_test_bin(path, data, size_dtype=np.uint64)
 
         # Sanity check: file really uses the extended layout.
         assert (
@@ -207,28 +207,26 @@ def test_load_uint64_header_with_subset(self, tmp_path):
         """``subset_size`` works regardless of which header layout was used."""
         data = np.random.rand(50, 8).astype(np.float32)
         path = str(tmp_path / "test.fbin")
-        _write_test_bin(path, data, force_uint64=True)
+        _write_test_bin(path, data, size_dtype=np.uint64)
 
         loaded = load_vectors(path, subset_size=12)
         assert loaded.shape == (12, 8)
         np.testing.assert_array_equal(loaded, data[:12])
 
     @pytest.mark.parametrize(
-        "ext, dtype, force_uint64",
+        "ext, dtype, size_dtype",
         [
-            (".fbin", np.float32, False),
-            (".fbin", np.float32, True),
-            (".ibin", np.int32, False),
-            (".ibin", np.int32, True),
-            (".u8bin", np.uint8, False),
-            (".u8bin", np.uint8, True),
-            (".i8bin", np.int8, False),
-            (".i8bin", np.int8, True),
+            (".fbin", np.float32, np.uint32),
+            (".fbin", np.float32, np.uint64),
+            (".ibin", np.int32, np.uint32),
+            (".ibin", np.int32, np.uint64),
+            (".u8bin", np.uint8, np.uint32),
+            (".u8bin", np.uint8, np.uint64),
+            (".i8bin", np.int8, np.uint32),
+            (".i8bin", np.int8, np.uint64),
         ],
     )
-    def test_load_roundtrip_all_dtypes(
-        self, tmp_path, ext, dtype, force_uint64
-    ):
+    def test_load_roundtrip_all_dtypes(self, tmp_path, ext, dtype, size_dtype):
         """Round-trip every supported dtype through both header layouts."""
         if np.issubdtype(dtype, np.integer):
             info = np.iinfo(dtype)
@@ -238,7 +236,7 @@ def test_load_roundtrip_all_dtypes(
         else:
             data = np.random.rand(25, 7).astype(dtype)
         path = str(tmp_path / f"test{ext}")
-        _write_test_bin(path, data, force_uint64=force_uint64)
+        _write_test_bin(path, data, size_dtype=size_dtype)
 
         loaded = load_vectors(path)
         np.testing.assert_array_equal(loaded, data)
@@ -255,11 +253,11 @@ def test_write_legacy_returns_8_bytes(self, tmp_path):
         assert n == LEGACY_HEADER_BYTES
         assert path.stat().st_size == LEGACY_HEADER_BYTES
 
-    def test_write_force_uint64_returns_16_bytes(self, tmp_path):
-        """``force_uint64=True`` should write the 16-byte uint64 header."""
+    def test_write_size_dtype_uint64_returns_16_bytes(self, tmp_path):
+        """``size_dtype=np.uint64`` should write the 16-byte uint64 header."""
         path = tmp_path / "h.bin"
         with open(path, "wb") as f:
-            n = write_bin_header(f, 7, 3, force_uint64=True)
+            n = write_bin_header(f, 7, 3, size_dtype=np.uint64)
         assert n == EXTENDED_HEADER_BYTES
         assert path.stat().st_size == EXTENDED_HEADER_BYTES
 
@@ -293,7 +291,7 @@ def test_read_extended_round_trip(self, tmp_path):
         data = np.random.rand(11, 5).astype(np.float32)
         with open(path, "wb") as f:
             write_bin_header(
-                f, data.shape[0], data.shape[1], force_uint64=True
+                f, data.shape[0], data.shape[1], size_dtype=np.uint64
             )
             data.tofile(f)
         n_rows, n_cols, hbytes = read_bin_header(str(path), itemsize=4)

From 8240e5209699a09ac8df93d9c65e0f501cd213d4 Mon Sep 17 00:00:00 2001
From: jinsolp <jinsolp@nvidia.com>
Date: Thu, 4 Jun 2026 21:02:04 +0000
Subject: [PATCH 3/6] simplify comment

---
 python/cuvs_bench/cuvs_bench/_bin_format.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/python/cuvs_bench/cuvs_bench/_bin_format.py b/python/cuvs_bench/cuvs_bench/_bin_format.py
index 8fb0d4d502..e0c973524a 100644
--- a/python/cuvs_bench/cuvs_bench/_bin_format.py
+++ b/python/cuvs_bench/cuvs_bench/_bin_format.py
@@ -59,16 +59,13 @@ def read_bin_header(path: str, itemsize: int) -> Tuple[int, int, int]:
         Path to the binary file.
     itemsize : int
         Per-element size in bytes (e.g. ``4`` for ``float32``, ``1`` for
-        ``int8``). Used purely for the size-equation check; the file
-        contents are not inspected.
+        ``int8``) used for the size-equation check.
 
     Returns
     -------
     (n_rows, n_cols, header_bytes) : Tuple[int, int, int]
         Row count, column count, and the number of bytes the header
-        occupies on disk (``8`` for legacy, ``16`` for extended). Callers
-        seeking to the data start should use ``header_bytes`` rather than
-        a hardcoded offset.
+        occupies on disk (``8`` for legacy, ``16`` for extended).
 
     Raises
     ------

From 751d0f3af1f102cf47ab48c3a54c15d64a64ed69 Mon Sep 17 00:00:00 2001
From: jinsolp <jinsolp@nvidia.com>
Date: Fri, 5 Jun 2026 00:37:31 +0000
Subject: [PATCH 4/6] fix/add tests

---
 .../cuvs_bench/cuvs_bench/tests/test_utils.py | 92 +++++++++++++++++--
 1 file changed, 84 insertions(+), 8 deletions(-)

diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
index d5a3482385..8c6587c2f2 100644
--- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py
+++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
@@ -26,6 +26,7 @@
     expand_param_grid,
     load_vectors,
 )
+from cuvs_bench.generate_groundtruth.utils import memmap_bin_file
 from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader
 
 
@@ -218,6 +219,8 @@ def test_load_uint64_header_with_subset(self, tmp_path):
         [
             (".fbin", np.float32, np.uint32),
             (".fbin", np.float32, np.uint64),
+            (".f16bin", np.float16, np.uint32),
+            (".f16bin", np.float16, np.uint64),
             (".ibin", np.int32, np.uint32),
             (".ibin", np.int32, np.uint64),
             (".u8bin", np.uint8, np.uint32),
@@ -298,21 +301,24 @@ def test_read_extended_round_trip(self, tmp_path):
         assert (n_rows, n_cols, hbytes) == (11, 5, EXTENDED_HEADER_BYTES)
 
     def test_read_synthesized_huge_extended_header(self, tmp_path):
-        """A hand-crafted extended-header file with >UINT32_MAX rows reads correctly.
+        """Extended-header file with >UINT32_MAX rows and positive n_cols.
 
-        We can't materialize the data section (>16 GB just for the dummy
-        bytes), so write only the header and pad with the exact number of
-        zero bytes ``read_bin_header`` expects to balance the size equation
-        -- using ``n_cols=0`` so the data section is empty.
+        We can't materialize the full data section, so write the header and
+        truncate to the exact file size ``read_bin_header`` expects.
         """
         path = tmp_path / "huge.fbin"
         n_rows = UINT32_MAX + 17
-        n_cols = 0
+        n_cols = 4
+        itemsize = 4
+        expected_size = EXTENDED_HEADER_BYTES + n_rows * n_cols * itemsize
         with open(path, "wb") as f:
             write_bin_header(f, n_rows, n_cols)
-        assert path.stat().st_size == EXTENDED_HEADER_BYTES
+            f.truncate(expected_size)
 
-        got_rows, got_cols, hbytes = read_bin_header(str(path), itemsize=4)
+        assert path.stat().st_size == expected_size
+        got_rows, got_cols, hbytes = read_bin_header(
+            str(path), itemsize=itemsize
+        )
         assert got_rows == n_rows
         assert got_cols == n_cols
         assert hbytes == EXTENDED_HEADER_BYTES
@@ -349,6 +355,76 @@ def test_read_dispatch_prefers_legacy(self, tmp_path):
         assert hbytes == LEGACY_HEADER_BYTES
 
 
+class TestMemmapBinFile:
+    """Tests for ``generate_groundtruth.utils.memmap_bin_file``."""
+
+    def test_read_legacy_header(self, tmp_path):
+        """Read mode auto-detects the legacy 8-byte header offset."""
+        data = np.random.rand(30, 8).astype(np.float32)
+        path = str(tmp_path / "test.fbin")
+        _write_test_bin(path, data)
+
+        mm = memmap_bin_file(path, np.float32, mode="r")
+        assert mm.shape == (30, 8)
+        np.testing.assert_array_equal(mm[:], data)
+
+    def test_read_extended_header(self, tmp_path):
+        """Read mode auto-detects the extended 16-byte header offset."""
+        data = np.random.rand(30, 8).astype(np.float32)
+        path = str(tmp_path / "test.fbin")
+        _write_test_bin(path, data, size_dtype=np.uint64)
+
+        mm = memmap_bin_file(path, np.float32, mode="r")
+        assert mm.shape == (30, 8)
+        np.testing.assert_array_equal(mm[:], data)
+
+    def test_read_partial_shape_override(self, tmp_path):
+        """Read mode fills ``None`` shape entries from the header."""
+        data = np.random.rand(50, 8).astype(np.float32)
+        path = str(tmp_path / "test.fbin")
+        _write_test_bin(path, data)
+
+        mm = memmap_bin_file(path, np.float32, shape=(10, None), mode="r")
+        assert mm.shape == (10, 8)
+        np.testing.assert_array_equal(mm[:], data[:10])
+
+    def test_write_read_roundtrip_legacy(self, tmp_path):
+        """Write mode with uint32 header, then read back via memmap."""
+        path = str(tmp_path / "test.fbin")
+        shape = (20, 8)
+        data = np.random.rand(*shape).astype(np.float32)
+
+        mm = memmap_bin_file(path, np.float32, shape=shape, mode="w+")
+        mm[:] = data
+        mm.flush()
+        del mm
+
+        loaded = memmap_bin_file(path, np.float32, mode="r")
+        assert loaded.shape == shape
+        np.testing.assert_array_equal(loaded[:], data)
+
+    def test_write_read_roundtrip_extended(self, tmp_path):
+        """Write mode with uint64 header, then read back via memmap."""
+        path = str(tmp_path / "test.fbin")
+        shape = (20, 8)
+        data = np.random.rand(*shape).astype(np.float32)
+
+        mm = memmap_bin_file(
+            path, np.float32, shape=shape, mode="w+", size_dtype=np.uint64
+        )
+        mm[:] = data
+        mm.flush()
+        del mm
+
+        loaded = memmap_bin_file(path, np.float32, mode="r")
+        assert loaded.shape == shape
+        np.testing.assert_array_equal(loaded[:], data)
+        assert (
+            tmp_path.joinpath("test.fbin").stat().st_size
+            == EXTENDED_HEADER_BYTES + data.nbytes
+        )
+
+
 class TestDatasetLazyLoading:
     """Tests for Dataset transparent vector loading."""
 

From 9a5b9e0e8ded16557e0eeb93d58652e3046b4691 Mon Sep 17 00:00:00 2001
From: jinsolp <jinsolp@nvidia.com>
Date: Fri, 5 Jun 2026 01:25:01 +0000
Subject: [PATCH 5/6] gt uint64 support

---
 python/cuvs_bench/cuvs_bench/_bin_format.py   |  2 +-
 .../cuvs_bench/cuvs_bench/backends/_utils.py  |  4 +-
 .../generate_groundtruth/__main__.py          | 14 ++++--
 .../cuvs_bench/generate_groundtruth/utils.py  | 18 ++++++++
 .../cuvs_bench/cuvs_bench/tests/test_utils.py | 46 ++++++++++++++++++-
 5 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/python/cuvs_bench/cuvs_bench/_bin_format.py b/python/cuvs_bench/cuvs_bench/_bin_format.py
index e0c973524a..f92422474c 100644
--- a/python/cuvs_bench/cuvs_bench/_bin_format.py
+++ b/python/cuvs_bench/cuvs_bench/_bin_format.py
@@ -12,7 +12,7 @@
 
 - **Legacy**:  ``[uint32 n_rows, uint32 n_cols, data ...]``  (8-byte header).
   This is what every existing ``.fbin`` / ``.ibin`` / ``.u8bin`` / ``.i8bin``
-  / ``.f16bin`` / ``.hbin`` file on disk uses today.
+  / ``.f16bin`` / ``.hbin`` / ``.u64bin`` file on disk uses today.
 
 - **Extended**: ``[uint64 n_rows, uint64 n_cols, data ...]``  (16-byte header).
   For datasets whose ``n_rows`` or ``n_cols`` exceeds ``UINT32_MAX`` (~4.29B).
diff --git a/python/cuvs_bench/cuvs_bench/backends/_utils.py b/python/cuvs_bench/cuvs_bench/backends/_utils.py
index 25dce65152..250e2d2f50 100644
--- a/python/cuvs_bench/cuvs_bench/backends/_utils.py
+++ b/python/cuvs_bench/cuvs_bench/backends/_utils.py
@@ -55,6 +55,8 @@ def dtype_from_filename(filename):
         return np.float16
     elif ext == ".ibin":
         return np.int32
+    elif ext == ".u64bin":
+        return np.uint64
     elif ext == ".u8bin":
         return np.ubyte
     elif ext == ".i8bin":
@@ -78,7 +80,7 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
     path : str
         Path to the binary file. The dtype is inferred from the extension:
         ``.fbin`` (float32), ``.f16bin`` (float16), ``.u8bin`` (uint8),
-        ``.i8bin`` (int8), ``.ibin`` (int32).
+        ``.i8bin`` (int8), ``.ibin`` (int32), ``.u64bin`` (uint64).
     subset_size : Optional[int]
         If provided, only the first ``subset_size`` rows are loaded.
 
diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
index 43c03f4322..9baa9e2a37 100644
--- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
+++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
@@ -9,7 +9,13 @@
 import sys
 import warnings
 
-from .utils import memmap_bin_file, suffix_from_dtype, write_bin
+from .utils import (
+    groundtruth_neighbors_filename,
+    memmap_bin_file,
+    neighbor_index_dtype,
+    suffix_from_dtype,
+    write_bin,
+)
 
 
 def import_with_fallback(primary_lib, secondary_lib=None, alias=None):
@@ -358,9 +364,11 @@ def main():
     print("Calculating true nearest neighbors")
     distances, indices = calc_truth(dataset, queries, args.k, args.metric)
 
+    n_base = dataset.shape[0]
+    neighbor_dtype = neighbor_index_dtype(n_base)
     write_bin(
-        os.path.join(args.output, "groundtruth.neighbors.ibin"),
-        indices.astype(xp.uint32),
+        os.path.join(args.output, groundtruth_neighbors_filename(n_base)),
+        xp.asarray(indices, dtype=neighbor_dtype),
     )
     write_bin(
         os.path.join(args.output, "groundtruth.distances.fbin"),
diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
index ad6f567705..0159211520 100644
--- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
+++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
@@ -18,6 +18,8 @@ def dtype_from_filename(filename):
         return np.float16
     elif ext == ".ibin":
         return np.int32
+    elif ext == ".u64bin":
+        return np.uint64
     elif ext == ".u8bin":
         return np.ubyte
     elif ext == ".i8bin":
@@ -33,6 +35,8 @@ def suffix_from_dtype(dtype):
         return ".hbin"
     elif dtype == np.int32:
         return ".ibin"
+    elif dtype == np.uint64:
+        return ".u64bin"
     elif dtype == np.ubyte:
         return ".u8bin"
     elif dtype == np.byte:
@@ -41,6 +45,20 @@ def suffix_from_dtype(dtype):
         raise RuntimeError("Not supported dtype extension" + dtype)
 
 
+def neighbor_index_dtype(n_base: int) -> np.dtype:
+    """Return the dtype used to store neighbor row IDs for a base set size."""
+    if n_base > np.iinfo(np.int32).max:
+        return np.uint64
+    return np.int32
+
+
+def groundtruth_neighbors_filename(n_base: int) -> str:
+    """Return the ground-truth neighbors filename for a base set size."""
+    if n_base > np.iinfo(np.int32).max:
+        return "groundtruth.neighbors.u64bin"
+    return "groundtruth.neighbors.ibin"
+
+
 def memmap_bin_file(
     bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32
 ):
diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
index 8c6587c2f2..6fffe59e5a 100644
--- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py
+++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
@@ -26,7 +26,11 @@
     expand_param_grid,
     load_vectors,
 )
-from cuvs_bench.generate_groundtruth.utils import memmap_bin_file
+from cuvs_bench.generate_groundtruth.utils import (
+    groundtruth_neighbors_filename,
+    memmap_bin_file,
+    neighbor_index_dtype,
+)
 from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader
 
 
@@ -54,6 +58,10 @@ def test_ibin(self):
         """Test .ibin maps to int32."""
         assert dtype_from_filename("groundtruth.ibin") == np.int32
 
+    def test_u64bin(self):
+        """Test .u64bin maps to uint64."""
+        assert dtype_from_filename("groundtruth.neighbors.u64bin") == np.uint64
+
     def test_u8bin(self):
         """Test .u8bin maps to uint8."""
         assert dtype_from_filename("vectors.u8bin") == np.ubyte
@@ -223,6 +231,8 @@ def test_load_uint64_header_with_subset(self, tmp_path):
             (".f16bin", np.float16, np.uint64),
             (".ibin", np.int32, np.uint32),
             (".ibin", np.int32, np.uint64),
+            (".u64bin", np.uint64, np.uint32),
+            (".u64bin", np.uint64, np.uint64),
             (".u8bin", np.uint8, np.uint32),
             (".u8bin", np.uint8, np.uint64),
             (".i8bin", np.int8, np.uint32),
@@ -236,6 +246,8 @@ def test_load_roundtrip_all_dtypes(self, tmp_path, ext, dtype, size_dtype):
             data = np.random.randint(
                 info.min, info.max, size=(25, 7), dtype=dtype
             )
+            if dtype == np.uint64:
+                data[0, 0] = np.iinfo(np.int32).max + 42
         else:
             data = np.random.rand(25, 7).astype(dtype)
         path = str(tmp_path / f"test{ext}")
@@ -245,6 +257,38 @@ def test_load_roundtrip_all_dtypes(self, tmp_path, ext, dtype, size_dtype):
         np.testing.assert_array_equal(loaded, data)
 
 
+class TestGroundtruthNeighborFormat:
+    """Tests for large-base ground-truth neighbor index format selection."""
+
+    def test_neighbor_index_dtype_small_base(self):
+        assert neighbor_index_dtype(1_000_000) == np.int32
+
+    def test_neighbor_index_dtype_large_base(self):
+        assert neighbor_index_dtype(np.iinfo(np.int32).max + 1) == np.uint64
+
+    def test_groundtruth_neighbors_filename_small_base(self):
+        assert (
+            groundtruth_neighbors_filename(1_000_000)
+            == "groundtruth.neighbors.ibin"
+        )
+
+    def test_groundtruth_neighbors_filename_large_base(self):
+        assert (
+            groundtruth_neighbors_filename(np.iinfo(np.int32).max + 1)
+            == "groundtruth.neighbors.u64bin"
+        )
+
+    def test_load_u64bin_preserves_large_indices(self, tmp_path):
+        """uint64 GT files preserve neighbor IDs above INT32_MAX."""
+        large_id = np.iinfo(np.int32).max + 12345
+        indices = np.array([[large_id, 0, 1]], dtype=np.uint64)
+        path = str(tmp_path / "gt.u64bin")
+        _write_test_bin(path, indices)
+
+        loaded = load_vectors(path)
+        np.testing.assert_array_equal(loaded, indices)
+
+
 class TestBinHeaderHelpers:
     """Tests for ``cuvs_bench._bin_format.read_bin_header`` / ``write_bin_header``."""
 

From 44856bcc8b8acdf0808c097a17fb8a8260b6fcd1 Mon Sep 17 00:00:00 2001
From: jinsolp <jinsolp@nvidia.com>
Date: Fri, 5 Jun 2026 01:58:01 +0000
Subject: [PATCH 6/6] shift neighbor index support large dtype

---
 .../generate_groundtruth/__main__.py          | 11 ++--
 .../cuvs_bench/generate_groundtruth/utils.py  | 25 +++++++++
 .../cuvs_bench/cuvs_bench/tests/test_utils.py | 56 +++++++++++++++++++
 3 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
index 9baa9e2a37..4315c8e3ac 100644
--- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
+++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
@@ -12,9 +12,10 @@
 from .utils import (
     groundtruth_neighbors_filename,
     memmap_bin_file,
-    neighbor_index_dtype,
+    offset_neighbor_indices,
     suffix_from_dtype,
     write_bin,
+    write_groundtruth_neighbors,
 )
 
 
@@ -199,7 +200,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
             D, Ind = cpu_search(X, queries, k, metric=metric)
 
         D, Ind = xp.asarray(D), xp.asarray(Ind)
-        Ind += i  # shift neighbor index by offset i
+        Ind = offset_neighbor_indices(Ind, i, n_samples)
 
         if distances is None:
             distances = D
@@ -365,10 +366,10 @@ def main():
     distances, indices = calc_truth(dataset, queries, args.k, args.metric)
 
     n_base = dataset.shape[0]
-    neighbor_dtype = neighbor_index_dtype(n_base)
-    write_bin(
+    write_groundtruth_neighbors(
         os.path.join(args.output, groundtruth_neighbors_filename(n_base)),
-        xp.asarray(indices, dtype=neighbor_dtype),
+        indices,
+        n_base,
     )
     write_bin(
         os.path.join(args.output, "groundtruth.distances.fbin"),
diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
index 0159211520..d3ee4b3479 100644
--- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
+++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
@@ -52,6 +52,18 @@ def neighbor_index_dtype(n_base: int) -> np.dtype:
     return np.int32
 
 
+def neighbor_index_accumulator_dtype(n_base: int) -> np.dtype:
+    """Pick the in-memory dtype for neighbor IDs while computing ground truth.
+
+    Bases with more than ``INT32_MAX`` rows get ``int64`` so that adding a
+    batch offset cannot overflow; smaller bases stay ``int32``. Conversion to
+    the on-disk dtype (:func:`neighbor_index_dtype`) happens at write time.
+    """
+    int32_max = np.iinfo(np.int32).max
+    needs_wide_ids = n_base > int32_max
+    return np.int64 if needs_wide_ids else np.int32
+
+
 def groundtruth_neighbors_filename(n_base: int) -> str:
     """Return the ground-truth neighbors filename for a base set size."""
     if n_base > np.iinfo(np.int32).max:
@@ -59,6 +71,19 @@ def groundtruth_neighbors_filename(n_base: int) -> str:
     return "groundtruth.neighbors.ibin"
 
 
+def offset_neighbor_indices(indices, batch_offset: int, n_base: int):
+    """Widen neighbor IDs for ``n_base``, then add the batch offset."""
+    widened = indices.astype(neighbor_index_accumulator_dtype(n_base))
+    return widened + batch_offset
+
+
+def write_groundtruth_neighbors(path, indices, n_base: int):
+    """Write a ground-truth neighbor matrix using the correct on-disk dtype."""
+    # Device arrays (e.g. CuPy) reject implicit np.asarray(); copy to host.
+    host = indices.get() if hasattr(indices, "get") else indices
+    write_bin(path, np.asarray(host, dtype=neighbor_index_dtype(n_base)))
+
+
 def memmap_bin_file(
     bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32
 ):
diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
index 6fffe59e5a..01dd803a07 100644
--- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py
+++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
@@ -29,7 +29,10 @@
 from cuvs_bench.generate_groundtruth.utils import (
     groundtruth_neighbors_filename,
     memmap_bin_file,
+    neighbor_index_accumulator_dtype,
     neighbor_index_dtype,
+    offset_neighbor_indices,
+    write_groundtruth_neighbors,
 )
 from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader
 
@@ -266,6 +269,12 @@ def test_neighbor_index_dtype_small_base(self):
     def test_neighbor_index_dtype_large_base(self):
         assert neighbor_index_dtype(np.iinfo(np.int32).max + 1) == np.uint64
 
+    def test_neighbor_index_accumulator_dtype_large_base(self):
+        assert (
+            neighbor_index_accumulator_dtype(np.iinfo(np.int32).max + 1)
+            == np.int64
+        )
+
     def test_groundtruth_neighbors_filename_small_base(self):
         assert (
             groundtruth_neighbors_filename(1_000_000)
@@ -288,6 +297,46 @@ def test_load_u64bin_preserves_large_indices(self, tmp_path):
         loaded = load_vectors(path)
         np.testing.assert_array_equal(loaded, indices)
 
+    def test_offset_neighbor_indices_small_base(self):
+        local = np.array([[0, 1, 2]], dtype=np.uint32)
+        offset = offset_neighbor_indices(local, 1000, 1_000_000)
+        assert offset.dtype == np.int32
+        np.testing.assert_array_equal(offset, [[1000, 1001, 1002]])
+
+    def test_offset_neighbor_indices_large_batch_offset(self):
+        """Search-local IDs must not wrap when batch offset exceeds INT32_MAX."""
+        batch_offset = np.iinfo(np.int32).max + 1
+        n_base = batch_offset + 10
+        local = np.array([[0, 1, 2]], dtype=np.int64)
+        offset = offset_neighbor_indices(local, batch_offset, n_base)
+        assert offset.dtype == np.int64
+        np.testing.assert_array_equal(
+            offset,
+            [[batch_offset, batch_offset + 1, batch_offset + 2]],
+        )
+
+    def test_write_groundtruth_neighbors_round_trip(self, tmp_path):
+        """GT write/load preserves neighbor IDs above INT32_MAX."""
+        n_base = np.iinfo(np.int32).max + 1
+        large_id = n_base + 999
+        indices = np.array([[large_id, large_id - 1, 0]], dtype=np.int64)
+        path = str(tmp_path / groundtruth_neighbors_filename(n_base))
+        write_groundtruth_neighbors(path, indices, n_base)
+
+        loaded = load_vectors(path)
+        assert loaded.dtype == np.uint64
+        np.testing.assert_array_equal(loaded, indices.astype(np.uint64))
+
+    def test_dataset_lazy_load_u64bin_groundtruth(self, tmp_path):
+        """Dataset loads .u64bin ground truth with IDs above INT32_MAX."""
+        large_id = np.iinfo(np.int32).max + 12345
+        gt = np.array([[large_id, 0, 1]], dtype=np.uint64)
+        path = str(tmp_path / "groundtruth.neighbors.u64bin")
+        _write_test_bin(path, gt)
+
+        dataset = Dataset(name="test", groundtruth_neighbors_file=path)
+        np.testing.assert_array_equal(dataset.groundtruth_neighbors, gt)
+
 
 class TestBinHeaderHelpers:
     """Tests for ``cuvs_bench._bin_format.read_bin_header`` / ``write_bin_header``."""
@@ -720,3 +769,10 @@ def test_multiple_queries(self):
         groundtruth = np.array([[0, 1, 2], [3, 4, 5]])
         recall = compute_recall(neighbors, groundtruth, k=3)
         assert abs(recall - 5.0 / 6.0) < 1e-9
+
+    def test_large_uint64_neighbor_ids(self):
+        """Recall works when GT neighbor IDs exceed INT32_MAX."""
+        large_id = np.iinfo(np.int32).max + 999
+        neighbors = np.array([[large_id, 0, 1]], dtype=np.int64)
+        groundtruth = np.array([[large_id, 0, 1]], dtype=np.uint64)
+        assert compute_recall(neighbors, groundtruth, k=3) == 1.0