diff --git a/python/cuvs_bench/cuvs_bench/_bin_format.py b/python/cuvs_bench/cuvs_bench/_bin_format.py
new file mode 100644
index 0000000000..f92422474c
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/_bin_format.py
@@ -0,0 +1,152 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+"""
+On-disk header helpers for the cuvs-bench binary file format.
+
+cuvs-bench inherits the big-ann-benchmarks binary layout: a small header
+listing ``n_rows`` and ``n_cols`` followed by a dense ``n_rows * n_cols``
+array of the dtype implied by the file extension. Two layouts are supported:
+
+- **Legacy**:  ``[uint32 n_rows, uint32 n_cols, data ...]``  (8-byte header).
+  This is what every existing ``.fbin`` / ``.ibin`` / ``.u8bin`` / ``.i8bin``
+  / ``.f16bin`` / ``.hbin`` / ``.u64bin`` file on disk uses today.
+
+- **Extended**: ``[uint64 n_rows, uint64 n_cols, data ...]``  (16-byte header).
+  For datasets whose ``n_rows`` or ``n_cols`` exceeds ``UINT32_MAX`` (~4.29B).
+
+Detection is **size-based**: a well-formed cuvs-bench binary is exactly
+``header_bytes + n_rows * n_cols * itemsize`` bytes long.
+:func:`read_bin_header` reads up to the first 16 bytes of the file and:
+
+1. Tries the legacy layout (first 8 bytes as two ``uint32``s, 8-byte
+   header). The layout is accepted if ``8 + n_rows * n_cols * itemsize``
+   matches the on-disk file size.
+2. Otherwise tries the extended layout (first 16 bytes as two
+   ``uint64``s, 16-byte header). Accepted if
+   ``16 + n_rows * n_cols * itemsize`` matches the file size instead.
+3. If neither layout matches, raises ``ValueError`` -- the file is
+   truncated, padded, or has a mismatched dtype extension.
+"""
+
+from __future__ import annotations
+
+import os
+import struct
+from typing import BinaryIO, Tuple
+
+import numpy as np
+
+UINT32_MAX = (1 << 32) - 1  # largest value a uint32 header field can hold
+
+LEGACY_HEADER_BYTES = 8  # two uint32 fields
+EXTENDED_HEADER_BYTES = 16  # two uint64 fields
+
+
+def read_bin_header(path: str, itemsize: int) -> Tuple[int, int, int]:
+    """Read and classify the header of a cuvs-bench binary file.
+
+    The layout is inferred from the file size: the legacy interpretation
+    (two ``uint32``, 8-byte header) is tried first, then the extended one
+    (two ``uint64``, 16-byte header); each is accepted only when
+    ``file_size == header_bytes + n_rows * n_cols * itemsize``.
+
+    Parameters
+    ----------
+    path : str
+        Path to the binary file.
+    itemsize : int
+        Per-element size in bytes (e.g. ``4`` for ``float32``, ``1`` for
+        ``int8``) used for the size-equation check.
+
+    Returns
+    -------
+    (n_rows, n_cols, header_bytes) : Tuple[int, int, int]
+        Row count, column count, and the number of bytes the header
+        occupies on disk (``8`` for legacy, ``16`` for extended).
+
+    Raises
+    ------
+    ValueError
+        If ``itemsize`` is less than 1, the file is shorter than the legacy
+        header, or neither interpretation matches the file size.
+    FileNotFoundError
+        If ``path`` does not exist.
+    """
+    if itemsize < 1:
+        raise ValueError(
+            f"itemsize must be a positive integer, got {itemsize!r}"
+        )
+    with open(path, "rb") as f:
+        # Take the size from the descriptor being read so that the size and
+        # the header bytes always describe the same file.
+        file_size = os.fstat(f.fileno()).st_size
+        head = f.read(EXTENDED_HEADER_BYTES)
+
+    if len(head) < LEGACY_HEADER_BYTES:
+        raise ValueError(
+            f"File too small to contain a valid header (expected at least "
+            f"{LEGACY_HEADER_BYTES} bytes, got {len(head)}): {path}"
+        )
+
+    n_rows, n_cols = struct.unpack("<II", head[:LEGACY_HEADER_BYTES])
+    if file_size == LEGACY_HEADER_BYTES + n_rows * n_cols * itemsize:
+        return n_rows, n_cols, LEGACY_HEADER_BYTES
+
+    if len(head) == EXTENDED_HEADER_BYTES:
+        n_rows, n_cols = struct.unpack("<QQ", head)
+        if file_size == EXTENDED_HEADER_BYTES + n_rows * n_cols * itemsize:
+            return n_rows, n_cols, EXTENDED_HEADER_BYTES
+
+    raise ValueError(
+        f"File size {file_size:,} bytes does not match either the legacy "
+        f"(8-byte uint32) or extended (16-byte uint64) header layout for "
+        f"itemsize={itemsize}: {path}. The file may be truncated, padded, "
+        f"or have a mismatched dtype extension."
+    )
+
+
+def write_bin_header(
+    f: BinaryIO,
+    n_rows: int,
+    n_cols: int,
+    *,
+    size_dtype=np.uint32,
+) -> int:
+    """Emit the cuvs-bench ``(n_rows, n_cols)`` header at ``f``'s position.
+
+    Writes two little-endian ``uint64`` values when ``size_dtype`` is
+    ``np.uint64`` or a dimension exceeds ``UINT32_MAX``, else two ``uint32``.
+
+    Parameters
+    ----------
+    f : BinaryIO
+        Open binary file handle, positioned where the header should go.
+    n_rows, n_cols : int
+        Header values to write. Must be non-negative.
+    size_dtype : numpy dtype
+        ``np.uint64`` forces the 16-byte header; default is ``np.uint32``.
+
+    Returns
+    -------
+    int
+        Number of bytes written (``8`` for legacy, ``16`` for extended).
+
+    Raises
+    ------
+    ValueError
+        If ``n_rows`` or ``n_cols`` is negative.
+    """
+    if any(dim < 0 for dim in (n_rows, n_cols)):
+        raise ValueError(
+            f"n_rows and n_cols must be non-negative, got ({n_rows}, {n_cols})"
+        )
+    forced_extended = np.dtype(size_dtype) == np.uint64
+    if forced_extended or max(n_rows, n_cols) > UINT32_MAX:
+        fmt, written = "<QQ", EXTENDED_HEADER_BYTES
+    else:
+        fmt, written = "<II", LEGACY_HEADER_BYTES
+    f.write(struct.pack(fmt, int(n_rows), int(n_cols)))
+    return written
diff --git a/python/cuvs_bench/cuvs_bench/backends/_utils.py b/python/cuvs_bench/cuvs_bench/backends/_utils.py
index b931f3128b..250e2d2f50 100644
--- a/python/cuvs_bench/cuvs_bench/backends/_utils.py
+++ b/python/cuvs_bench/cuvs_bench/backends/_utils.py
@@ -27,6 +27,8 @@
 
 import numpy as np
 
+from cuvs_bench._bin_format import read_bin_header
+
 
 def dtype_from_filename(filename):
     """Map file extension to numpy dtype.
@@ -53,6 +55,8 @@ def dtype_from_filename(filename):
         return np.float16
     elif ext == ".ibin":
         return np.int32
+    elif ext == ".u64bin":
+        return np.uint64
     elif ext == ".u8bin":
         return np.ubyte
     elif ext == ".i8bin":
@@ -65,17 +69,18 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
     """
     Read a binary vector file into a numpy array.
 
-    Supports the standard big-ann-bench binary format used by cuvs-bench
-    datasets: a 4-byte uint32 ``n_rows``, a 4-byte uint32 ``n_cols``,
-    followed by ``n_rows * n_cols`` elements of the dtype inferred from
-    the file extension via ``dtype_from_filename``.
+    Supports the cuvs-bench binary format with either the legacy 8-byte
+    ``[uint32 n_rows, uint32 n_cols]`` header or the extended 16-byte
+    ``[uint64 n_rows, uint64 n_cols]`` header used for datasets with more
+    than ``UINT32_MAX`` rows or columns. The layout is auto-detected from
+    the file size by :func:`cuvs_bench._bin_format.read_bin_header`.
 
     Parameters
     ----------
     path : str
         Path to the binary file. The dtype is inferred from the extension:
         ``.fbin`` (float32), ``.f16bin`` (float16), ``.u8bin`` (uint8),
-        ``.i8bin`` (int8), ``.ibin`` (int32).
+        ``.i8bin`` (int8), ``.ibin`` (int32), ``.u64bin`` (uint64).
     subset_size : Optional[int]
         If provided, only the first ``subset_size`` rows are loaded.
 
@@ -93,27 +98,24 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
         or the file is truncated.
     """
     dtype = dtype_from_filename(path)
-    if subset_size is not None and subset_size < 1:
+    itemsize = np.dtype(dtype).itemsize
+    if subset_size is not None and (
+        isinstance(subset_size, float) or subset_size < 1
+    ):
         raise ValueError(
             f"subset_size must be a positive integer, got {subset_size}"
         )
+    n_rows, n_cols, header_bytes = read_bin_header(path, itemsize)
+    if subset_size is not None:
+        n_rows = min(n_rows, subset_size)
     with open(path, "rb") as f:
-        header = f.read(8)
-        if len(header) < 8:
-            raise ValueError(
-                f"File too small to contain a valid header (expected 8 bytes, "
-                f"got {len(header)}): {path}"
-            )
-        n_rows = int(np.frombuffer(header[:4], dtype=np.uint32)[0])
-        n_cols = int(np.frombuffer(header[4:], dtype=np.uint32)[0])
-        if subset_size is not None:
-            n_rows = min(n_rows, subset_size)
-        expected_bytes = n_rows * n_cols * np.dtype(dtype).itemsize
+        f.seek(header_bytes)
+        expected_bytes = n_rows * n_cols * itemsize
         raw = f.read(expected_bytes)
         if len(raw) < expected_bytes:
             raise ValueError(
                 f"File is truncated: expected {expected_bytes} bytes of data "
-                f"({n_rows} rows x {n_cols} cols x {np.dtype(dtype).itemsize} bytes), "
+                f"({n_rows} rows x {n_cols} cols x {itemsize} bytes), "
                 f"got {len(raw)}: {path}"
             )
         data = np.frombuffer(raw, dtype=dtype)
diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
index 43c03f4322..4315c8e3ac 100644
--- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
+++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
@@ -9,7 +9,14 @@
 import sys
 import warnings
 
-from .utils import memmap_bin_file, suffix_from_dtype, write_bin
+from .utils import (
+    groundtruth_neighbors_filename,
+    memmap_bin_file,
+    offset_neighbor_indices,
+    suffix_from_dtype,
+    write_bin,
+    write_groundtruth_neighbors,
+)
 
 
 def import_with_fallback(primary_lib, secondary_lib=None, alias=None):
@@ -193,7 +200,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
             D, Ind = cpu_search(X, queries, k, metric=metric)
 
         D, Ind = xp.asarray(D), xp.asarray(Ind)
-        Ind += i  # shift neighbor index by offset i
+        Ind = offset_neighbor_indices(Ind, i, n_samples)
 
         if distances is None:
             distances = D
@@ -358,9 +365,11 @@ def main():
     print("Calculating true nearest neighbors")
     distances, indices = calc_truth(dataset, queries, args.k, args.metric)
 
-    write_bin(
-        os.path.join(args.output, "groundtruth.neighbors.ibin"),
-        indices.astype(xp.uint32),
+    n_base = dataset.shape[0]
+    write_groundtruth_neighbors(
+        os.path.join(args.output, groundtruth_neighbors_filename(n_base)),
+        indices,
+        n_base,
     )
     write_bin(
         os.path.join(args.output, "groundtruth.distances.fbin"),
diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
index 72fb5b4a07..d3ee4b3479 100644
--- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
+++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
@@ -1,5 +1,5 @@
 #
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 #
 
@@ -7,6 +7,8 @@
 
 import numpy as np
 
+from cuvs_bench._bin_format import read_bin_header, write_bin_header
+
 
 def dtype_from_filename(filename):
     ext = os.path.splitext(filename)[1]
@@ -16,6 +18,8 @@ def dtype_from_filename(filename):
         return np.float16
     elif ext == ".ibin":
         return np.int32
+    elif ext == ".u64bin":
+        return np.uint64
     elif ext == ".u8bin":
         return np.ubyte
     elif ext == ".i8bin":
@@ -31,6 +35,8 @@ def suffix_from_dtype(dtype):
         return ".hbin"
     elif dtype == np.int32:
         return ".ibin"
+    elif dtype == np.uint64:
+        return ".u64bin"
     elif dtype == np.ubyte:
         return ".u8bin"
     elif dtype == np.byte:
@@ -39,30 +45,102 @@ def suffix_from_dtype(dtype):
         raise RuntimeError("Not supported dtype extension" + dtype)
 
 
+def neighbor_index_dtype(n_base: int) -> np.dtype:
+    """Pick the on-disk dtype for neighbor IDs of an ``n_base``-row set."""
+    # Row counts beyond the int32 range need the 64-bit unsigned format.
+    fits_int32 = n_base <= np.iinfo(np.int32).max
+    return np.int32 if fits_int32 else np.uint64
+
+
+def neighbor_index_accumulator_dtype(n_base: int) -> np.dtype:
+    """Pick the in-memory dtype for neighbor IDs while computing GT.
+
+    Bases with more than ``INT32_MAX`` rows accumulate in ``int64`` so that
+    adding batch offsets cannot overflow; smaller bases stay in ``int32``.
+    The cast to :func:`neighbor_index_dtype` happens only when writing.
+    """
+    int32_limit = np.iinfo(np.int32).max
+    wide = n_base > int32_limit
+    return np.int64 if wide else np.int32
+
+
+def groundtruth_neighbors_filename(n_base: int) -> str:
+    """Name the ground-truth neighbors file for an ``n_base``-row base set."""
+    if n_base <= np.iinfo(np.int32).max:
+        return "groundtruth.neighbors.ibin"
+    return "groundtruth.neighbors.u64bin"
+
+
+def offset_neighbor_indices(indices, batch_offset: int, n_base: int):
+    """Cast ``indices`` to the accumulator dtype and add ``batch_offset``."""
+    widened = indices.astype(neighbor_index_accumulator_dtype(n_base))
+    return widened + batch_offset
+
+
+def write_groundtruth_neighbors(path, indices, n_base: int):
+    """Write neighbor IDs to ``path`` in the dtype chosen for ``n_base``."""
+    # Arrays cast via ``astype`` (CuPy rejects ``np.asarray``); lists convert.
+    cast = getattr(indices, "astype", None) or np.array(indices).astype
+    write_bin(path, cast(neighbor_index_dtype(n_base), copy=False))
+
+
 def memmap_bin_file(
     bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32
 ):
-    extent_itemsize = np.dtype(size_dtype).itemsize
-    offset = int(extent_itemsize) * 2
+    """Memory-map a cuvs-bench binary file.
+
+    Supports both the legacy 8-byte ``[uint32 n_rows, uint32 n_cols]`` and
+    the extended 16-byte ``[uint64 n_rows, uint64 n_cols]`` headers. In read
+    mode the layout is auto-detected from the file size; in write mode the
+    legacy layout is used unless ``size_dtype=np.uint64`` or one of the shape
+    dimensions exceeds ``UINT32_MAX``.
+
+    Parameters
+    ----------
+    bin_file : str or None
+        Path to the binary file. ``None`` short-circuits and returns ``None``
+        (preserves the historical "skip optional file" behavior).
+    dtype : numpy dtype or None
+        Element dtype. If ``None``, inferred from the file extension via
+        :func:`dtype_from_filename`.
+    shape : tuple or None
+        Read mode: optionally override ``(n_rows, n_cols)`` from the header;
+        any ``None`` entries are filled in from the header value. Write mode:
+        required ``(n_rows, n_cols)`` of the file to create.
+    mode : str
+        Standard ``np.memmap`` mode string (``"r"``, ``"r+"``, ``"w+"``).
+    size_dtype : numpy dtype
+        Write mode only: ``np.uint32`` for the legacy 8-byte header (default),
+        or ``np.uint64`` to force the extended 16-byte header. Ignored in read
+        mode (auto-detected).
+    """
     if bin_file is None:
         return None
     if dtype is None:
         dtype = dtype_from_filename(bin_file)
+    itemsize = np.dtype(dtype).itemsize
+
+    if shape is not None and len(shape) != 2:
+        raise ValueError(
+            f"shape must have exactly 2 dimensions (n_rows, n_cols), got {shape!r}"
+        )
 
     if mode[0] == "r":
-        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
+        n_rows, n_cols, header_bytes = read_bin_header(bin_file, itemsize)
         if shape is None:
-            shape = (a[0], a[1])
+            final_shape = (n_rows, n_cols)
         else:
-            shape = tuple(
-                [
-                    aval if sval is None else sval
-                    for aval, sval in zip(a, shape)
-                ]
+            header_dims = (n_rows, n_cols)
+            final_shape = tuple(
+                aval if sval is None else sval
+                for aval, sval in zip(header_dims, shape)
             )
-
         return np.memmap(
-            bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape
+            bin_file,
+            mode=mode,
+            dtype=dtype,
+            offset=header_bytes,
+            shape=final_shape,
         )
     elif mode[0] == "w":
         if shape is None:
@@ -72,19 +150,22 @@ def memmap_bin_file(
         dirname = os.path.dirname(bin_file)
         if len(dirname) > 0:
             os.makedirs(dirname, exist_ok=True)
-        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
-        a[0] = shape[0]
-        a[1] = shape[1]
-        a.flush()
-        del a
-        fp = np.memmap(
-            bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape
+        with open(bin_file, "wb") as f:
+            header_bytes = write_bin_header(
+                f, shape[0], shape[1], size_dtype=size_dtype
+            )
+        return np.memmap(
+            bin_file,
+            mode="r+",
+            dtype=dtype,
+            offset=header_bytes,
+            shape=shape,
         )
-        return fp
 
 
 def write_bin(fname, data):
+    """Write a 2-D numpy array to a cuvs-bench binary file."""
     print("writing", fname, data.shape, data.dtype, "...")
     with open(fname, "wb") as f:
-        np.asarray(data.shape, dtype=np.uint32).tofile(f)
+        write_bin_header(f, data.shape[0], data.shape[1])
         data.tofile(f)
diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py
index 5647ece771..0fba915cae 100644
--- a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py
+++ b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py
@@ -1,5 +1,5 @@
 #
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 
 
@@ -9,21 +9,27 @@
 
 import numpy as np
 
+from cuvs_bench._bin_format import read_bin_header, write_bin_header
+
 
 def read_fbin(fname):
-    shape = np.fromfile(fname, dtype=np.uint32, count=2)
-    if float(shape[0]) * shape[1] * 4 > 2_000_000_000:
-        data = np.memmap(fname, dtype=np.float32, offset=8, mode="r").reshape(
-            shape
-        )
+    itemsize = np.dtype(np.float32).itemsize
+    n_rows, n_cols, header_bytes = read_bin_header(fname, itemsize)
+    shape = (n_rows, n_cols)
+    if float(n_rows) * n_cols * itemsize > 2_000_000_000:
+        data = np.memmap(
+            fname, dtype=np.float32, offset=header_bytes, mode="r"
+        ).reshape(shape)
     else:
-        data = np.fromfile(fname, dtype=np.float32, offset=8).reshape(shape)
+        data = np.fromfile(
+            fname, dtype=np.float32, offset=header_bytes
+        ).reshape(shape)
     return data
 
 
 def write_bin(fname, data):
     with open(fname, "wb") as f:
-        np.asarray(data.shape, dtype=np.uint32).tofile(f)
+        write_bin_header(f, data.shape[0], data.shape[1])
         data.tofile(f)
 
 
diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py
index af6a7aac31..2998a5dcb2 100644
--- a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py
+++ b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py
@@ -1,5 +1,5 @@
 #
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 
 
@@ -8,6 +8,8 @@
 import h5py
 import numpy as np
 
+from cuvs_bench._bin_format import write_bin_header
+
 
 def normalize(x):
     norm = np.linalg.norm(x, axis=1)
@@ -16,7 +18,7 @@ def normalize(x):
 
 def write_bin(fname, data):
     with open(fname, "wb") as f:
-        np.asarray(data.shape, dtype=np.uint32).tofile(f)
+        write_bin_header(f, data.shape[0], data.shape[1])
         data.tofile(f)
 
 
diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
index db1fd7b137..01dd803a07 100644
--- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py
+++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py
@@ -6,10 +6,19 @@
 Unit tests for shared backend utilities and Dataset transparent loading.
 """
 
+import struct
+
 import numpy as np
 import pytest
 import yaml
 
+from cuvs_bench._bin_format import (
+    EXTENDED_HEADER_BYTES,
+    LEGACY_HEADER_BYTES,
+    UINT32_MAX,
+    read_bin_header,
+    write_bin_header,
+)
 from cuvs_bench.backends import Dataset
 from cuvs_bench.backends._utils import (
     compute_recall,
@@ -17,13 +26,23 @@
     expand_param_grid,
     load_vectors,
 )
+from cuvs_bench.generate_groundtruth.utils import (
+    groundtruth_neighbors_filename,
+    memmap_bin_file,
+    neighbor_index_accumulator_dtype,
+    neighbor_index_dtype,
+    offset_neighbor_indices,
+    write_groundtruth_neighbors,
+)
 from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader
 
 
-def _write_test_bin(path, data):
-    """Write a numpy array in big-ann-bench binary format."""
+def _write_test_bin(path, data, *, size_dtype=np.uint32):
+    """Write a numpy array in cuvs-bench binary format."""
     with open(path, "wb") as f:
-        np.asarray(data.shape, dtype=np.uint32).tofile(f)
+        write_bin_header(
+            f, data.shape[0], data.shape[1], size_dtype=size_dtype
+        )
         data.tofile(f)
 
 
@@ -42,6 +61,10 @@ def test_ibin(self):
         """Test .ibin maps to int32."""
         assert dtype_from_filename("groundtruth.ibin") == np.int32
 
+    def test_u64bin(self):  # multi-dot name, as used for ground truth
+        """Test .u64bin maps to uint64."""
+        assert dtype_from_filename("groundtruth.neighbors.u64bin") == np.uint64
+
     def test_u8bin(self):
         """Test .u8bin maps to uint8."""
         assert dtype_from_filename("vectors.u8bin") == np.ubyte
@@ -170,7 +193,7 @@ def test_truncated_data(self, tmp_path):
             np.array([10, 4], dtype=np.uint32).tofile(f)
             np.random.rand(5, 4).astype(np.float32).tofile(f)
 
-        with pytest.raises(ValueError, match="File is truncated"):
+        with pytest.raises(ValueError, match="does not match either"):
             load_vectors(path)
 
     def test_file_not_found(self):
@@ -178,6 +201,322 @@ def test_file_not_found(self):
         with pytest.raises(FileNotFoundError):
             load_vectors("/nonexistent/path/vectors.fbin")
 
+    def test_load_uint64_header(self, tmp_path):
+        """``load_vectors`` reads files written with the extended uint64 header."""
+        data = np.random.rand(40, 16).astype(np.float32)
+        path = str(tmp_path / "test.fbin")
+        _write_test_bin(path, data, size_dtype=np.uint64)  # 16-byte header
+
+        # Sanity check: file really uses the extended layout.
+        assert (
+            tmp_path.joinpath("test.fbin").stat().st_size
+            == EXTENDED_HEADER_BYTES + data.nbytes
+        )
+        loaded = load_vectors(path)
+        np.testing.assert_array_equal(loaded, data)
+
+    def test_load_uint64_header_with_subset(self, tmp_path):
+        """``subset_size`` works regardless of which header layout was used."""
+        data = np.random.rand(50, 8).astype(np.float32)
+        path = str(tmp_path / "test.fbin")
+        _write_test_bin(path, data, size_dtype=np.uint64)  # 16-byte header
+
+        loaded = load_vectors(path, subset_size=12)  # 12 of the 50 rows
+        assert loaded.shape == (12, 8)
+        np.testing.assert_array_equal(loaded, data[:12])
+
+    @pytest.mark.parametrize(
+        "ext, dtype, size_dtype",
+        [
+            (".fbin", np.float32, np.uint32),
+            (".fbin", np.float32, np.uint64),
+            (".f16bin", np.float16, np.uint32),
+            (".f16bin", np.float16, np.uint64),
+            (".ibin", np.int32, np.uint32),
+            (".ibin", np.int32, np.uint64),
+            (".u64bin", np.uint64, np.uint32),
+            (".u64bin", np.uint64, np.uint64),
+            (".u8bin", np.uint8, np.uint32),
+            (".u8bin", np.uint8, np.uint64),
+            (".i8bin", np.int8, np.uint32),
+            (".i8bin", np.int8, np.uint64),
+        ],
+    )
+    def test_load_roundtrip_all_dtypes(self, tmp_path, ext, dtype, size_dtype):
+        """Round-trip every supported dtype through both header layouts."""
+        if np.issubdtype(dtype, np.integer):
+            info = np.iinfo(dtype)
+            data = np.random.randint(
+                info.min, info.max, size=(25, 7), dtype=dtype
+            )
+            if dtype == np.uint64:
+                data[0, 0] = np.iinfo(np.int32).max + 42  # exceeds INT32_MAX
+        else:
+            data = np.random.rand(25, 7).astype(dtype)
+        path = str(tmp_path / f"test{ext}")
+        _write_test_bin(path, data, size_dtype=size_dtype)
+
+        loaded = load_vectors(path)
+        np.testing.assert_array_equal(loaded, data)
+
+
+class TestGroundtruthNeighborFormat:
+    """Tests for large-base ground-truth neighbor index format selection."""
+
+    def test_neighbor_index_dtype_small_base(self):
+        assert neighbor_index_dtype(1_000_000) == np.int32
+
+    def test_neighbor_index_dtype_large_base(self):
+        assert neighbor_index_dtype(np.iinfo(np.int32).max + 1) == np.uint64
+
+    def test_neighbor_index_accumulator_dtype_large_base(self):
+        assert (
+            neighbor_index_accumulator_dtype(np.iinfo(np.int32).max + 1)
+            == np.int64
+        )
+
+    def test_groundtruth_neighbors_filename_small_base(self):
+        assert (
+            groundtruth_neighbors_filename(1_000_000)
+            == "groundtruth.neighbors.ibin"
+        )
+
+    def test_groundtruth_neighbors_filename_large_base(self):
+        assert (
+            groundtruth_neighbors_filename(np.iinfo(np.int32).max + 1)
+            == "groundtruth.neighbors.u64bin"
+        )
+
+    def test_load_u64bin_preserves_large_indices(self, tmp_path):
+        """uint64 GT files preserve neighbor IDs above INT32_MAX."""
+        large_id = np.iinfo(np.int32).max + 12345  # does not fit in int32
+        indices = np.array([[large_id, 0, 1]], dtype=np.uint64)
+        path = str(tmp_path / "gt.u64bin")
+        _write_test_bin(path, indices)  # default (legacy) header
+
+        loaded = load_vectors(path)
+        np.testing.assert_array_equal(loaded, indices)
+
+    def test_offset_neighbor_indices_small_base(self):
+        local = np.array([[0, 1, 2]], dtype=np.uint32)  # cast to int32 below
+        offset = offset_neighbor_indices(local, 1000, 1_000_000)
+        assert offset.dtype == np.int32
+        np.testing.assert_array_equal(offset, [[1000, 1001, 1002]])
+
+    def test_offset_neighbor_indices_large_batch_offset(self):
+        """Search-local IDs must not wrap when batch offset exceeds INT32_MAX."""
+        batch_offset = np.iinfo(np.int32).max + 1
+        n_base = batch_offset + 10  # above INT32_MAX -> int64 accumulator
+        local = np.array([[0, 1, 2]], dtype=np.int64)
+        offset = offset_neighbor_indices(local, batch_offset, n_base)
+        assert offset.dtype == np.int64
+        np.testing.assert_array_equal(
+            offset,
+            [[batch_offset, batch_offset + 1, batch_offset + 2]],
+        )
+
+    def test_write_groundtruth_neighbors_round_trip(self, tmp_path):
+        """GT write/load preserves neighbor IDs above INT32_MAX."""
+        n_base = np.iinfo(np.int32).max + 1
+        large_id = n_base + 999
+        indices = np.array([[large_id, large_id - 1, 0]], dtype=np.int64)
+        path = str(tmp_path / groundtruth_neighbors_filename(n_base))
+        write_groundtruth_neighbors(path, indices, n_base)
+
+        loaded = load_vectors(path)
+        assert loaded.dtype == np.uint64  # dtype comes from the suffix
+        np.testing.assert_array_equal(loaded, indices.astype(np.uint64))
+
+    def test_dataset_lazy_load_u64bin_groundtruth(self, tmp_path):
+        """Dataset loads .u64bin ground truth with IDs above INT32_MAX."""
+        large_id = np.iinfo(np.int32).max + 12345  # does not fit in int32
+        gt = np.array([[large_id, 0, 1]], dtype=np.uint64)
+        path = str(tmp_path / "groundtruth.neighbors.u64bin")
+        _write_test_bin(path, gt)
+
+        dataset = Dataset(name="test", groundtruth_neighbors_file=path)
+        np.testing.assert_array_equal(dataset.groundtruth_neighbors, gt)
+
+
+class TestBinHeaderHelpers:
+    """Tests for ``cuvs_bench._bin_format.read_bin_header`` / ``write_bin_header``."""
+
+    def test_write_legacy_returns_8_bytes(self, tmp_path):
+        """Small shapes should write the 8-byte uint32 header by default."""
+        path = tmp_path / "h.bin"
+        with open(path, "wb") as f:
+            n = write_bin_header(f, 7, 3)
+        assert n == LEGACY_HEADER_BYTES
+        assert path.stat().st_size == LEGACY_HEADER_BYTES
+
+    def test_write_size_dtype_uint64_returns_16_bytes(self, tmp_path):
+        """``size_dtype=np.uint64`` should write the 16-byte uint64 header."""
+        path = tmp_path / "h.bin"
+        with open(path, "wb") as f:
+            n = write_bin_header(f, 7, 3, size_dtype=np.uint64)
+        assert n == EXTENDED_HEADER_BYTES
+        assert path.stat().st_size == EXTENDED_HEADER_BYTES
+
+    def test_write_auto_promotes_to_uint64_when_overflowing(self, tmp_path):
+        """Shapes that don't fit in uint32 should auto-promote to uint64."""
+        path = tmp_path / "h.bin"
+        with open(path, "wb") as f:
+            n = write_bin_header(f, UINT32_MAX + 1, 4)
+        assert n == EXTENDED_HEADER_BYTES
+
+    def test_write_negative_raises(self, tmp_path):
+        """Negative dimensions are rejected."""
+        path = tmp_path / "h.bin"
+        with open(path, "wb") as f:
+            with pytest.raises(ValueError, match="non-negative"):
+                write_bin_header(f, -1, 4)
+
+    def test_read_legacy_round_trip(self, tmp_path):
+        """Legacy round-trip: write 8-byte header, read it back."""
+        path = tmp_path / "x.fbin"
+        data = np.random.rand(11, 5).astype(np.float32)
+        with open(path, "wb") as f:
+            write_bin_header(f, data.shape[0], data.shape[1])
+            data.tofile(f)
+        n_rows, n_cols, hbytes = read_bin_header(str(path), itemsize=4)
+        assert (n_rows, n_cols, hbytes) == (11, 5, LEGACY_HEADER_BYTES)
+
+    def test_read_extended_round_trip(self, tmp_path):
+        """Extended round-trip: write 16-byte header, read it back."""
+        path = tmp_path / "x.fbin"
+        data = np.random.rand(11, 5).astype(np.float32)
+        with open(path, "wb") as f:
+            write_bin_header(
+                f, data.shape[0], data.shape[1], size_dtype=np.uint64
+            )
+            data.tofile(f)
+        n_rows, n_cols, hbytes = read_bin_header(str(path), itemsize=4)
+        assert (n_rows, n_cols, hbytes) == (11, 5, EXTENDED_HEADER_BYTES)
+
+    def test_read_synthesized_huge_extended_header(self, tmp_path):
+        """Extended-header file with >UINT32_MAX rows and positive n_cols.
+
+        We can't materialize the full data section, so write the header and
+        truncate to the exact file size ``read_bin_header`` expects.
+        """
+        path = tmp_path / "huge.fbin"
+        n_rows = UINT32_MAX + 17
+        n_cols = 4
+        itemsize = 4
+        expected_size = EXTENDED_HEADER_BYTES + n_rows * n_cols * itemsize
+        with open(path, "wb") as f:
+            write_bin_header(f, n_rows, n_cols)
+            f.truncate(expected_size)
+
+        assert path.stat().st_size == expected_size
+        got_rows, got_cols, hbytes = read_bin_header(
+            str(path), itemsize=itemsize
+        )
+        assert got_rows == n_rows
+        assert got_cols == n_cols
+        assert hbytes == EXTENDED_HEADER_BYTES
+
+    def test_read_file_too_small_raises(self, tmp_path):
+        """Three bytes cannot hold any header, so reading must fail."""
+        stub = tmp_path / "x.fbin"
+        stub.write_bytes(bytes(3))
+        with pytest.raises(ValueError, match="File too small"):
+            read_bin_header(str(stub), itemsize=4)
+
+    def test_read_size_mismatch_raises(self, tmp_path):
+        """Reject a file whose payload is shorter than its header claims."""
+        fbin = tmp_path / "x.fbin"
+        # Header declares 10 x 4 items, yet only 5 * 4 * 4 zero bytes follow.
+        header = struct.pack("<II", 10, 4)
+        fbin.write_bytes(header + bytes(5 * 4 * 4))
+        with pytest.raises(ValueError, match="does not match either"):
+            read_bin_header(str(fbin), itemsize=4)
+
+    def test_read_dispatch_prefers_legacy(self, tmp_path):
+        """A file that balances under the legacy layout reads as legacy.
+
+        Protects small uint32-header files from being decoded as
+        extended, where the leading 16 bytes would be taken for a pair
+        of uint64 values.
+        """
+        fbin = tmp_path / "x.fbin"
+        payload = np.arange(12, dtype=np.float32).reshape(3, 4)
+        with open(fbin, "wb") as out:
+            write_bin_header(out, 3, 4)
+            payload.tofile(out)
+        _rows, _cols, header_bytes = read_bin_header(str(fbin), itemsize=4)
+        assert header_bytes == LEGACY_HEADER_BYTES
+
+
+class TestMemmapBinFile:
+    """Exercise ``memmap_bin_file`` in read mode and in ``w+`` mode."""
+
+    def test_read_legacy_header(self, tmp_path):
+        """Read mode maps a file written with the default (legacy) header."""
+        fbin = str(tmp_path / "test.fbin")
+        expected = np.random.rand(30, 8).astype(np.float32)
+        _write_test_bin(fbin, expected)
+
+        mapped = memmap_bin_file(fbin, np.float32, mode="r")
+        assert mapped.shape == (30, 8)
+        np.testing.assert_array_equal(mapped[:], expected)
+
+    def test_read_extended_header(self, tmp_path):
+        """Read mode maps a file written with ``size_dtype=np.uint64``."""
+        fbin = str(tmp_path / "test.fbin")
+        expected = np.random.rand(30, 8).astype(np.float32)
+        _write_test_bin(fbin, expected, size_dtype=np.uint64)
+
+        mapped = memmap_bin_file(fbin, np.float32, mode="r")
+        assert mapped.shape == (30, 8)
+        np.testing.assert_array_equal(mapped[:], expected)
+
+    def test_read_partial_shape_override(self, tmp_path):
+        """A ``None`` shape entry is filled in from the file on read."""
+        fbin = str(tmp_path / "test.fbin")
+        expected = np.random.rand(50, 8).astype(np.float32)
+        _write_test_bin(fbin, expected)
+
+        head = memmap_bin_file(fbin, np.float32, shape=(10, None), mode="r")
+        assert head.shape == (10, 8)
+        np.testing.assert_array_equal(head[:], expected[:10])
+
+    def test_write_read_roundtrip_legacy(self, tmp_path):
+        """Data written through a ``w+`` memmap reads back unchanged."""
+        dims = (20, 8)
+        expected = np.random.rand(*dims).astype(np.float32)
+        fbin = str(tmp_path / "test.fbin")
+
+        writer = memmap_bin_file(fbin, np.float32, shape=dims, mode="w+")
+        writer[:] = expected
+        writer.flush()
+        del writer  # drop the writable mapping before reopening
+
+        reader = memmap_bin_file(fbin, np.float32, mode="r")
+        assert reader.shape == dims
+        np.testing.assert_array_equal(reader[:], expected)
+
+    def test_write_read_roundtrip_extended(self, tmp_path):
+        """A ``w+`` memmap with ``size_dtype=np.uint64`` round-trips."""
+        dims = (20, 8)
+        expected = np.random.rand(*dims).astype(np.float32)
+        bin_file = tmp_path / "test.fbin"
+        fbin = str(bin_file)
+
+        writer = memmap_bin_file(
+            fbin, np.float32, shape=dims, mode="w+", size_dtype=np.uint64
+        )
+        writer[:] = expected
+        writer.flush()
+        del writer  # drop the writable mapping before reopening
+
+        reader = memmap_bin_file(fbin, np.float32, mode="r")
+        assert reader.shape == dims
+        np.testing.assert_array_equal(reader[:], expected)
+        # On disk: the extended header followed by the raw payload bytes.
+        on_disk = bin_file.stat().st_size
+        assert on_disk == EXTENDED_HEADER_BYTES + expected.nbytes
+
 
 class TestDatasetLazyLoading:
     """Tests for Dataset transparent vector loading."""
@@ -430,3 +769,10 @@ def test_multiple_queries(self):
         groundtruth = np.array([[0, 1, 2], [3, 4, 5]])
         recall = compute_recall(neighbors, groundtruth, k=3)
         assert abs(recall - 5.0 / 6.0) < 1e-9
+
+    def test_large_uint64_neighbor_ids(self):
+        """IDs past the int32 range still match across int64 and uint64."""
+        big = np.iinfo(np.int32).max + 999
+        found = np.array([[big, 0, 1]], dtype=np.int64)
+        truth = np.array([[big, 0, 1]], dtype=np.uint64)
+        assert compute_recall(found, truth, k=3) == 1.0