Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions python/cuvs_bench/cuvs_bench/_bin_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
#

"""
On-disk header helpers for the cuvs-bench binary file format.

cuvs-bench inherits the big-ann-benchmarks binary layout: a small header
listing ``n_rows`` and ``n_cols`` followed by a dense ``n_rows * n_cols``
array of the dtype implied by the file extension. Two layouts are supported:

- **Legacy**: ``[uint32 n_rows, uint32 n_cols, data ...]`` (8-byte header).
This is what every existing ``.fbin`` / ``.ibin`` / ``.u8bin`` / ``.i8bin``
/ ``.f16bin`` / ``.hbin`` / ``.u64bin`` file on disk uses today.

- **Extended**: ``[uint64 n_rows, uint64 n_cols, data ...]`` (16-byte header).
For datasets whose ``n_rows`` or ``n_cols`` exceeds ``UINT32_MAX`` (~4.29B).

Detection is **size-based**: a well-formed cuvs-bench binary is exactly
``header_bytes + n_rows * n_cols * itemsize`` bytes long.
:func:`read_bin_header` reads up to the first 16 bytes of the file and:

1. Tries the legacy layout (first 8 bytes as two ``uint32``s, 8-byte
header). The layout is accepted if ``8 + n_rows * n_cols * itemsize``
matches the on-disk file size.
2. Otherwise tries the extended layout (first 16 bytes as two
``uint64``s, 16-byte header). Accepted if
``16 + n_rows * n_cols * itemsize`` matches the file size instead.
3. If neither layout matches, raises ``ValueError`` -- the file is
truncated, padded, or has a mismatched dtype extension.
"""

from __future__ import annotations

import os
import struct
from typing import BinaryIO, Tuple

import numpy as np

# Largest value that fits in one unsigned 32-bit header field; anything
# above this forces the extended (uint64) header.
UINT32_MAX = 0xFFFFFFFF

# Header sizes on disk: two 4-byte fields (legacy) or two 8-byte fields
# (extended).
LEGACY_HEADER_BYTES = 2 * 4
EXTENDED_HEADER_BYTES = 2 * 8


def read_bin_header(path: str, itemsize: int) -> Tuple[int, int, int]:
    """Read the header of a cuvs-bench binary file.

    Auto-detects the on-disk layout from the file size by checking which
    of the two layouts (legacy 8-byte uint32 header, extended 16-byte uint64
    header) makes ``file_size == header_bytes + n_rows * n_cols * itemsize``
    balance. The legacy layout is tried first.

    Parameters
    ----------
    path : str
        Path to the binary file.
    itemsize : int
        Per-element size in bytes (e.g. ``4`` for ``float32``, ``1`` for
        ``int8``) used for the size-equation check. Integer-valued NumPy
        scalars are accepted and converted to a plain ``int``.

    Returns
    -------
    (n_rows, n_cols, header_bytes) : Tuple[int, int, int]
        Row count, column count, and the number of bytes the header
        occupies on disk (``8`` for legacy, ``16`` for extended).

    Raises
    ------
    ValueError
        If ``itemsize`` is not a positive integer, if the file is shorter
        than a legacy header, or if neither the legacy nor the extended
        interpretation matches the file size.
    FileNotFoundError
        If ``path`` does not exist.
    """
    # Normalise to a plain Python int so the size equation below is always
    # evaluated with arbitrary-precision arithmetic. A NumPy integer scalar
    # would otherwise pull the product into fixed-width arithmetic, which
    # can overflow when a mismatched file is probed as an extended header.
    try:
        itemsize_int = int(itemsize)
    except (TypeError, ValueError, OverflowError):
        itemsize_int = 0
    if itemsize_int < 1 or itemsize_int != itemsize:
        raise ValueError(
            f"itemsize must be a positive integer, got {itemsize!r}"
        )

    with open(path, "rb") as f:
        # Take the size from the open handle so the size and the header
        # bytes are guaranteed to describe the same file.
        file_size = os.fstat(f.fileno()).st_size
        head = f.read(EXTENDED_HEADER_BYTES)

    if len(head) < LEGACY_HEADER_BYTES:
        raise ValueError(
            f"File too small to contain a valid header (expected at least "
            f"{LEGACY_HEADER_BYTES} bytes, got {len(head)}): {path}"
        )

    # Legacy layout: two little-endian uint32 values in the first 8 bytes.
    n_rows_32, n_cols_32 = struct.unpack_from("<II", head)
    legacy_size = LEGACY_HEADER_BYTES + n_rows_32 * n_cols_32 * itemsize_int
    if file_size == legacy_size:
        return int(n_rows_32), int(n_cols_32), LEGACY_HEADER_BYTES

    # Extended layout: two little-endian uint64 values; only possible when
    # the file actually holds a full 16-byte header.
    if len(head) == EXTENDED_HEADER_BYTES:
        n_rows_64, n_cols_64 = struct.unpack("<QQ", head)
        extended_size = (
            EXTENDED_HEADER_BYTES + n_rows_64 * n_cols_64 * itemsize_int
        )
        if file_size == extended_size:
            return int(n_rows_64), int(n_cols_64), EXTENDED_HEADER_BYTES

    raise ValueError(
        f"File size {file_size:,} bytes does not match either the legacy "
        f"(8-byte uint32) or extended (16-byte uint64) header layout for "
        f"itemsize={itemsize}: {path}. The file may be truncated, padded, "
        f"or have a mismatched dtype extension."
    )


def write_bin_header(
    f: BinaryIO,
    n_rows: int,
    n_cols: int,
    *,
    size_dtype=np.uint32,
) -> int:
    """Write the canonical cuvs-bench binary header at the current position.

    The legacy 8-byte uint32 layout is used whenever both ``n_rows`` and
    ``n_cols`` fit in a ``uint32``. The 16-byte uint64 layout is used
    otherwise, or when explicitly requested via ``size_dtype=np.uint64``.

    Parameters
    ----------
    f : BinaryIO
        Open binary file handle, positioned where the header should go.
    n_rows, n_cols : int
        Header values to write. Must be non-negative and fit in a
        ``uint64``.
    size_dtype : numpy dtype
        ``np.uint32`` for the legacy 8-byte header (default), or
        ``np.uint64`` to force the extended 16-byte header.

    Returns
    -------
    int
        Number of bytes written (``8`` for legacy, ``16`` for extended).

    Raises
    ------
    ValueError
        If ``n_rows`` or ``n_cols`` is negative or does not fit in a
        ``uint64``, or if ``size_dtype`` is neither ``np.uint32`` nor
        ``np.uint64``. Nothing is written to ``f`` in that case.
    """
    if n_rows < 0 or n_cols < 0:
        raise ValueError(
            f"n_rows and n_cols must be non-negative, got ({n_rows}, {n_cols})"
        )

    # Reject any other width up front: silently falling back to the legacy
    # layout would produce a header different from the one that was asked
    # for.
    requested = np.dtype(size_dtype)
    if requested != np.uint32 and requested != np.uint64:
        raise ValueError(
            f"size_dtype must be np.uint32 or np.uint64, got {size_dtype!r}"
        )

    # Plain Python ints keep the range checks and struct.pack free of any
    # fixed-width integer semantics.
    rows, cols = int(n_rows), int(n_cols)
    uint64_max = (1 << 64) - 1
    if rows > uint64_max or cols > uint64_max:
        raise ValueError(
            f"n_rows and n_cols must fit in a uint64, got ({n_rows}, {n_cols})"
        )

    use_uint64 = (
        requested == np.uint64 or rows > UINT32_MAX or cols > UINT32_MAX
    )
    if use_uint64:
        f.write(struct.pack("<QQ", rows, cols))
        return EXTENDED_HEADER_BYTES
    f.write(struct.pack("<II", rows, cols))
    return LEGACY_HEADER_BYTES
38 changes: 20 additions & 18 deletions python/cuvs_bench/cuvs_bench/backends/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

import numpy as np

from cuvs_bench._bin_format import read_bin_header


def dtype_from_filename(filename):
"""Map file extension to numpy dtype.
Expand All @@ -53,6 +55,8 @@ def dtype_from_filename(filename):
return np.float16
elif ext == ".ibin":
return np.int32
elif ext == ".u64bin":
return np.uint64
elif ext == ".u8bin":
return np.ubyte
elif ext == ".i8bin":
Expand All @@ -65,17 +69,18 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
"""
Read a binary vector file into a numpy array.

Supports the standard big-ann-bench binary format used by cuvs-bench
datasets: a 4-byte uint32 ``n_rows``, a 4-byte uint32 ``n_cols``,
followed by ``n_rows * n_cols`` elements of the dtype inferred from
the file extension via ``dtype_from_filename``.
Supports the cuvs-bench binary format with either the legacy 8-byte
``[uint32 n_rows, uint32 n_cols]`` header or the extended 16-byte
``[uint64 n_rows, uint64 n_cols]`` header used for datasets with more
than ``UINT32_MAX`` rows or columns. The layout is auto-detected from
the file size by :func:`cuvs_bench._bin_format.read_bin_header`.

Parameters
----------
path : str
Path to the binary file. The dtype is inferred from the extension:
``.fbin`` (float32), ``.f16bin`` (float16), ``.u8bin`` (uint8),
``.i8bin`` (int8), ``.ibin`` (int32).
``.i8bin`` (int8), ``.ibin`` (int32), ``.u64bin`` (uint64).
subset_size : Optional[int]
If provided, only the first ``subset_size`` rows are loaded.

Expand All @@ -93,27 +98,24 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
or the file is truncated.
"""
dtype = dtype_from_filename(path)
if subset_size is not None and subset_size < 1:
itemsize = np.dtype(dtype).itemsize
if subset_size is not None and (
isinstance(subset_size, float) or subset_size < 1
):
raise ValueError(
f"subset_size must be a positive integer, got {subset_size}"
)
n_rows, n_cols, header_bytes = read_bin_header(path, itemsize)
if subset_size is not None:
n_rows = min(n_rows, subset_size)
Comment thread
jinsolp marked this conversation as resolved.
with open(path, "rb") as f:
header = f.read(8)
if len(header) < 8:
raise ValueError(
f"File too small to contain a valid header (expected 8 bytes, "
f"got {len(header)}): {path}"
)
n_rows = int(np.frombuffer(header[:4], dtype=np.uint32)[0])
n_cols = int(np.frombuffer(header[4:], dtype=np.uint32)[0])
if subset_size is not None:
n_rows = min(n_rows, subset_size)
expected_bytes = n_rows * n_cols * np.dtype(dtype).itemsize
f.seek(header_bytes)
expected_bytes = n_rows * n_cols * itemsize
raw = f.read(expected_bytes)
if len(raw) < expected_bytes:
raise ValueError(
f"File is truncated: expected {expected_bytes} bytes of data "
f"({n_rows} rows x {n_cols} cols x {np.dtype(dtype).itemsize} bytes), "
f"({n_rows} rows x {n_cols} cols x {itemsize} bytes), "
f"got {len(raw)}: {path}"
)
data = np.frombuffer(raw, dtype=dtype)
Expand Down
19 changes: 14 additions & 5 deletions python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,14 @@
import sys
import warnings

from .utils import memmap_bin_file, suffix_from_dtype, write_bin
from .utils import (
groundtruth_neighbors_filename,
memmap_bin_file,
offset_neighbor_indices,
suffix_from_dtype,
write_bin,
write_groundtruth_neighbors,
)


def import_with_fallback(primary_lib, secondary_lib=None, alias=None):
Expand Down Expand Up @@ -193,7 +200,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
D, Ind = cpu_search(X, queries, k, metric=metric)

D, Ind = xp.asarray(D), xp.asarray(Ind)
Ind += i # shift neighbor index by offset i
Ind = offset_neighbor_indices(Ind, i, n_samples)

if distances is None:
distances = D
Expand Down Expand Up @@ -358,9 +365,11 @@ def main():
print("Calculating true nearest neighbors")
distances, indices = calc_truth(dataset, queries, args.k, args.metric)

write_bin(
os.path.join(args.output, "groundtruth.neighbors.ibin"),
indices.astype(xp.uint32),
n_base = dataset.shape[0]
write_groundtruth_neighbors(
os.path.join(args.output, groundtruth_neighbors_filename(n_base)),
indices,
n_base,
)
write_bin(
os.path.join(args.output, "groundtruth.distances.fbin"),
Expand Down
Loading
Loading