Skip to content
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
797e665
Initial addition of numpy ndarrays to BinaryVector. New tests
caseyclements Oct 15, 2025
47fc92c
Updates changelog
caseyclements Oct 15, 2025
7882f75
Typing
caseyclements Oct 15, 2025
b7556fb
Added numpy extra to typing_run
caseyclements Oct 15, 2025
5753e3b
Adds numpy.typing.NDArray
caseyclements Oct 15, 2025
d3407d7
Adds numpy.typing.NDArray
caseyclements Oct 15, 2025
9ae90e8
Removed match/case that will have to wait.
caseyclements Oct 15, 2025
aae159f
Put guard around check involving numpy
caseyclements Oct 16, 2025
9fadf97
Moved test_vector_from_numpy to end of vector tests
caseyclements Oct 16, 2025
40120e7
Fixed link in BinaryVector docstring
caseyclements Oct 16, 2025
be06ce7
Tiny adjustment of BinaryVector docstring
caseyclements Oct 16, 2025
f03b943
Convert Assertion to Value and Type Errors
caseyclements Oct 17, 2025
3cc5041
Numpy now lazily imported. For typing, removed numpy extra. justfile …
caseyclements Oct 17, 2025
e3b894b
Added validation in as_numpy_vector
caseyclements Oct 21, 2025
0b0a50b
Merge remote-tracking branch 'origin/master' into INTPYTHON-5355-Nump…
caseyclements Oct 27, 2025
10da245
Update bson/binary.py
caseyclements Oct 27, 2025
73910ce
PYTHON-5628 - Update the link for help in the documentation (#2602)
NoahStapp Oct 27, 2025
8dec0d3
Renamed just target test-bson to test-numpy
caseyclements Oct 27, 2025
9420ec1
as_numpy_vector refactored to as_vector(return_numpy=True)
caseyclements Oct 29, 2025
b8d9719
Checkpoint - working on numpy test variants
caseyclements Nov 25, 2025
cd053fc
Explicitly assert that data in BinaryVector.as_vector(return_numpy=Tr…
caseyclements Dec 3, 2025
f494af9
Improved TypeError message.
caseyclements Dec 3, 2025
a2eb6f4
Configure evergreen: adds a new function, and 5 variants each contai…
caseyclements Dec 3, 2025
b33ed17
Sync feature with master
caseyclements Dec 3, 2025
591d12e
Fixed typo in changelog merge
caseyclements Dec 3, 2025
b46468f
Added python versions to generated test configs of test-numpy
caseyclements Dec 4, 2025
1a408cf
Adjusting task naming for selectors
caseyclements Dec 4, 2025
56f52f6
Removed pypy from test matrix of test-numpy
caseyclements Dec 4, 2025
7576780
Add pr tag just to rhel8
caseyclements Dec 4, 2025
f77ac9b
Merge remote-tracking branch 'upstream/master' into INTPYTHON-5355-Nu…
caseyclements Dec 4, 2025
150725a
Remove stale todo as we did not add a pytest marker
caseyclements Dec 4, 2025
a7d1210
Update doc/changelog.rst
Jibola Dec 5, 2025
178403d
Remove unused args from just test-numpy
caseyclements Dec 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .evergreen/generated_configs/tasks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4768,6 +4768,14 @@ tasks:
- noauth
- pypy

# Test numpy tests
- name: test-numpy
commands:
- func: run test numpy
vars:
TEST_NAME: test_numpy
tags: [binary, vector]

# Test standard auth tests
- name: test-standard-auth-v4.2-python3.10-auth-ssl-sharded-cluster
commands:
Expand Down
43 changes: 43 additions & 0 deletions .evergreen/generated_configs/variants.yml
Original file line number Diff line number Diff line change
Expand Up @@ -612,3 +612,46 @@ buildvariants:
- rhel87-small
expansions:
STORAGE_ENGINE: inmemory

# Test numpy tests
- name: test-numpy-rhel8
tasks:
- name: .test-numpy
display_name: Test Numpy RHEL8
run_on:
- rhel87-small
tags: [binary-vector]
- name: test-numpy-macos
tasks:
- name: .test-numpy
display_name: Test Numpy macOS
run_on:
- macos-14
tags: [binary-vector]
- name: test-numpy-macos-arm64
tasks:
- name: .test-numpy !.pypy .server-6.0
- name: .test-numpy !.pypy .server-7.0
- name: .test-numpy !.pypy .server-8.0
- name: .test-numpy !.pypy .server-rapid
- name: .test-numpy !.pypy .server-latest
display_name: Test Numpy macOS Arm64
run_on:
- macos-14-arm64
tags: [binary-vector]
- name: test-numpy-win64
tasks:
- name: .test-numpy
display_name: Test Numpy Win64
run_on:
- windows-64-vsMulti-small
tags: [binary-vector]
- name: test-numpy-win32
tasks:
- name: .test-numpy
display_name: Test Numpy Win32
run_on:
- windows-64-vsMulti-small
expansions:
IS_WIN32: "1"
tags: [binary-vector]
35 changes: 35 additions & 0 deletions .evergreen/scripts/generate_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,38 @@ def create_disable_test_commands_variants():
return [create_variant(tasks, display_name, host=host, expansions=expansions)]


def create_test_numpy_tasks():
    """Build the Evergreen task that runs the numpy test suite.

    The task calls the ``run test numpy`` function with ``TEST_NAME`` set to
    ``test_numpy``.

    :return: A single-element list holding the ``test-numpy`` task.
    """
    task_name = "test-numpy"
    # Build variants select this task with the ``.test-numpy`` *tag* selector,
    # so the task must carry that tag in addition to its descriptive ones;
    # otherwise the selector matches nothing.
    tags = ["binary", "vector", task_name]
    # Deliberately not named ``vars``: that would shadow the builtin.
    test_vars = dict(TEST_NAME="test_numpy")
    test_func = FunctionCall(func="run test numpy", vars=test_vars)
    return [EvgTask(name=task_name, tags=tags, commands=[test_func])]


def create_test_numpy_variants() -> list[BuildVariant]:
    """Build one "Test Numpy" build variant per supported host.

    Every variant selects its tasks through the ``.test-numpy`` tag selector
    and is tagged ``binary-vector``. The ``win32`` variant additionally sets
    the ``IS_WIN32`` expansion to ``"1"``.

    :return: The variants, in the order rhel8, macos, macos-arm64, win64, win32.
    """
    base_display_name = "Test Numpy"
    variants = []

    for host_name in ("rhel8", "macos", "macos-arm64", "win64", "win32"):
        host = HOSTS[host_name]
        # The numpy task carries no ``server-*`` or ``pypy`` tags, so the
        # selector must not be narrowed with ``.server-<version>`` / ``!.pypy``
        # criteria on any host (macos-arm64 included): an intersection with a
        # tag the task does not have would match no task at all.
        tasks = [".test-numpy"]
        expansions = {"IS_WIN32": "1"} if host_name == "win32" else {}
        variants.append(
            create_variant(
                tasks,
                get_variant_name(base_display_name, host),
                host=host,
                tags=["binary-vector"],
                expansions=expansions,
            )
        )

    return variants


def create_oidc_auth_variants():
variants = []
for host_name in ["ubuntu22", "macos", "win64"]:
Expand Down Expand Up @@ -1174,3 +1206,6 @@ def create_send_dashboard_data_func():
write_variants_to_file(mod)
write_tasks_to_file(mod)
write_functions_to_file(mod)

# TODO - Create a new variant here that drives run-test
# Workfromrove
4 changes: 4 additions & 0 deletions .evergreen/scripts/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,7 @@ def run() -> None:

if __name__ == "__main__":
run()


# TODO - Make changes here to create a pytest marker that runs as desired
# Use encryption as example
212 changes: 149 additions & 63 deletions bson/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@
from array import array as _array
from mmap import mmap as _mmap

import numpy as np
import numpy.typing as npt


class UuidRepresentation:
UNSPECIFIED = 0
Expand Down Expand Up @@ -234,13 +237,20 @@ class BinaryVector:

__slots__ = ("data", "dtype", "padding")

def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
def __init__(
self,
data: Union[Sequence[float | int], npt.NDArray[np.number]],
dtype: BinaryVectorDtype,
padding: int = 0,
):
"""
:param data: Sequence of numbers representing the mathematical vector.
:param dtype: The data type stored in binary
:param padding: The number of bits in the final byte that are to be ignored
when a vector element's size is less than a byte
and the length of the vector is not a multiple of 8.
(Padding is equivalent to a negative value of `count` in
`numpy.unpackbits <https://numpy.org/doc/stable/reference/generated/numpy.unpackbits.html>`_)
"""
self.data = data
self.dtype = dtype
Expand Down Expand Up @@ -425,9 +435,19 @@ def from_vector(
...

@classmethod
@overload
def from_vector(
cls: Type[Binary],
vector: Union[BinaryVector, list[int], list[float]],
vector: npt.NDArray[np.number],
dtype: BinaryVectorDtype,
padding: int = 0,
) -> Binary:
...

@classmethod
def from_vector(
cls: Type[Binary],
vector: Union[BinaryVector, list[int], list[float], npt.NDArray[np.number]],
dtype: Optional[BinaryVectorDtype] = None,
padding: Optional[int] = None,
) -> Binary:
Expand Down Expand Up @@ -459,34 +479,70 @@ def from_vector(
vector = vector.data # type: ignore

padding = 0 if padding is None else padding
if dtype == BinaryVectorDtype.INT8: # pack ints in [-128, 127] as signed int8
format_str = "b"
if padding:
raise ValueError(f"padding does not apply to {dtype=}")
elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8
format_str = "B"
if 0 <= padding > 7:
raise ValueError(f"{padding=}. It must be in [0,1, ..7].")
if padding and not vector:
raise ValueError("Empty vector with non-zero padding.")
elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32
format_str = "f"
if padding:
raise ValueError(f"padding does not apply to {dtype=}")
else:
raise NotImplementedError("%s not yet supported" % dtype)

if not isinstance(dtype, BinaryVectorDtype):
raise TypeError(
"dtype must be a bson.BinaryVectorDtype of BinaryVectorDType.INT8, PACKED_BIT, FLOAT32"
)
metadata = struct.pack("<sB", dtype.value, padding)
data = struct.pack(f"<{len(vector)}{format_str}", *vector) # type: ignore

if isinstance(vector, list):
if dtype == BinaryVectorDtype.INT8: # pack ints in [-128, 127] as signed int8
format_str = "b"
if padding:
raise ValueError(f"padding does not apply to {dtype=}")
elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8
format_str = "B"
if 0 <= padding > 7:
raise ValueError(f"{padding=}. It must be in [0,1, ..7].")
if padding and not vector:
raise ValueError("Empty vector with non-zero padding.")
elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32
format_str = "f"
if padding:
raise ValueError(f"padding does not apply to {dtype=}")
else:
raise NotImplementedError("%s not yet supported" % dtype)
data = struct.pack(f"<{len(vector)}{format_str}", *vector)
else: # vector is numpy array or incorrect type.
try:
import numpy as np
except ImportError as exc:
raise ImportError(
"Failed to create binary from vector. Check type. If numpy array, numpy must be installed."
) from exc
if not isinstance(vector, np.ndarray):
raise TypeError("Vector must be a numpy array.")
if vector.ndim != 1:
raise ValueError(
"from_numpy_vector only supports 1D arrays as it creates a single vector."
)

if dtype == BinaryVectorDtype.FLOAT32:
vector = vector.astype(np.dtype("float32"), copy=False)
elif dtype == BinaryVectorDtype.INT8:
if vector.min() >= -128 and vector.max() <= 127:
vector = vector.astype(np.dtype("int8"), copy=False)
else:
raise ValueError("Values found outside INT8 range.")
elif dtype == BinaryVectorDtype.PACKED_BIT:
if vector.min() >= 0 and vector.max() <= 127:
vector = vector.astype(np.dtype("uint8"), copy=False)
else:
raise ValueError("Values found outside UINT8 range.")
else:
raise NotImplementedError("%s not yet supported" % dtype)
data = vector.tobytes()

if padding and len(vector) and not (data[-1] & ((1 << padding) - 1)) == 0:
raise ValueError(
"Vector has a padding P, but bits in the final byte lower than P are non-zero. They must be zero."
)
return cls(metadata + data, subtype=VECTOR_SUBTYPE)

def as_vector(self) -> BinaryVector:
"""From the Binary, create a list of numbers, along with dtype and padding.
def as_vector(self, return_numpy: bool = False) -> BinaryVector:
"""From the Binary, create a list or 1-d numpy array of numbers, along with dtype and padding.

:param return_numpy: If True, BinaryVector.data will be a one-dimensional numpy array. By default, it is a list.
:return: BinaryVector

.. versionadded:: 4.10
Expand All @@ -495,54 +551,84 @@ def as_vector(self) -> BinaryVector:
if self.subtype != VECTOR_SUBTYPE:
raise ValueError(f"Cannot decode subtype {self.subtype} as a vector")

position = 0
dtype, padding = struct.unpack_from("<sB", self, position)
position += 2
dtype, padding = struct.unpack_from("<sB", self)
dtype = BinaryVectorDtype(dtype)
n_values = len(self) - position
offset = 2
n_bytes = len(self) - offset

if padding and dtype != BinaryVectorDtype.PACKED_BIT:
raise ValueError(
f"Corrupt data. Padding ({padding}) must be 0 for all but PACKED_BIT dtypes. ({dtype=})"
)

if dtype == BinaryVectorDtype.INT8:
dtype_format = "b"
format_string = f"<{n_values}{dtype_format}"
vector = list(struct.unpack_from(format_string, self, position))
return BinaryVector(vector, dtype, padding)

elif dtype == BinaryVectorDtype.FLOAT32:
n_bytes = len(self) - position
n_values = n_bytes // 4
if n_bytes % 4:
raise ValueError(
"Corrupt data. N bytes for a float32 vector must be a multiple of 4."
)
dtype_format = "f"
format_string = f"<{n_values}{dtype_format}"
vector = list(struct.unpack_from(format_string, self, position))
return BinaryVector(vector, dtype, padding)

elif dtype == BinaryVectorDtype.PACKED_BIT:
# data packed as uint8
if padding and not n_values:
raise ValueError("Corrupt data. Vector has a padding P, but no data.")
if padding > 7 or padding < 0:
raise ValueError(f"Corrupt data. Padding ({padding}) must be between 0 and 7.")
dtype_format = "B"
format_string = f"<{n_values}{dtype_format}"
unpacked_uint8s = list(struct.unpack_from(format_string, self, position))
if padding and n_values and unpacked_uint8s[-1] & (1 << padding) - 1 != 0:
warnings.warn(
"Vector has a padding P, but bits in the final byte lower than P are non-zero. For pymongo>=5.0, they must be zero.",
DeprecationWarning,
stacklevel=2,
)
return BinaryVector(unpacked_uint8s, dtype, padding)

else:
raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)
if not return_numpy:
if dtype == BinaryVectorDtype.INT8:
dtype_format = "b"
format_string = f"<{n_bytes}{dtype_format}"
vector = list(struct.unpack_from(format_string, self, offset))
return BinaryVector(vector, dtype, padding)

elif dtype == BinaryVectorDtype.FLOAT32:
n_values = n_bytes // 4
if n_bytes % 4:
raise ValueError(
"Corrupt data. N bytes for a float32 vector must be a multiple of 4."
)
dtype_format = "f"
format_string = f"<{n_values}{dtype_format}"
vector = list(struct.unpack_from(format_string, self, offset))
return BinaryVector(vector, dtype, padding)

elif dtype == BinaryVectorDtype.PACKED_BIT:
# data packed as uint8
if padding and not n_bytes:
raise ValueError("Corrupt data. Vector has a padding P, but no data.")
if padding > 7 or padding < 0:
raise ValueError(f"Corrupt data. Padding ({padding}) must be between 0 and 7.")
dtype_format = "B"
format_string = f"<{n_bytes}{dtype_format}"
unpacked_uint8s = list(struct.unpack_from(format_string, self, offset))
if padding and n_bytes and unpacked_uint8s[-1] & (1 << padding) - 1 != 0:
warnings.warn(
"Vector has a padding P, but bits in the final byte lower than P are non-zero. For pymongo>=5.0, they must be zero.",
DeprecationWarning,
stacklevel=2,
)
return BinaryVector(unpacked_uint8s, dtype, padding)

else:
raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)
else: # create a numpy array
try:
import numpy as np
except ImportError as exc:
raise ImportError(
"Converting binary to numpy.ndarray requires numpy to be installed."
) from exc
if dtype == BinaryVectorDtype.INT8:
data = np.frombuffer(self[offset:], dtype="int8")
elif dtype == BinaryVectorDtype.FLOAT32:
if n_bytes % 4:
raise ValueError(
"Corrupt data. N bytes for a float32 vector must be a multiple of 4."
)
data = np.frombuffer(self[offset:], dtype="float32")
elif dtype == BinaryVectorDtype.PACKED_BIT:
# data packed as uint8
if padding and not n_bytes:
raise ValueError("Corrupt data. Vector has a padding P, but no data.")
if padding > 7 or padding < 0:
raise ValueError(f"Corrupt data. Padding ({padding}) must be between 0 and 7.")
data = np.frombuffer(self[offset:], dtype="uint8")
if padding and np.unpackbits(data[-1])[-padding:].sum() > 0:
warnings.warn(
"Vector has a padding P, but bits in the final byte lower than P are non-zero. For pymongo>=5.0, they must be zero.",
DeprecationWarning,
stacklevel=2,
)
else:
raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)
return BinaryVector(data, dtype, padding)

@property
def subtype(self) -> int:
Expand Down
1 change: 1 addition & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ PyMongo 4.16 brings a number of changes including:
Python 3.10+. The minimum version is ``2.6.1`` to account for `CVE-2023-29483 <https://www.cve.org/CVERecord?id=CVE-2023-29483>`_.
- Removed support for Eventlet.
Eventlet is actively being sunset by its maintainers and has compatibility issues with PyMongo's dnspython dependency.
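- Added support for one-dimensional NumPy arrays in BSON Binary Vectors: ``Binary.from_vector`` now accepts a 1-D ``numpy.ndarray``, and ``Binary.as_vector(return_numpy=True)`` returns the vector data as one.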

Changes in Version 4.15.3 (2025/10/07)
--------------------------------------
Expand Down
Loading