Skip to content

Commit 033923b

Browse files
committed
Improved internal storage format
1 parent ae0f549 commit 033923b

4 files changed

Lines changed: 66 additions & 58 deletions

File tree

pgvector/halfvec.py

Lines changed: 29 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,55 +1,63 @@
11
from __future__ import annotations
22
import numpy as np
3-
from struct import pack, unpack_from
3+
import struct
44

55

66
class HalfVector:
77
def __init__(self, value: list[float] | np.ndarray[tuple[int], np.dtype[np.floating]]) -> None:
8-
# asarray still copies if same dtype
9-
if not isinstance(value, np.ndarray) or value.dtype != '>f2':
10-
value = np.asarray(value, dtype='>f2')
11-
12-
# for mypy
13-
assert isinstance(value, np.ndarray)
14-
15-
if value.ndim != 1:
16-
raise ValueError('expected ndim to be 1')
17-
18-
# atleast_1d for ty
19-
self._value = np.atleast_1d(value)
8+
if isinstance(value, list):
9+
dim = len(value)
10+
try:
11+
self._value = struct.pack(f'>HH{dim}e', dim, 0, *value)
12+
except struct.error as e:
13+
raise ValueError('expected list[float]')
14+
elif isinstance(value, np.ndarray):
15+
if value.ndim != 1:
16+
raise ValueError('expected ndim to be 1')
17+
18+
# asarray still copies if same dtype
19+
if value.dtype != '>f2':
20+
value = np.asarray(value, dtype='>f2')
21+
22+
self._value = struct.pack('>HH', value.shape[0], 0) + value.tobytes()
23+
else:
24+
raise ValueError('expected list or ndarray')
2025

2126
def __repr__(self) -> str:
2227
return f'HalfVector({self.to_list()})'
2328

2429
def __eq__(self, other: object) -> bool:
2530
if isinstance(other, self.__class__):
26-
return np.array_equal(self.to_numpy(), other.to_numpy())
31+
return self.to_binary() == other.to_binary()
2732
return False
2833

2934
def dimensions(self) -> int:
30-
return len(self._value)
35+
dim, = struct.unpack_from('>H', self._value)
36+
return dim
3137

3238
def to_list(self) -> list[float]:
33-
return self._value.tolist()
39+
return list(struct.unpack_from(f'>{self.dimensions()}e', self._value[4:]))
3440

3541
def to_numpy(self) -> np.ndarray[tuple[int], np.dtype[np.float16]]:
3642
# TODO return native endian
37-
return self._value
43+
return np.frombuffer(self._value, dtype='>f2', count=self.dimensions(), offset=4)
3844

3945
def to_text(self) -> str:
40-
return '[' + ','.join([str(float(v)) for v in self._value]) + ']'
46+
return '[' + ','.join([str(v) for v in self.to_list()]) + ']'
4147

4248
def to_binary(self) -> bytes:
43-
return pack('>HH', self.dimensions(), 0) + self._value.tobytes()
49+
return self._value
4450

4551
@classmethod
4652
def from_text(cls, value: str) -> HalfVector:
4753
return cls([float(v) for v in value[1:-1].split(',')])
4854

4955
@classmethod
5056
def from_binary(cls, value: bytes) -> HalfVector:
51-
dim, unused = unpack_from('>HH', value)
52-
return cls(np.frombuffer(value, dtype='>f2', count=dim, offset=4))
57+
# TODO check dimensions/length and unused
58+
vec = cls.__new__(cls)
59+
vec._value = value
60+
return vec
5361

5462
@classmethod
5563
def _to_db(cls, value: object, dim: int | None = None) -> str | None:

pgvector/vector.py

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,63 @@
11
from __future__ import annotations
22
import numpy as np
3-
from struct import pack, unpack_from
3+
import struct
44

55

66
class Vector:
77
def __init__(self, value: list[float] | np.ndarray[tuple[int], np.dtype[np.floating]]) -> None:
8-
# asarray still copies if same dtype
9-
if not isinstance(value, np.ndarray) or value.dtype != '>f4':
10-
value = np.asarray(value, dtype='>f4')
11-
12-
# for mypy
13-
assert isinstance(value, np.ndarray)
14-
15-
if value.ndim != 1:
16-
raise ValueError('expected ndim to be 1')
17-
18-
# atleast_1d for ty
19-
self._value = np.atleast_1d(value)
8+
if isinstance(value, list):
9+
dim = len(value)
10+
try:
11+
self._value = struct.pack(f'>HH{dim}f', dim, 0, *value)
12+
except struct.error as e:
13+
raise ValueError('expected list[float]')
14+
elif isinstance(value, np.ndarray):
15+
if value.ndim != 1:
16+
raise ValueError('expected ndim to be 1')
17+
18+
# asarray still copies if same dtype
19+
if value.dtype != '>f4':
20+
value = np.asarray(value, dtype='>f4')
21+
22+
self._value = struct.pack('>HH', value.shape[0], 0) + value.tobytes()
23+
else:
24+
raise ValueError('expected list or ndarray')
2025

2126
def __repr__(self) -> str:
2227
return f'Vector({self.to_list()})'
2328

2429
def __eq__(self, other: object) -> bool:
2530
if isinstance(other, self.__class__):
26-
return np.array_equal(self.to_numpy(), other.to_numpy())
31+
return self.to_binary() == other.to_binary()
2732
return False
2833

2934
def dimensions(self) -> int:
30-
return len(self._value)
35+
dim, = struct.unpack_from('>H', self._value)
36+
return dim
3137

3238
def to_list(self) -> list[float]:
33-
return self._value.tolist()
39+
return list(struct.unpack_from(f'>{self.dimensions()}f', self._value[4:]))
3440

3541
def to_numpy(self) -> np.ndarray[tuple[int], np.dtype[np.float32]]:
3642
# TODO return native endian
37-
return self._value
43+
return np.frombuffer(self._value, dtype='>f4', count=self.dimensions(), offset=4)
3844

3945
def to_text(self) -> str:
40-
return '[' + ','.join([str(float(v)) for v in self._value]) + ']'
46+
return '[' + ','.join([str(v) for v in self.to_list()]) + ']'
4147

4248
def to_binary(self) -> bytes:
43-
return pack('>HH', self.dimensions(), 0) + self._value.tobytes()
49+
return self._value
4450

4551
@classmethod
4652
def from_text(cls, value: str) -> Vector:
4753
return cls([float(v) for v in value[1:-1].split(',')])
4854

4955
@classmethod
5056
def from_binary(cls, value: bytes) -> Vector:
51-
dim, unused = unpack_from('>HH', value)
52-
return cls(np.frombuffer(value, dtype='>f4', count=dim, offset=4))
57+
# TODO check dimensions/length and unused
58+
vec = cls.__new__(cls)
59+
vec._value = value
60+
return vec
5361

5462
@classmethod
5563
def _to_db(cls, value: object, dim: int | None = None) -> str | None:

tests/test_half_vector.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,28 +9,24 @@ def test_list(self):
99
assert HalfVector([1, 2, 3]).to_list() == [1, 2, 3]
1010

1111
def test_list_str(self):
12-
with pytest.raises(ValueError, match='could not convert string to float'):
12+
with pytest.raises(ValueError) as error:
1313
HalfVector([1, 'two', 3]) # ty: ignore[invalid-argument-type]
14+
assert str(error.value) == 'expected list[float]'
1415

1516
def test_ndarray(self):
1617
arr = np.array([1, 2, 3])
1718
assert HalfVector(arr).to_list() == [1, 2, 3]
1819
assert HalfVector(arr).to_numpy() is not arr
1920

20-
def test_ndarray_same_object(self):
21-
arr = np.array([1, 2, 3], dtype='>f2')
22-
assert HalfVector(arr).to_list() == [1, 2, 3]
23-
assert HalfVector(arr).to_numpy() is arr
24-
2521
def test_ndim_two(self):
2622
with pytest.raises(ValueError) as error:
2723
HalfVector([[1, 2], [3, 4]]) # ty: ignore[invalid-argument-type]
28-
assert str(error.value) == 'expected ndim to be 1'
24+
assert str(error.value) == 'expected list[float]'
2925

3026
def test_ndim_zero(self):
3127
with pytest.raises(ValueError) as error:
3228
HalfVector(1) # ty: ignore[invalid-argument-type]
33-
assert str(error.value) == 'expected ndim to be 1'
29+
assert str(error.value) == 'expected list or ndarray'
3430

3531
def test_repr(self):
3632
assert repr(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])'

tests/test_vector.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,28 +9,24 @@ def test_list(self):
99
assert Vector([1, 2, 3]).to_list() == [1, 2, 3]
1010

1111
def test_list_str(self):
12-
with pytest.raises(ValueError, match='could not convert string to float'):
12+
with pytest.raises(ValueError) as error:
1313
Vector([1, 'two', 3]) # ty: ignore[invalid-argument-type]
14+
assert str(error.value) == 'expected list[float]'
1415

1516
def test_ndarray(self):
1617
arr = np.array([1, 2, 3])
1718
assert Vector(arr).to_list() == [1, 2, 3]
1819
assert Vector(arr).to_numpy() is not arr
1920

20-
def test_ndarray_same_object(self):
21-
arr = np.array([1, 2, 3], dtype='>f4')
22-
assert Vector(arr).to_list() == [1, 2, 3]
23-
assert Vector(arr).to_numpy() is arr
24-
2521
def test_ndim_two(self):
2622
with pytest.raises(ValueError) as error:
2723
Vector([[1, 2], [3, 4]]) # ty: ignore[invalid-argument-type]
28-
assert str(error.value) == 'expected ndim to be 1'
24+
assert str(error.value) == 'expected list[float]'
2925

3026
def test_ndim_zero(self):
3127
with pytest.raises(ValueError) as error:
3228
Vector(1) # ty: ignore[invalid-argument-type]
33-
assert str(error.value) == 'expected ndim to be 1'
29+
assert str(error.value) == 'expected list or ndarray'
3430

3531
def test_repr(self):
3632
assert repr(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])'

0 commit comments

Comments
 (0)