Skip to content

Commit e882ba8

Browse files
committed
Support zstd blob decompression in Puffin
1 parent 41276a3 commit e882ba8

3 files changed

Lines changed: 31 additions & 1 deletion

File tree

pyiceberg/table/puffin.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# under the License.
1717
from typing import TYPE_CHECKING
1818

19+
import zstandard
1920
from pydantic import Field
2021

2122
from pyiceberg.typedef import IcebergBaseModel
@@ -68,7 +69,12 @@ def __init__(self, puffin: bytes) -> None:
6869
self._file_bytes = puffin
6970

7071
def get_blob_payload(self, blob: PuffinBlobMetadata) -> bytes:
71-
return self._file_bytes[blob.offset : blob.offset + blob.length]
72+
raw = self._file_bytes[blob.offset : blob.offset + blob.length]
73+
if blob.compression_codec is None:
74+
return raw
75+
if blob.compression_codec == "zstd":
76+
return zstandard.ZstdDecompressor().decompress(raw)
77+
raise ValueError(f"Unsupported compression codec: {blob.compression_codec!r}")
7278

7379
@deprecated(deprecated_in="0.12.0", removed_in="0.13.0", help_message="Use deletion_vectors_from_puffin_file(...) instead")
7480
def to_vector(self) -> dict[str, "pa.ChunkedArray"]:
417 Bytes
Binary file not shown.

tests/table/test_puffin.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,30 @@ def test_read_empty_uncompressed() -> None:
3333
assert pf.footer.properties == {}
3434

3535

36+
def test_read_compressed_zstd() -> None:
37+
puffin_bytes = _open_file("v1/sample-metric-data-compressed-zstd.bin")
38+
pf = PuffinFile(puffin_bytes)
39+
40+
assert pf.footer.properties == {"created-by": "Test 1234"}
41+
assert len(pf.footer.blobs) == 2
42+
43+
blob1 = pf.footer.blobs[0]
44+
assert blob1.type == "some-blob"
45+
assert blob1.fields == [1]
46+
assert blob1.snapshot_id == 2
47+
assert blob1.sequence_number == 1
48+
assert blob1.compression_codec == "zstd"
49+
assert pf.get_blob_payload(blob1) == b"abcdefghi"
50+
51+
blob2 = pf.footer.blobs[1]
52+
assert blob2.type == "some-other-blob"
53+
assert blob2.fields == [2]
54+
assert blob2.compression_codec == "zstd"
55+
assert pf.get_blob_payload(blob2) == (
56+
b"some blob \x00 binary data \xf0\x9f\xa4\xaf that is not very very very very very very long, is it?"
57+
)
58+
59+
3660
def test_read_two_blobs_uncompressed() -> None:
3761
puffin_bytes = _open_file("v1/sample-metric-data-uncompressed.bin")
3862
pf = PuffinFile(puffin_bytes)

0 commit comments

Comments
 (0)