Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/python-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ jobs:
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
with:
persist-credentials: false
submodules: recursive
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: ${{ matrix.python }}
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "iceberg-testing"]
path = iceberg-testing
url = https://github.com/sungwy/iceberg-testing.git
2 changes: 2 additions & 0 deletions dev/.rat-excludes
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
.github/**
dev/.rat-excludes
.gitmodules
iceberg-testing/**
uv.lock
.ruff_cache/**
.pytest_cache/**
Expand Down
1 change: 1 addition & 0 deletions iceberg-testing
Submodule iceberg-testing added at 1bd33d
45 changes: 45 additions & 0 deletions tests/conformance/_shared.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Shared helpers for the iceberg-testing conformance suite.

The cross-language fixtures are language-neutral: a static input plus the expected
result the spec fixes for it, pinned in the ``iceberg-testing/`` submodule. The
per-surface test modules walk them and run them against PyIceberg. Cases PyIceberg
does not satisfy yet are marked xfail in each module with a tracking issue; that
staged-adoption list is a consumer-side concern. The submodule must be checked out
(``git submodule update --init --recursive``); otherwise these tests are skipped.
"""

import json
import os
from typing import Any

import pytest

# Directory holding this helper module; the repository root sits two levels above it.
_HERE = os.path.dirname(__file__)

# Absolute path of the repository root.
ROOT = os.path.abspath(os.path.join(_HERE, "..", ".."))
# Table-spec fixtures inside the ``iceberg-testing`` submodule checkout.
FIXTURES = os.path.join(ROOT, "iceberg-testing", "table-spec")
# Evaluated once at import time: True only when the fixture directory exists.
FIXTURES_PRESENT = os.path.isdir(FIXTURES)

# Reusable pytest mark: skips whatever it is applied to when the fixture directory
# was not found at import time (FIXTURES_PRESENT is False).
skip_if_absent = pytest.mark.skipif(
    not FIXTURES_PRESENT,
    reason="iceberg-testing submodule not initialized (git submodule update --init --recursive)",
)


def load_jsonl(path: str) -> list[dict[str, Any]]:
    """Read a JSON Lines file and return one parsed value per non-blank line.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        The decoded JSON value of every line that is not empty or whitespace-only,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale default, so the
    # fixtures parse the same way on every operating system.
    with open(path, encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]
80 changes: 80 additions & 0 deletions tests/conformance/test_bucket_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Bucket-transform conformance: the Appendix B 32-bit hash that bucket[N] builds on."""

import os
import uuid as uuidmod
from decimal import Decimal
from typing import Any

import pytest
from _shared import FIXTURES, FIXTURES_PRESENT, load_jsonl, skip_if_absent

from pyiceberg.transforms import BucketTransform
from pyiceberg.types import PrimitiveType
from pyiceberg.utils.datetime import date_str_to_days, time_str_to_micros, timestamp_to_micros

# Module-level pytest mark: applies ``skip_if_absent`` to every test in this module.
pytestmark = skip_if_absent

# PyIceberg under-sizes the minimal two's-complement of byte-boundary decimals, so it
# hashes (and buckets) these to a different value than the spec fixes. Tracked in the
# open issue below; remove these entries when it lands and the cases XPASS.
# Keys are fixture case ids; values are the reason string attached to the xfail mark.
BUCKET_XFAIL = {
    "hash-decimal-neg-1.28": "minimal-byte decimal serialization differs from spec; apache/iceberg-python#3522",
    "hash-decimal-neg-327.68": "minimal-byte decimal serialization differs from spec; apache/iceberg-python#3522",
    "hash-decimal-neg-83886.08": "minimal-byte decimal serialization differs from spec; apache/iceberg-python#3522",
}
# Bucket count used by every case in this module.
_BUCKET_N = 2_000_003  # large prime; comparing bucket[N] avoids needing the raw hash API


def _physical_value(type_str: str, value: str) -> Any:
"""Convert a fixture's string value to the physical form PyIceberg buckets."""
if type_str in ("int", "long"):
return int(value)
if type_str.startswith("decimal"):
return Decimal(value)
if type_str == "date":
return date_str_to_days(value)
if type_str == "time":
return time_str_to_micros(value)
if type_str == "timestamp":
return timestamp_to_micros(value)
if type_str == "string":
return value
if type_str == "uuid":
return uuidmod.UUID(value)
if type_str.startswith("fixed") or type_str == "binary":
return bytes.fromhex(value)
raise ValueError(type_str)


def _bucket_cases() -> list[Any]:
    """Build the ``pytest.param`` list for the bucket-hash fixtures.

    Returns:
        One parameter per entry of ``transforms/bucket/cases.jsonl``, identified by
        the entry's ``id``. Entries whose id is listed in ``BUCKET_XFAIL`` carry a
        non-strict xfail mark with the recorded reason. The list is empty when the
        fixtures are not present.
    """
    if not FIXTURES_PRESENT:
        return []

    cases_path = os.path.join(FIXTURES, "transforms", "bucket", "cases.jsonl")
    params: list[Any] = []
    for entry in load_jsonl(cases_path):
        case_id = entry["id"]
        if case_id in BUCKET_XFAIL:
            marks = [pytest.mark.xfail(reason=BUCKET_XFAIL[case_id], strict=False)]
        else:
            marks = []
        params.append(pytest.param(entry, id=case_id, marks=marks))
    return params


@pytest.mark.parametrize("case", _bucket_cases())
def test_bucket_hash(case: dict[str, Any]) -> None:
    """Check that bucketing the fixture value agrees with the fixture's expected hash.

    The fixture supplies the 32-bit hash; the expected bucket is derived from it by
    clearing the sign bit and reducing modulo ``_BUCKET_N``.
    """
    type_str = case["type"]
    source_type = PrimitiveType.model_validate(type_str)
    apply_bucket = BucketTransform(num_buckets=_BUCKET_N).transform(source_type)
    actual = apply_bucket(_physical_value(type_str, case["value"]))
    # 0x7FFFFFFF keeps the low 31 bits, so the value fed to the modulo is non-negative.
    expected = (case["hash"] & 0x7FFFFFFF) % _BUCKET_N
    assert actual == expected
57 changes: 57 additions & 0 deletions tests/conformance/test_delete_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Delete-file conformance: decode positional and equality delete files by field-id."""

import glob
import json
import os
from typing import Any

import pyarrow.parquet as pq
import pytest
from _shared import FIXTURES, skip_if_absent

# Module-level pytest mark: applies ``skip_if_absent`` to every test in this module.
pytestmark = skip_if_absent


def _encode(value: Any) -> Any:
if value is None:
return None
if isinstance(value, int):
return str(value)
if isinstance(value, bytes):
return value.hex().upper()
return value


def _decode(path: str) -> list[dict[str, Any]]:
    """Read a Parquet file and return its rows keyed by Parquet field id.

    Args:
        path: Location of the Parquet file to read.

    Returns:
        One dict per row, in row order. Each key is the column's
        ``PARQUET:field_id`` rendered as a string; each value is the cell passed
        through ``_encode``.

    Raises:
        KeyError: If a field in the schema has no ``PARQUET:field_id`` metadata.
    """
    table = pq.read_table(path)

    # Map each column name to the numeric field id stored in its Arrow field metadata.
    field_ids: dict[str, int] = {}
    for field in table.schema:
        metadata = field.metadata or {}
        field_ids[field.name] = int(metadata[b"PARQUET:field_id"])

    names = table.column_names
    columns = {name: table.column(name).to_pylist() for name in names}

    rows: list[dict[str, Any]] = []
    for index in range(table.num_rows):
        rows.append({str(field_ids[name]): _encode(columns[name][index]) for name in names})
    return rows


@pytest.mark.parametrize(
    "sidecar",
    sorted(glob.glob(os.path.join(FIXTURES, "delete-formats", "*", "*.parquet.expected.json"))),
    ids=lambda p: os.path.basename(os.path.dirname(p)),
)
def test_delete_file_decode(sidecar: str) -> None:
    """Decode a delete file and compare it with the rows recorded in its sidecar.

    Each ``*.parquet.expected.json`` sidecar names the delete file next to it
    (``delete-file``) and lists the rows it must decode to (``decoded-rows``). The
    test id is the name of the directory containing the sidecar.
    """
    # Read the sidecar as UTF-8 explicitly rather than with the locale default
    # encoding, so the JSON parses identically on every platform.
    with open(sidecar, encoding="utf-8") as handle:
        expected = json.load(handle)
    data_file = os.path.join(os.path.dirname(sidecar), expected["delete-file"])
    assert _decode(data_file) == expected["decoded-rows"]
76 changes: 76 additions & 0 deletions tests/conformance/test_type_strings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Type-string conformance: parse the input and re-serialize to the canonical form."""

import glob
import os
from typing import Any

import pytest
from _shared import FIXTURES, FIXTURES_PRESENT, load_jsonl, skip_if_absent

from pyiceberg.exceptions import ValidationError
from pyiceberg.types import DecimalType, FixedType, PrimitiveType

# Module-level pytest mark: applies ``skip_if_absent`` to every test in this module.
pytestmark = skip_if_absent

# Cases PyIceberg does not satisfy yet (consumer-side staged-adoption list). Each
# reason links the open issue that tracks the divergence; when it lands the case
# passes (reported XPASS, since these xfails are non-strict) and the entry here can
# be removed. fixed[...] whitespace has no dedicated issue; it is the same
# type-string-whitespace class as the decimal case.
# Keys are fixture case ids; values are the reason string attached to the xfail mark.
TYPE_XFAIL = {
    "decimal-space-around-params": "rejects optional whitespace around decimal params; apache/iceberg#16798",
    "fixed-space-around": "rejects whitespace inside fixed[...] brackets; type-string whitespace, cf. apache/iceberg#16798",
    "geometry-default": "geometry type not supported yet; apache/iceberg-python#3004",
    "geometry-unquoted-crs": "geometry type not supported yet; apache/iceberg-python#3004",
    "decimal-precision-over-max": "does not reject decimal precision > 38 (spec: precision must be 38 or less; Java rejects it)",
}


def _type_fields(parsed: PrimitiveType) -> dict[str, int]:
    """Collect the numeric parameters of a parsed type for comparison with a fixture.

    Args:
        parsed: The type object produced by parsing a type string.

    Returns:
        ``precision`` and ``scale`` for a ``DecimalType``, ``length`` (read from
        ``parsed.root``) for a ``FixedType``, and an empty dict for any other type.
    """
    fields: dict[str, int] = {}
    if isinstance(parsed, DecimalType):
        fields["precision"] = parsed.precision
        fields["scale"] = parsed.scale
    elif isinstance(parsed, FixedType):
        fields["length"] = parsed.root
    return fields


def _type_cases(*, accept: bool) -> list[Any]:
    """Build the ``pytest.param`` list for the type-string fixtures.

    Args:
        accept: Selects which fixture entries to return: those whose ``accept``
            field is ``True`` or those whose field is ``False``.

    Returns:
        One parameter per selected entry across every ``types/*/cases.jsonl`` file
        (files visited in sorted path order), identified by the entry's ``id``.
        Entries whose id is listed in ``TYPE_XFAIL`` carry a non-strict xfail mark
        with the recorded reason. The list is empty when the fixtures are not
        present.
    """
    if not FIXTURES_PRESENT:
        return []

    pattern = os.path.join(FIXTURES, "types", "*", "cases.jsonl")
    params: list[Any] = []
    for cases_file in sorted(glob.glob(pattern)):
        for entry in load_jsonl(cases_file):
            # Identity comparison against the requested boolean, as decoded from JSON.
            if entry["accept"] is accept:
                case_id = entry["id"]
                if case_id in TYPE_XFAIL:
                    marks = [pytest.mark.xfail(reason=TYPE_XFAIL[case_id], strict=False)]
                else:
                    marks = []
                params.append(pytest.param(entry, id=case_id, marks=marks))
    return params


@pytest.mark.parametrize("case", _type_cases(accept=True))
def test_type_string_accepted(case: dict[str, Any]) -> None:
    """Parse an accepted type string and check its parameters and canonical form."""
    type_obj = PrimitiveType.model_validate(case["input"])

    # First the parsed parameters, then the re-serialized string.
    extracted = _type_fields(type_obj)
    assert extracted == case["parsed"]

    canonical = str(type_obj)
    assert canonical == case["canonical"]


@pytest.mark.parametrize("case", _type_cases(accept=False))
def test_type_string_rejected(case: dict[str, Any]) -> None:
    """Check that parsing a rejected type string raises ``ValidationError``."""
    bad_input = case["input"]
    with pytest.raises(ValidationError):
        PrimitiveType.model_validate(bad_input)