diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 537fe540e7..df97444a17 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -58,6 +58,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: persist-credentials: false + submodules: recursive - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: ${{ matrix.python }} diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..8b7b6e8b71 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "iceberg-testing"] + path = iceberg-testing + url = https://github.com/sungwy/iceberg-testing.git diff --git a/dev/.rat-excludes b/dev/.rat-excludes index 485579eff4..633f20e102 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -1,5 +1,7 @@ .github/** dev/.rat-excludes +.gitmodules +iceberg-testing/** uv.lock .ruff_cache/** .pytest_cache/** diff --git a/iceberg-testing b/iceberg-testing new file mode 160000 index 0000000000..1bd33d3b34 --- /dev/null +++ b/iceberg-testing @@ -0,0 +1 @@ +Subproject commit 1bd33d3b3410d214ab942b704b35ed0cac44d951 diff --git a/tests/conformance/_shared.py b/tests/conformance/_shared.py new file mode 100644 index 0000000000..0c28ae545d --- /dev/null +++ b/tests/conformance/_shared.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Shared helpers for the iceberg-testing conformance suite.

The cross-language fixtures are language-neutral: a static input plus the expected
result the spec fixes for it, pinned in the ``iceberg-testing/`` submodule. The
per-surface test modules walk them and run them against PyIceberg. Cases PyIceberg
does not satisfy yet are marked xfail in each module with a tracking issue; that
staged-adoption list is a consumer-side concern. The submodule must be checked out
(``git submodule update --init --recursive``); otherwise these tests skip.
"""

import json
import os
from typing import Any

import pytest

# Repository root, resolved two directories above this file.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
# Directory inside the submodule checkout that holds the table-spec fixtures.
FIXTURES = os.path.join(ROOT, "iceberg-testing", "table-spec")
# Evaluated once at import time; False when the submodule has not been checked out.
FIXTURES_PRESENT = os.path.isdir(FIXTURES)

# Module-level mark the conformance test modules assign to ``pytestmark``.
skip_if_absent = pytest.mark.skipif(
    not FIXTURES_PRESENT,
    reason="iceberg-testing submodule not initialized (git submodule update --init --recursive)",
)


def load_jsonl(path: str) -> list[dict[str, Any]]:
    """Read a JSON Lines file and return one decoded object per non-blank line.

    The file is always decoded as UTF-8 rather than with the platform's locale
    encoding, so non-ASCII fixture values load identically on every platform.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        The decoded objects in file order; empty or whitespace-only lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


# diff --git a/tests/conformance/test_bucket_hash.py b/tests/conformance/test_bucket_hash.py
# new file mode 100644
# index 0000000000..b26824de11
# --- /dev/null
# +++ b/tests/conformance/test_bucket_hash.py
# @@ -0,0 +1,80 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Bucket-transform conformance: the Appendix B 32-bit hash that bucket[N] builds on.""" + +import os +import uuid as uuidmod +from decimal import Decimal +from typing import Any + +import pytest +from _shared import FIXTURES, FIXTURES_PRESENT, load_jsonl, skip_if_absent + +from pyiceberg.transforms import BucketTransform +from pyiceberg.types import PrimitiveType +from pyiceberg.utils.datetime import date_str_to_days, time_str_to_micros, timestamp_to_micros + +pytestmark = skip_if_absent + +# PyIceberg under-sizes the minimal two's-complement of byte-boundary decimals, so it +# hashes (and buckets) these to a different value than the spec fixes. Tracked in the +# open issue below; remove these entries when it lands and the cases XPASS. 
BUCKET_XFAIL = {
    "hash-decimal-neg-1.28": "minimal-byte decimal serialization differs from spec; apache/iceberg-python#3522",
    "hash-decimal-neg-327.68": "minimal-byte decimal serialization differs from spec; apache/iceberg-python#3522",
    "hash-decimal-neg-83886.08": "minimal-byte decimal serialization differs from spec; apache/iceberg-python#3522",
}
_BUCKET_N = 2_000_003  # large prime; comparing bucket[N] avoids needing the raw hash API


def _physical_value(type_str: str, value: str) -> Any:
    """Turn a fixture's string-encoded value into the object handed to the bucket function.

    Args:
        type_str: The fixture's type string, e.g. ``"int"``, ``"decimal(9, 2)"`` or ``"fixed[4]"``.
        value: The fixture's value, always carried as a string.

    Raises:
        ValueError: If ``type_str`` is not one of the handled types.
    """
    # Types matched by their exact name.
    by_name = {
        "int": int,
        "long": int,
        "date": date_str_to_days,
        "time": time_str_to_micros,
        "timestamp": timestamp_to_micros,
        "string": lambda text: text,
        "uuid": uuidmod.UUID,
        "binary": bytes.fromhex,
    }
    convert = by_name.get(type_str)
    if convert is None:
        # Parameterized types are matched by prefix only.
        if type_str.startswith("decimal"):
            convert = Decimal
        elif type_str.startswith("fixed"):
            convert = bytes.fromhex
        else:
            raise ValueError(type_str)
    return convert(value)


def _bucket_cases() -> list[Any]:
    """Build the parametrization list from the bucket fixture file.

    Returns an empty list when the fixtures are not checked out. Cases whose id is
    listed in ``BUCKET_XFAIL`` carry a non-strict xfail mark with the recorded reason.
    """
    params: list[Any] = []
    if FIXTURES_PRESENT:
        fixture_file = os.path.join(FIXTURES, "transforms", "bucket", "cases.jsonl")
        for case in load_jsonl(fixture_file):
            case_id = case["id"]
            marks = []
            if case_id in BUCKET_XFAIL:
                marks.append(pytest.mark.xfail(reason=BUCKET_XFAIL[case_id], strict=False))
            params.append(pytest.param(case, id=case_id, marks=marks))
    return params


@pytest.mark.parametrize("case", _bucket_cases())
def test_bucket_hash(case: dict[str, Any]) -> None:
    """Check PyIceberg's bucket for the case against the one derived from the fixture hash."""
    source_type = PrimitiveType.model_validate(case["type"])
    bucket_fn = BucketTransform(num_buckets=_BUCKET_N).transform(source_type)
    actual = bucket_fn(_physical_value(case["type"], case["value"]))
    # The fixture stores the raw hash; clear the sign bit, then reduce modulo N.
    expected = (case["hash"] & 0x7FFFFFFF) % _BUCKET_N
    assert actual == expected


# diff --git a/tests/conformance/test_delete_formats.py b/tests/conformance/test_delete_formats.py
# new file mode 100644
# index 0000000000..1fb0500418
# --- /dev/null
# +++ b/tests/conformance/test_delete_formats.py
# @@ -0,0 +1,57 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Delete-file conformance: decode positional and equality delete files by field-id."""

import glob
import json
import os
from typing import Any

import pyarrow.parquet as pq
import pytest
from _shared import FIXTURES, skip_if_absent

pytestmark = skip_if_absent


def _encode(value: Any) -> Any:
    """Render one decoded cell in the form compared against the sidecar's expected rows.

    ``None`` stays ``None``, integers become decimal strings, bytes become upper-case
    hex, and every other value is returned unchanged.
    """
    if value is None:
        return None
    # NOTE(review): bool is a subclass of int, so a boolean cell would be rendered as
    # "True"/"False" here -- confirm that matches the fixture encoding if one appears.
    if isinstance(value, int):
        return str(value)
    if isinstance(value, bytes):
        return value.hex().upper()
    return value


def _decode(path: str) -> list[dict[str, Any]]:
    """Read a Parquet delete file into rows keyed by field id instead of column name.

    Args:
        path: Location of the Parquet file.

    Returns:
        One dict per row, mapping the stringified ``PARQUET:field_id`` of each column
        to the ``_encode``-d cell value.

    Raises:
        KeyError: If a column carries no ``PARQUET:field_id`` metadata; the message
            names the file and the column.
    """
    table = pq.read_table(path)
    ids: dict[str, str] = {}
    for field in table.schema:
        metadata = field.metadata or {}
        try:
            ids[field.name] = str(int(metadata[b"PARQUET:field_id"]))
        except KeyError:
            # Same exception type as a plain lookup, but says which column of which file.
            raise KeyError(f"{path}: column {field.name!r} has no PARQUET:field_id metadata") from None
    columns = {name: table.column(name).to_pylist() for name in table.column_names}
    return [{ids[name]: _encode(values[row]) for name, values in columns.items()} for row in range(table.num_rows)]


@pytest.mark.parametrize(
    "sidecar",
    sorted(glob.glob(os.path.join(FIXTURES, "delete-formats", "*", "*.parquet.expected.json"))),
    ids=lambda p: os.path.basename(os.path.dirname(p)),
)
def test_delete_file_decode(sidecar: str) -> None:
    """Decode the delete file named by the sidecar and compare it with the expected rows."""
    # The sidecar is JSON, so read it as UTF-8 regardless of the platform locale.
    with open(sidecar, encoding="utf-8") as handle:
        expected = json.load(handle)
    # The sidecar names its delete file relative to its own directory.
    data_file = os.path.join(os.path.dirname(sidecar), expected["delete-file"])
    assert _decode(data_file) == expected["decoded-rows"]


# diff --git a/tests/conformance/test_type_strings.py b/tests/conformance/test_type_strings.py
# new file mode 100644
# index 0000000000..874a73f89d
# --- /dev/null
# +++ b/tests/conformance/test_type_strings.py
# @@ -0,0 +1,76 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Type-string conformance: parse the input and re-serialize to the canonical form."""

import glob
import os
from typing import Any

import pytest
from _shared import FIXTURES, FIXTURES_PRESENT, load_jsonl, skip_if_absent

from pyiceberg.exceptions import ValidationError
from pyiceberg.types import DecimalType, FixedType, PrimitiveType

pytestmark = skip_if_absent

# Cases PyIceberg does not satisfy yet (consumer-side staged-adoption list). Where a
# tracking issue exists, the reason links it; when the fix lands the case passes
# (reported XPASS, since these xfails are non-strict) and the entry here can be
# removed. fixed[...] whitespace has no dedicated issue; it is the same
# type-string-whitespace class as the decimal case. decimal-precision-over-max has
# no tracking issue linked yet.
TYPE_XFAIL = {
    "decimal-space-around-params": "rejects optional whitespace around decimal params; apache/iceberg#16798",
    "fixed-space-around": "rejects whitespace inside fixed[...] brackets; type-string whitespace, cf. apache/iceberg#16798",
    "geometry-default": "geometry type not supported yet; apache/iceberg-python#3004",
    "geometry-unquoted-crs": "geometry type not supported yet; apache/iceberg-python#3004",
    "decimal-precision-over-max": "does not reject decimal precision > 38 (spec: precision must be 38 or less; Java rejects it)",
}


def _type_fields(parsed: PrimitiveType) -> dict[str, int]:
    """Collect the numeric parameters of a parsed type for comparison with the fixture.

    Decimal types yield ``precision`` and ``scale``, fixed types yield ``length``, and
    every other type yields an empty dict.
    """
    fields: dict[str, int] = {}
    if isinstance(parsed, DecimalType):
        fields["precision"] = parsed.precision
        fields["scale"] = parsed.scale
    elif isinstance(parsed, FixedType):
        fields["length"] = parsed.root
    return fields


def _type_cases(*, accept: bool) -> list[Any]:
    """Build the parametrization list from every ``types/*/cases.jsonl`` fixture file.

    Args:
        accept: Keep only the cases whose ``accept`` flag equals this value.

    Returns:
        One ``pytest.param`` per selected case, in sorted file order; cases listed in
        ``TYPE_XFAIL`` carry a non-strict xfail mark. Empty when the fixtures are not
        checked out.
    """
    params: list[Any] = []
    if FIXTURES_PRESENT:
        pattern = os.path.join(FIXTURES, "types", "*", "cases.jsonl")
        for fixture_file in sorted(glob.glob(pattern)):
            for case in load_jsonl(fixture_file):
                if case["accept"] is accept:
                    case_id = case["id"]
                    marks = []
                    if case_id in TYPE_XFAIL:
                        marks.append(pytest.mark.xfail(reason=TYPE_XFAIL[case_id], strict=False))
                    params.append(pytest.param(case, id=case_id, marks=marks))
    return params


@pytest.mark.parametrize("case", _type_cases(accept=True))
def test_type_string_accepted(case: dict[str, Any]) -> None:
    """An accepted input parses to the expected parameters and canonical string."""
    parsed_type = PrimitiveType.model_validate(case["input"])
    assert _type_fields(parsed_type) == case["parsed"]
    assert str(parsed_type) == case["canonical"]


@pytest.mark.parametrize("case", _type_cases(accept=False))
def test_type_string_rejected(case: dict[str, Any]) -> None:
    """A rejected input fails validation."""
    with pytest.raises(ValidationError):
        PrimitiveType.model_validate(case["input"])