From 125cbc903e19fdbad7b9775780ca1e6f4fc42209 Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Fri, 26 Jun 2026 17:03:16 -0400 Subject: [PATCH] Add iceberg-testing conformance suite (POC) Pin sungwy/iceberg-testing as a git submodule and add a pytest conformance suite that walks its language-neutral fixtures against PyIceberg: - test_type_strings.py parse + exact re-serialize (canonical type strings) - test_bucket_hash.py the Appendix B 32-bit hash Known-Answer-Test - test_delete_formats.py positional/equality delete-file decode by field-id - _shared.py fixtures path, submodule-absent skip marker, load_jsonl Cases PyIceberg does not yet satisfy are xfail with the tracking issue (apache/iceberg#16798, apache/iceberg-python#3004, apache/iceberg-python#3522); removing an entry when it lands shows up as an XPASS. The lint-and-unit-test CI job checks out the submodule so the suite runs, and the tests skip cleanly when it is absent. .gitmodules and the submodule are excluded from the RAT license check. 19 pass, 6 xfail. --- .github/workflows/python-ci.yml | 1 + .gitmodules | 3 + dev/.rat-excludes | 2 + iceberg-testing | 1 + tests/conformance/_shared.py | 45 +++++++++++++ tests/conformance/test_bucket_hash.py | 80 ++++++++++++++++++++++++ tests/conformance/test_delete_formats.py | 57 +++++++++++++++++ tests/conformance/test_type_strings.py | 76 ++++++++++++++++++++++ 8 files changed, 265 insertions(+) create mode 100644 .gitmodules create mode 160000 iceberg-testing create mode 100644 tests/conformance/_shared.py create mode 100644 tests/conformance/test_bucket_hash.py create mode 100644 tests/conformance/test_delete_formats.py create mode 100644 tests/conformance/test_type_strings.py diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 537fe540e7..df97444a17 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -58,6 +58,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: persist-credentials: false + submodules: recursive - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: ${{ matrix.python }} diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..8b7b6e8b71 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "iceberg-testing"] + path = iceberg-testing + url = https://github.com/sungwy/iceberg-testing.git diff --git a/dev/.rat-excludes b/dev/.rat-excludes index 485579eff4..633f20e102 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -1,5 +1,7 @@ .github/** dev/.rat-excludes +.gitmodules +iceberg-testing/** uv.lock .ruff_cache/** .pytest_cache/** diff --git a/iceberg-testing b/iceberg-testing new file mode 160000 index 0000000000..1bd33d3b34 --- /dev/null +++ b/iceberg-testing @@ -0,0 +1 @@ +Subproject commit 1bd33d3b3410d214ab942b704b35ed0cac44d951 diff --git a/tests/conformance/_shared.py b/tests/conformance/_shared.py new file mode 100644 index 0000000000..0c28ae545d --- /dev/null +++ b/tests/conformance/_shared.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Shared helpers for the iceberg-testing conformance suite. + +The cross-language fixtures are language-neutral: a static input plus the expected +result the spec fixes for it, pinned in the ``iceberg-testing/`` submodule. The +per-surface test modules walk them and run them against PyIceberg. Cases PyIceberg +does not satisfy yet are marked xfail in each module with a tracking issue; that +staged-adoption list is a consumer-side concern. The submodule must be checked out +(``git submodule update --init``); otherwise these tests skip. +""" + +import json +import os +from typing import Any + +import pytest + +ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +FIXTURES = os.path.join(ROOT, "iceberg-testing", "table-spec") +FIXTURES_PRESENT = os.path.isdir(FIXTURES) + +skip_if_absent = pytest.mark.skipif( + not FIXTURES_PRESENT, + reason="iceberg-testing submodule not initialized (git submodule update --init --recursive)", +) + + +def load_jsonl(path: str) -> list[dict[str, Any]]: + with open(path) as handle: + return [json.loads(line) for line in handle if line.strip()] diff --git a/tests/conformance/test_bucket_hash.py b/tests/conformance/test_bucket_hash.py new file mode 100644 index 0000000000..b26824de11 --- /dev/null +++ b/tests/conformance/test_bucket_hash.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Bucket-transform conformance: the Appendix B 32-bit hash that bucket[N] builds on.""" + +import os +import uuid as uuidmod +from decimal import Decimal +from typing import Any + +import pytest +from _shared import FIXTURES, FIXTURES_PRESENT, load_jsonl, skip_if_absent + +from pyiceberg.transforms import BucketTransform +from pyiceberg.types import PrimitiveType +from pyiceberg.utils.datetime import date_str_to_days, time_str_to_micros, timestamp_to_micros + +pytestmark = skip_if_absent + +# PyIceberg under-sizes the minimal two's-complement of byte-boundary decimals, so it +# hashes (and buckets) these to a different value than the spec fixes. Tracked in the +# open issue below; remove these entries when it lands and the cases XPASS. +BUCKET_XFAIL = { + "hash-decimal-neg-1.28": "minimal-byte decimal serialization differs from spec; apache/iceberg-python#3522", + "hash-decimal-neg-327.68": "minimal-byte decimal serialization differs from spec; apache/iceberg-python#3522", + "hash-decimal-neg-83886.08": "minimal-byte decimal serialization differs from spec; apache/iceberg-python#3522", +} +_BUCKET_N = 2_000_003 # large prime; comparing bucket[N] avoids needing the raw hash API + + +def _physical_value(type_str: str, value: str) -> Any: + """Convert a fixture's string value to the physical form PyIceberg buckets.""" + if type_str in ("int", "long"): + return int(value) + if type_str.startswith("decimal"): + return Decimal(value) + if type_str == "date": + return date_str_to_days(value) + if type_str == "time": + return time_str_to_micros(value) + if type_str == "timestamp": + return timestamp_to_micros(value) + if type_str == "string": + return value + if type_str == "uuid": + return uuidmod.UUID(value) + if type_str.startswith("fixed") or type_str == "binary": + return bytes.fromhex(value) + raise ValueError(type_str) + + +def _bucket_cases() -> list[Any]: + if not FIXTURES_PRESENT: + return [] + out: list[Any] = [] + for case in load_jsonl(os.path.join(FIXTURES, "transforms", "bucket", "cases.jsonl")): + marks = [pytest.mark.xfail(reason=BUCKET_XFAIL[case["id"]], strict=False)] if case["id"] in BUCKET_XFAIL else [] + out.append(pytest.param(case, id=case["id"], marks=marks)) + return out + + +@pytest.mark.parametrize("case", _bucket_cases()) +def test_bucket_hash(case: dict[str, Any]) -> None: + source_type = PrimitiveType.model_validate(case["type"]) + bucket = BucketTransform(num_buckets=_BUCKET_N).transform(source_type) + got = bucket(_physical_value(case["type"], case["value"])) + assert got == (case["hash"] & 0x7FFFFFFF) % _BUCKET_N diff --git a/tests/conformance/test_delete_formats.py b/tests/conformance/test_delete_formats.py new file mode 100644 index 0000000000..1fb0500418 --- /dev/null +++ b/tests/conformance/test_delete_formats.py @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Delete-file conformance: decode positional and equality delete files by field-id.""" + +import glob +import json +import os +from typing import Any + +import pyarrow.parquet as pq +import pytest +from _shared import FIXTURES, skip_if_absent + +pytestmark = skip_if_absent + + +def _encode(value: Any) -> Any: + if value is None: + return None + if isinstance(value, int): + return str(value) + if isinstance(value, bytes): + return value.hex().upper() + return value + + +def _decode(path: str) -> list[dict[str, Any]]: + table = pq.read_table(path) + ids = {field.name: int((field.metadata or {})[b"PARQUET:field_id"]) for field in table.schema} + cols = {name: table.column(name).to_pylist() for name in table.column_names} + return [{str(ids[name]): _encode(cols[name][row]) for name in table.column_names} for row in range(table.num_rows)] + + +@pytest.mark.parametrize( + "sidecar", + sorted(glob.glob(os.path.join(FIXTURES, "delete-formats", "*", "*.parquet.expected.json"))), + ids=lambda p: os.path.basename(os.path.dirname(p)), +) +def test_delete_file_decode(sidecar: str) -> None: + with open(sidecar) as handle: + expected = json.load(handle) + data_file = os.path.join(os.path.dirname(sidecar), expected["delete-file"]) + assert _decode(data_file) == expected["decoded-rows"] diff --git a/tests/conformance/test_type_strings.py b/tests/conformance/test_type_strings.py new file mode 100644 index 0000000000..874a73f89d --- /dev/null +++ b/tests/conformance/test_type_strings.py @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Type-string conformance: parse the input and re-serialize to the canonical form.""" + +import glob +import os +from typing import Any + +import pytest +from _shared import FIXTURES, FIXTURES_PRESENT, load_jsonl, skip_if_absent + +from pyiceberg.exceptions import ValidationError +from pyiceberg.types import DecimalType, FixedType, PrimitiveType + +pytestmark = skip_if_absent + +# Cases PyIceberg does not satisfy yet (consumer-side staged-adoption list). Each +# reason links the open issue that tracks the divergence; when it lands the case +# passes (reported XPASS, since these xfails are non-strict) and the entry here can +# be removed. fixed[...] whitespace has no dedicated issue; it is the same +# type-string-whitespace class as the decimal case. +TYPE_XFAIL = { + "decimal-space-around-params": "rejects optional whitespace around decimal params; apache/iceberg#16798", + "fixed-space-around": "rejects whitespace inside fixed[...] brackets; type-string whitespace, cf. apache/iceberg#16798", + "geometry-default": "geometry type not supported yet; apache/iceberg-python#3004", + "geometry-unquoted-crs": "geometry type not supported yet; apache/iceberg-python#3004", + "decimal-precision-over-max": "does not reject decimal precision > 38 (spec: precision must be 38 or less; Java rejects it)", +} + + +def _type_fields(parsed: PrimitiveType) -> dict[str, int]: + if isinstance(parsed, DecimalType): + return {"precision": parsed.precision, "scale": parsed.scale} + if isinstance(parsed, FixedType): + return {"length": parsed.root} + return {} + + +def _type_cases(*, accept: bool) -> list[Any]: + if not FIXTURES_PRESENT: + return [] + out: list[Any] = [] + for path in sorted(glob.glob(os.path.join(FIXTURES, "types", "*", "cases.jsonl"))): + for case in load_jsonl(path): + if case["accept"] is not accept: + continue + marks = [pytest.mark.xfail(reason=TYPE_XFAIL[case["id"]], strict=False)] if case["id"] in TYPE_XFAIL else [] + out.append(pytest.param(case, id=case["id"], marks=marks)) + return out + + +@pytest.mark.parametrize("case", _type_cases(accept=True)) +def test_type_string_accepted(case: dict[str, Any]) -> None: + parsed = PrimitiveType.model_validate(case["input"]) + assert _type_fields(parsed) == case["parsed"] + assert str(parsed) == case["canonical"] + + +@pytest.mark.parametrize("case", _type_cases(accept=False)) +def test_type_string_rejected(case: dict[str, Any]) -> None: + with pytest.raises(ValidationError): + PrimitiveType.model_validate(case["input"])