From 1eccfc1dc880e4b2e297565aae157d3397895e45 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov <32552679+zvonand@users.noreply.github.com> Date: Tue, 30 Jun 2026 22:52:13 +0200 Subject: [PATCH] Fix Iceberg read optimization returning NULLs for stats-less manifests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an Iceberg manifest carries no per-column statistics, the parsed `DataFileMetaInfo::columns_info` is empty. The read optimization in `StorageObjectStorageSource::createReader` misread this as "every requested column is absent from the file": it replaced each nullable column with a constant `NULL` and set `need_only_count`, so the reader returned correct row counts but all-NULL values — silent data loss. Gate the absent-column-to-NULL loop on a non-empty `columns_info` so that stats-less manifests fall through to the regular reader, which reads present columns normally and resolves schema-evolution-absent columns via `IcebergMetadata::getInitialSchemaByPath`. Affects `icebergLocal`, `icebergS3`, `icebergAzure`, `icebergHDFS` and their `*Cluster` variants. Antalya-only, introduced by #1069. Add stateless test `04302_iceberg_read_optimization_no_column_stats` with a checked-in stats-less Iceberg fixture and a `generate.py` that reproduces it. C++ change taken from https://github.com/Altinity/ClickHouse/pull/1895 Closes #1545 Co-Authored-By: Claude Opus 4.8 --- .../StorageObjectStorageSource.cpp | 59 +++---- ...ead_optimization_no_column_stats.reference | 3 + ...berg_read_optimization_no_column_stats.sql | 7 + .../iceberg_no_column_stats/README.md | 29 ++++ ...d4e713-69c8-49f9-ab8e-f887af4bcecb.parquet | Bin 0 -> 1346 bytes .../iceberg_no_column_stats/generate.py | 145 ++++++++++++++++++ ...-1de2-40e5-8774-255dfdff698c.metadata.json | 1 + ...d4e713-69c8-49f9-ab8e-f887af4bcecb-m0.avro | Bin 0 -> 4303 bytes ...-39d4e713-69c8-49f9-ab8e-f887af4bcecb.avro | Bin 0 -> 1772 bytes 9 files changed, 216 insertions(+), 28 deletions(-) create mode 100644 tests/queries/0_stateless/04302_iceberg_read_optimization_no_column_stats.reference create mode 100644 tests/queries/0_stateless/04302_iceberg_read_optimization_no_column_stats.sql create mode 100644 tests/queries/0_stateless/data_minio/iceberg_no_column_stats/README.md create mode 100644 tests/queries/0_stateless/data_minio/iceberg_no_column_stats/data/00000-0-39d4e713-69c8-49f9-ab8e-f887af4bcecb.parquet create mode 100644 tests/queries/0_stateless/data_minio/iceberg_no_column_stats/generate.py create mode 100644 tests/queries/0_stateless/data_minio/iceberg_no_column_stats/metadata/00001-1347065c-1de2-40e5-8774-255dfdff698c.metadata.json create mode 100644 tests/queries/0_stateless/data_minio/iceberg_no_column_stats/metadata/39d4e713-69c8-49f9-ab8e-f887af4bcecb-m0.avro create mode 100644 tests/queries/0_stateless/data_minio/iceberg_no_column_stats/metadata/snap-7564025723254944482-0-39d4e713-69c8-49f9-ab8e-f887af4bcecb.avro diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 332c4bcd09cf..e881bdfeaf21 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -905,40 +905,43 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade } } } - for (const auto & column : requested_columns_list) + if (!file_meta_data.value()->columns_info.empty()) { - const auto & column_name = column.first; + for (const auto & column : requested_columns_list) + { + const auto & column_name = column.first; - if (file_meta_data.value()->columns_info.contains(column_name)) - continue; + if (file_meta_data.value()->columns_info.contains(column_name)) + continue; - if (!column.second.second.type->isNullable()) - continue; + if (!column.second.second.type->isNullable()) + continue; - /// With View over Iceberg table we have someting like 'materialize(time)' as column_name - /// Simple cheap check - if (column_name.starts_with("materialize(") && column_name.ends_with(")")) - continue; + /// With View over Iceberg table we have someting like 'materialize(time)' as column_name + /// Simple cheap check + if (column_name.starts_with("materialize(") && column_name.ends_with(")")) + continue; - /// Skip columns produced by prewhere or row-level filter expressions — - /// they are computed at read time, not stored in the file. - if (format_filter_info - && ((format_filter_info->prewhere_info && column_name == format_filter_info->prewhere_info->prewhere_column_name) - || (format_filter_info->row_level_filter && column_name == format_filter_info->row_level_filter->column_name))) - continue; + /// Skip columns produced by prewhere or row-level filter expressions — + /// they are computed at read time, not stored in the file. + if (format_filter_info + && ((format_filter_info->prewhere_info && column_name == format_filter_info->prewhere_info->prewhere_column_name) + || (format_filter_info->row_level_filter && column_name == format_filter_info->row_level_filter->column_name))) + continue; - /// Column is nullable and absent in file - constant_columns_with_values[column.second.first] = - ConstColumnWithValue{ - column.second.second, - Field() - }; - constant_columns.insert(column_name); - - LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'", - object_info->getPath(), - column_name, - column.second.second.type); + /// Column is nullable and absent in file + constant_columns_with_values[column.second.first] = + ConstColumnWithValue{ + column.second.second, + Field() + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'", + object_info->getPath(), + column_name, + column.second.second.type); + } } } diff --git a/tests/queries/0_stateless/04302_iceberg_read_optimization_no_column_stats.reference b/tests/queries/0_stateless/04302_iceberg_read_optimization_no_column_stats.reference new file mode 100644 index 000000000000..6092186edaaf --- /dev/null +++ b/tests/queries/0_stateless/04302_iceberg_read_optimization_no_column_stats.reference @@ -0,0 +1,3 @@ +1 alice 1.5 +2 bob 2.5 +3 carol 3.5 diff --git a/tests/queries/0_stateless/04302_iceberg_read_optimization_no_column_stats.sql b/tests/queries/0_stateless/04302_iceberg_read_optimization_no_column_stats.sql new file mode 100644 index 000000000000..447788ca8e6d --- /dev/null +++ b/tests/queries/0_stateless/04302_iceberg_read_optimization_no_column_stats.sql @@ -0,0 +1,7 @@ +-- Tags: no-fasttest +-- Tag no-fasttest: Depends on S3/MinIO + +-- A stats-less Iceberg manifest (no per-column statistics) must return real +-- values, not NULLs: empty stats were misread as "all columns absent" (#1545). +SELECT * FROM icebergS3(s3_conn, filename = 'iceberg_no_column_stats') ORDER BY ALL +SETTINGS allow_experimental_iceberg_read_optimization = 1; diff --git a/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/README.md b/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/README.md new file mode 100644 index 000000000000..d3914258df0d --- /dev/null +++ b/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/README.md @@ -0,0 +1,29 @@ +## How this data is generated? + +A tiny Iceberg v2 table with three nullable columns and three rows: + +| id | name | value | +|----|-------|-------| +| 1 | alice | 1.5 | +| 2 | bob | 2.5 | +| 3 | carol | 3.5 | + +The point of this fixture is that its manifest is **stats-less**: the `data_file` +entry carries no per-column statistics at all (`column_sizes`, `value_counts`, +`null_value_counts`, `lower_bounds`, `upper_bounds` are all empty). Such manifests +are produced by writers that do not collect metrics. They left ClickHouse's +`DataFileMetaInfo::columns_info` empty, which the Iceberg read optimization used +to misread as "every column is absent", returning `NULL` for all of them. + +It is generated by `generate.py` (needs `pyiceberg`, `pyarrow`, `fastavro`): + +```bash +python3 generate.py +``` + +The script creates the table via `pyiceberg` with +`write.metadata.metrics.default = none`, then post-processes the manifest avro to +drop every per-column statistic and rewrites all internal paths to the stable +`s3a://test/iceberg_no_column_stats` prefix used by the test bucket. + +Used by `tests/queries/0_stateless/04302_iceberg_read_optimization_no_column_stats.sql`. diff --git a/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/data/00000-0-39d4e713-69c8-49f9-ab8e-f887af4bcecb.parquet b/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/data/00000-0-39d4e713-69c8-49f9-ab8e-f887af4bcecb.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fcf707c09c173ebf2f6dcbbdfee666c90d6de74f GIT binary patch literal 1346 zcmb7E&1w@-6h1SZ3?XFEig%ciS!7{{3jK-26e)rBPMWl-4Q&FGY8H}7jA_WEiRn*T zmwgg9?p(O>0bB|$iVq;-%10=I;5qkZQrolz51F~=o}cgg?zv=kc0QkB{x3;du}ZxDX1Tcds5?{|ai3YK6dA`)gq)Zf9AWbPFn^D}}9 zyI7H{wkp^7>UA138fHT8VucPm9=%KdA3DIzH3!pg7 zDpg27(k%ycBqoyhkVHrlG#?b>TycJ?&Y5c8F!C`t5g%Yq#M69LwBOa*9cg)8Rw>1|+?lyUj^Cr(U zMa;#50(lwE^R(hVTyaOKwF8RR=_R2~2^u#pKOtz>qYU<=8&VD3jmlnaqa095wu*e4 z`m>io(#Q(~dFGM69l#O(rbTr1cExG z&)qzULf#kL9msiNzejOt*^Jajhm`Y`=NDNqI*BWy;eFs*9Nmkozm!@a_wIRNL6ffu zzPh|0K_@2zW}@f!-yAg#=jy%QoZs(v2Z`fG|De^~Pb@x}PtGT;H2yW^56` prefix. See README.md. Usage: generate.py . +""" +import json +import shutil +import sys +import tempfile +from pathlib import Path + +import fastavro +import pyarrow as pa +from pyiceberg.catalog.sql import SqlCatalog +from pyiceberg.schema import Schema +from pyiceberg.types import NestedField, LongType, StringType, DoubleType + +AVRO_RESERVED = {"avro.schema", "avro.codec"} + +# Per-column statistics on a manifest data_file entry (all optional); clearing +# them all is what makes the manifest stats-less. +STAT_FIELDS = ( + "column_sizes", + "value_counts", + "null_value_counts", + "nan_value_counts", + "lower_bounds", + "upper_bounds", +) + + +def deep_replace(obj, old, new): + if isinstance(obj, str): + return obj.replace(old, new) + if isinstance(obj, dict): + return {k: deep_replace(v, old, new) for k, v in obj.items()} + if isinstance(obj, list): + return [deep_replace(v, old, new) for v in obj] + return obj + + +def clear_stats(record): + df = record.get("data_file") + if isinstance(df, dict): + for field in STAT_FIELDS: + if field in df and df[field]: + df[field] = [] + return record + + +def rewrite_avro(src: Path, dst: Path, old: str, new: str, strip_stats: bool): + with open(src, "rb") as f: + reader = fastavro.reader(f) + schema = reader.writer_schema + meta = {k: v for k, v in reader.metadata.items() if k not in AVRO_RESERVED} + records = [deep_replace(r, old, new) for r in reader] + if strip_stats: + records = [clear_stats(r) for r in records] + with open(dst, "wb") as f: + fastavro.writer(f, schema, records, metadata=meta) + + +def main(out_dir: str): + work = Path(tempfile.mkdtemp(prefix="iceberg_gen_")) + warehouse = work / "warehouse" + warehouse.mkdir(parents=True) + + catalog = SqlCatalog( + "gen", + uri=f"sqlite:///{work}/catalog.db", + warehouse=f"file://{warehouse}", + ) + catalog.create_namespace("ns") + + schema = Schema( + NestedField(1, "id", LongType(), required=False), + NestedField(2, "name", StringType(), required=False), + NestedField(3, "value", DoubleType(), required=False), + ) + + # Reduces metrics, but pyiceberg still writes column_sizes (stripped below). + table = catalog.create_table( + "ns.no_stats", + schema=schema, + properties={"write.metadata.metrics.default": "none"}, + ) + + data = pa.table( + { + "id": pa.array([1, 2, 3], type=pa.int64()), + "name": pa.array(["alice", "bob", "carol"], type=pa.string()), + "value": pa.array([1.5, 2.5, 3.5], type=pa.float64()), + } + ) + table.append(data) + + table_location = Path(table.location().replace("file://", "")) + old_prefix = table.location() # file:///tmp/.../ns.db/no_stats + out = Path(out_dir) + new_prefix = f"s3a://test/{out.name}" # s3a://test/iceberg_no_column_stats + + if out.exists(): + shutil.rmtree(out) + (out / "metadata").mkdir(parents=True) + (out / "data").mkdir(parents=True) + + for f in (table_location / "data").rglob("*"): + if f.is_file(): + rel = f.relative_to(table_location / "data") + target = out / "data" / rel + target.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(f, target) + + meta_dir = table_location / "metadata" + + # Keep only the latest metadata.json (the post-append snapshot). The empty + # create-time version and the history logs that reference it aren't read. + latest_json = max( + (f for f in meta_dir.iterdir() if f.name.endswith(".metadata.json")), + key=lambda f: int(f.name.split("-", 1)[0]), + ) + meta = json.loads(latest_json.read_text().replace(old_prefix, new_prefix)) + meta["metadata-log"] = [] + meta["snapshot-log"] = [] + (out / "metadata" / latest_json.name).write_text(json.dumps(meta, separators=(",", ":"))) + + for f in meta_dir.iterdir(): + if f.name.endswith(".avro"): + # Only manifests carry data_file stats; the manifest list (snap-*) does not. + strip = not f.name.startswith("snap-") + rewrite_avro(f, out / "metadata" / f.name, old_prefix, new_prefix, strip) + + print(f"old prefix: {old_prefix}") + print(f"new prefix: {new_prefix}") + print(f"table uuid: {table.metadata.table_uuid}") + print(f"copied to: {out}") + shutil.rmtree(work) + + +if __name__ == "__main__": + if len(sys.argv) != 2: + sys.exit(f"usage: {sys.argv[0]} ") + main(sys.argv[1]) diff --git a/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/metadata/00001-1347065c-1de2-40e5-8774-255dfdff698c.metadata.json b/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/metadata/00001-1347065c-1de2-40e5-8774-255dfdff698c.metadata.json new file mode 100644 index 000000000000..892735e3b6b7 --- /dev/null +++ b/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/metadata/00001-1347065c-1de2-40e5-8774-255dfdff698c.metadata.json @@ -0,0 +1 @@ +{"location":"s3a://test/iceberg_no_column_stats","table-uuid":"d0e63068-996d-41ee-9b7d-31f1fb17f1b1","last-updated-ms":1782851862557,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"id","type":"long","required":false},{"id":2,"name":"name","type":"string","required":false},{"id":3,"name":"value","type":"double","required":false}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"default-spec-id":0,"last-partition-id":999,"properties":{"write.metadata.metrics.default":"none"},"current-snapshot-id":7564025723254944482,"snapshots":[{"snapshot-id":7564025723254944482,"sequence-number":1,"timestamp-ms":1782851862557,"manifest-list":"s3a://test/iceberg_no_column_stats/metadata/snap-7564025723254944482-0-39d4e713-69c8-49f9-ab8e-f887af4bcecb.avro","summary":{"operation":"append","added-files-size":"1346","added-data-files":"1","added-records":"3","total-data-files":"1","total-delete-files":"0","total-records":"3","total-files-size":"1346","total-position-deletes":"0","total-equality-deletes":"0"},"schema-id":0}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{"main":{"snapshot-id":7564025723254944482,"type":"branch"}},"statistics":[],"partition-statistics":[],"format-version":2,"last-sequence-number":1} \ No newline at end of file diff --git a/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/metadata/39d4e713-69c8-49f9-ab8e-f887af4bcecb-m0.avro b/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/metadata/39d4e713-69c8-49f9-ab8e-f887af4bcecb-m0.avro new file mode 100644 index 0000000000000000000000000000000000000000..4b0b3acfe26752e564a86f7b5c6827e4ecf53192 GIT binary patch literal 4303 zcmcgvy>H}16yGcer9i2G>WYzRcaycfB)dz6#6>y@!f}_YqLnqa$H@?{$IOfq?L}!2 z1vHd2v=lV-M^8ruI?>T1{sKzgn^|vWKVq^el8d7Ae7uj}`@N6(EdF9^=WUgou?hX- zjVn)2OXiI{rDc_9uj6H$<*D*UCs!U%y^-JX3Od0P@YShL=b|`;FUh{FxMXlFqq$<& z9eXF3-!WI`w?GGPx);vxUC_K@H|kPR#W{X2D;#$`>q!@F?RCINR%i}B$gXh&*3p;o z^`nwX%{3QASCuSz^JMHTuE5E5?@=b?gzD}ElM0TugPlYa8bofVRMU6pg%rDqNLjL7 zRC)fM`L+gp_hAhs0V@+B0WC6vj(8JV@Qf)Pfhak}of>8WIM$ z@pz%V>ke7C;lqB)GFs&t?xR_*h=P{tOy~%MH!s4%vq7JsHi3} zlN*TbH;K)7&LVYh*#XG5JOwl$R1wN^Y-Pc<_MUQ0JH<8H)@XukVDItGaKas`-y|0ylesYK+}q#!&cTEH(F zVr&55Q?AZksO`}Rx82$^uNhZRG){R!^JCLEu(p*h5L8NPDje5r0<{;aRc-%jT|D>w zgXqHd2PomT4D=6RvqtEg8R50awH47vMCuE(RiL@j9X(C~%> zfHxdAku`Ml`Y!N>F1t5@H~3&fcyn>dB#D8i3v-`Yj?D5yBz&-s6g_CdFxcOeqHgtF z!O-?7D5YzvX|QQys>*U5h-Uc#LBy&VHdf8R<<*^t*jP22Kn&N;_p|Q)xJcx*G@~L| zK_*O7Gh@ZMh~0FC&8XhQ^;!3v_jJ!%u-XtqGsoRhw4awb*P6o149ZHTm|hrV6z)dO z#*{QoJ~rD4v8Nlop#FxDsFHFZliFL{Kywz)_BrcU|fW0+7&$rQeGVboAU zYIu62kT@`&!x+t#vD~#A&qU_TZSO!FqsBIiTYr7~&(A-8_08~g`0@w({`k-K_w9E3 z*WZ5m>DBAM|NVWd)oQ)kQXw7f?ZNP>_jtnK8F(5M0tOqLG9!Gqsy!s)UJrk|y>2*6 z`|Q9EyZggr(CrViVVA}O*3AZk1Df^YgeCFr?De$s^rL5=zj%E7*4OX0TQ~n&?e;e4 HUbgN(x-vds literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/metadata/snap-7564025723254944482-0-39d4e713-69c8-49f9-ab8e-f887af4bcecb.avro b/tests/queries/0_stateless/data_minio/iceberg_no_column_stats/metadata/snap-7564025723254944482-0-39d4e713-69c8-49f9-ab8e-f887af4bcecb.avro new file mode 100644 index 0000000000000000000000000000000000000000..1ebf121994f5ec1f70c9df177f4c7a86c5ab079f GIT binary patch literal 1772 zcmb7F&x_MQ7;OuR2TvmFMI;3AwAm(Y+NOJ55CmaC@l;EkOulSKlbJX(*;-2f9zBUi z1<{K)4 zdj8P&{h)I@M;xehjB8hDk*2oturK6?=UHoe`yV zRq34>+^)fZ;uzwvYM*nS>lL{E23|LU2r(;mutp7!p*ZQcag0d7~)nMixdtjck2O9n5W& zvP^G{)szMbT8$av9L+R}NXYa^|7#KOo*U@9Tjzghh|=SM2opJ}*4jg}&3>_2&;3PpXK#3n=z>$-=2l{Vv dcGPIJTAx0@d-eA0$FHx=R-