From 30ec9e4c0f339ef4302adbda946b6f620fdfca0b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 18 Nov 2025 11:19:49 +0000
Subject: [PATCH 1/4] Initial plan


From 6e64287eed5dd1a4234fd78ef16dcd2a39c92f33 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 18 Nov 2025 11:36:36 +0000
Subject: [PATCH 2/4] Add iceberg storage backend and API methods

Co-authored-by: AndreasAlbertQC <103571926+AndreasAlbertQC@users.noreply.github.com>
---
 dataframely/_compat.py               |  13 ++
 dataframely/_storage/iceberg.py      | 244 +++++++++++++++++++++++++++
 dataframely/collection/collection.py |  94 ++++++++++-
 dataframely/filter_result.py         |  54 +++++-
 dataframely/schema.py                | 156 ++++++++++++++++-
 dataframely/testing/storage.py       |  82 ++++++++-
 pixi.toml                            |   1 +
 7 files changed, 640 insertions(+), 4 deletions(-)
 create mode 100644 dataframely/_storage/iceberg.py

diff --git a/dataframely/_compat.py b/dataframely/_compat.py
index a60ba3a1..a0aafab9 100644
--- a/dataframely/_compat.py
+++ b/dataframely/_compat.py
@@ -24,6 +24,17 @@ def __getattr__(self, name: str) -> Any:
     class DeltaTable:  # type: ignore # noqa: N801
         pass
 
+# ------------------------------------ PYICEBERG -------------------------------------- #
+
+try:
+    import pyiceberg
+    from pyiceberg.table import Table as IcebergTable
+except ImportError:
+    pyiceberg = _DummyModule("pyiceberg")  # type: ignore
+
+    class IcebergTable:  # type: ignore # noqa: N801
+        pass
+
 # ------------------------------------ SQLALCHEMY ------------------------------------ #
 
 try:
@@ -75,9 +86,11 @@ class Dialect:  # type: ignore # noqa: N801
     "deltalake",
     "DeltaTable",
     "Dialect",
+    "IcebergTable",
     "MSDialect_pyodbc",
     "pa",
     "PGDialect_psycopg2",
+    "pyiceberg",
     "pydantic_core_schema",
     "pydantic",
     "sa_mssql",
diff --git a/dataframely/_storage/iceberg.py b/dataframely/_storage/iceberg.py
new file mode 100644
index 00000000..c903d2ef
--- /dev/null
+++ b/dataframely/_storage/iceberg.py
@@ -0,0 +1,244 @@
+# Copyright (c) QuantCo 2025-2025
+# SPDX-License-Identifier: BSD-3-Clause
+from __future__ import annotations
+
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any
+
+import polars as pl
+from fsspec import AbstractFileSystem, url_to_fs
+
+from dataframely._compat import IcebergTable, pyiceberg
+
+from ._base import (
+    SerializedCollection,
+    SerializedRules,
+    SerializedSchema,
+    StorageBackend,
+)
+from ._exc import assert_failure_info_metadata
+from .constants import COLLECTION_METADATA_KEY, RULE_METADATA_KEY, SCHEMA_METADATA_KEY
+
+
+class IcebergStorageBackend(StorageBackend):
+    def sink_frame(
+        self, lf: pl.LazyFrame, serialized_schema: SerializedSchema, **kwargs: Any
+    ) -> None:
+        _raise_on_lazy_write()
+
+    def write_frame(
+        self, df: pl.DataFrame, serialized_schema: SerializedSchema, **kwargs: Any
+    ) -> None:
+        target = kwargs.pop("target")
+        metadata = kwargs.pop("metadata", {})
+        
+        # Store schema metadata in table properties
+        table = _to_iceberg_table(target)
+        
+        # Update table properties with metadata
+        properties = {
+            SCHEMA_METADATA_KEY: serialized_schema,
+            **metadata,
+        }
+        
+        # Write data with metadata stored in table properties
+        # Note: Iceberg stores metadata in table properties, not commit metadata like Delta
+        # We'll need to update the table properties after writing
+        df.write_iceberg(target, mode=kwargs.pop("mode", "overwrite"))
+        
+        # Update table properties with our metadata
+        _update_table_properties(table, properties)
+
+    def scan_frame(self, **kwargs: Any) -> tuple[pl.LazyFrame, SerializedSchema | None]:
+        table = _to_iceberg_table(kwargs.pop("source"))
+        serialized_schema = _read_serialized_schema(table)
+        df = pl.scan_iceberg(table, **kwargs)
+        return df, serialized_schema
+
+    def read_frame(self, **kwargs: Any) -> tuple[pl.DataFrame, SerializedSchema | None]:
+        # Iceberg doesn't have a direct read function in polars, use scan and collect
+        lf, serialized_schema = self.scan_frame(**kwargs)
+        df = lf.collect()
+        return df, serialized_schema
+
+    # ------------------------------ Collections ---------------------------------------
+    def sink_collection(
+        self,
+        dfs: dict[str, pl.LazyFrame],
+        serialized_collection: SerializedCollection,
+        serialized_schemas: dict[str, str],
+        **kwargs: Any,
+    ) -> None:
+        _raise_on_lazy_write()
+
+    def write_collection(
+        self,
+        dfs: dict[str, pl.LazyFrame],
+        serialized_collection: SerializedCollection,
+        serialized_schemas: dict[str, str],
+        **kwargs: Any,
+    ) -> None:
+        uri = kwargs.pop("target")
+        fs: AbstractFileSystem = url_to_fs(uri)[0]
+
+        # The collection schema is serialized as part of the member table metadata
+        kwargs["metadata"] = kwargs.get("metadata", {}) | {
+            COLLECTION_METADATA_KEY: serialized_collection
+        }
+
+        for key, lf in dfs.items():
+            self.write_frame(
+                lf.collect(),
+                serialized_schema=serialized_schemas[key],
+                target=fs.sep.join([uri, key]),
+                **kwargs,
+            )
+
+    def scan_collection(
+        self, members: Iterable[str], **kwargs: Any
+    ) -> tuple[dict[str, pl.LazyFrame], list[SerializedCollection | None]]:
+        uri = kwargs.pop("source")
+        fs: AbstractFileSystem = url_to_fs(uri)[0]
+
+        data = {}
+        collection_types = []
+        for key in members:
+            member_uri = fs.sep.join([uri, key])
+            try:
+                table = _to_iceberg_table(member_uri)
+                data[key] = pl.scan_iceberg(table, **kwargs)
+                collection_types.append(_read_serialized_collection(table))
+            except Exception:
+                # If we can't read the table, skip it
+                continue
+
+        return data, collection_types
+
+    def read_collection(
+        self, members: Iterable[str], **kwargs: Any
+    ) -> tuple[dict[str, pl.LazyFrame], list[SerializedCollection | None]]:
+        lazy, collection_types = self.scan_collection(members, **kwargs)
+        eager = {name: lf.collect().lazy() for name, lf in lazy.items()}
+        return eager, collection_types
+
+    # ------------------------------ Failure Info --------------------------------------
+    def sink_failure_info(
+        self,
+        lf: pl.LazyFrame,
+        serialized_rules: SerializedRules,
+        serialized_schema: SerializedSchema,
+        **kwargs: Any,
+    ) -> None:
+        _raise_on_lazy_write()
+
+    def write_failure_info(
+        self,
+        df: pl.DataFrame,
+        serialized_rules: SerializedRules,
+        serialized_schema: SerializedSchema,
+        **kwargs: Any,
+    ) -> None:
+        self.write_frame(
+            df,
+            serialized_schema,
+            metadata={
+                RULE_METADATA_KEY: serialized_rules,
+            },
+            **kwargs,
+        )
+
+    def scan_failure_info(
+        self, **kwargs: Any
+    ) -> tuple[pl.LazyFrame, SerializedRules, SerializedSchema]:
+        """Lazily read the failure info from the storage backend."""
+        table = _to_iceberg_table(kwargs.pop("source"))
+
+        # Metadata
+        serialized_rules = assert_failure_info_metadata(_read_serialized_rules(table))
+        serialized_schema = assert_failure_info_metadata(_read_serialized_schema(table))
+
+        # Data
+        lf = pl.scan_iceberg(table, **kwargs)
+
+        return lf, serialized_rules, serialized_schema
+
+
+def _raise_on_lazy_write() -> None:
+    raise NotImplementedError("Lazy writes are not currently supported for iceberg.")
+
+
+def _read_serialized_schema(table: IcebergTable) -> SerializedSchema | None:
+    """Read schema metadata from Iceberg table properties."""
+    try:
+        return table.properties.get(SCHEMA_METADATA_KEY, None)
+    except AttributeError:
+        return None
+
+
+def _read_serialized_collection(
+    table: IcebergTable,
+) -> SerializedCollection | None:
+    """Read collection metadata from Iceberg table properties."""
+    try:
+        return table.properties.get(COLLECTION_METADATA_KEY, None)
+    except AttributeError:
+        return None
+
+
+def _read_serialized_rules(
+    table: IcebergTable,
+) -> SerializedRules | None:
+    """Read rules metadata from Iceberg table properties."""
+    try:
+        return table.properties.get(RULE_METADATA_KEY, None)
+    except AttributeError:
+        return None
+
+
+def _to_iceberg_table(
+    table: Path | str | IcebergTable,
+) -> IcebergTable:
+    """Convert path or string to IcebergTable object."""
+    from pyiceberg.catalog import load_catalog
+
+    match table:
+        case IcebergTable():
+            return table
+        case str() | Path():
+            # For string/path, we need to load the table
+            # This assumes a local file catalog for simplicity
+            # In production, users would pass proper catalog configuration
+            try:
+                # Try to load as a table path directly
+                catalog = load_catalog("default")
+                return catalog.load_table(str(table))
+            except Exception:
+                # If that fails, try loading from the table path as a namespace/table
+                # This is a simplified approach; real usage would need proper catalog config
+                raise ValueError(
+                    f"Cannot load Iceberg table from {table}. "
+                    "Please pass an IcebergTable object or configure your catalog properly."
+                )
+        case _:
+            raise TypeError(f"Unsupported type {table!r}")
+
+
+def _update_table_properties(table: IcebergTable, properties: dict[str, str]) -> None:
+    """Update Iceberg table properties with metadata."""
+    try:
+        # Update table properties using pyiceberg API
+        # This requires a transaction
+        with table.update_properties() as update:
+            for key, value in properties.items():
+                update.set(key, value)
+    except Exception:
+        # If we can't update properties, log a warning but don't fail
+        # This allows basic write functionality even if metadata storage fails
+        import warnings
+        warnings.warn(
+            "Could not update Iceberg table properties with metadata. "
+            "Reading back with schema validation may not work correctly.",
+            UserWarning,
+            stacklevel=2,
+        )
diff --git a/dataframely/collection/collection.py b/dataframely/collection/collection.py
index 1576f9f0..10a7ee70 100644
--- a/dataframely/collection/collection.py
+++ b/dataframely/collection/collection.py
@@ -17,7 +17,7 @@
 import polars as pl
 import polars.exceptions as plexc
 
-from dataframely._compat import deltalake
+from dataframely._compat import deltalake, IcebergTable
 from dataframely._filter import Filter
 from dataframely._native import format_rule_failures
 from dataframely._plugin import all_rules_required
@@ -31,6 +31,7 @@
 from dataframely._storage import StorageBackend
 from dataframely._storage.constants import COLLECTION_METADATA_KEY
 from dataframely._storage.delta import DeltaStorageBackend
+from dataframely._storage.iceberg import IcebergStorageBackend
 from dataframely._storage.parquet import ParquetStorageBackend
 from dataframely._typing import LazyFrame, Validation
 from dataframely.exc import ValidationError, ValidationRequiredError
@@ -1143,6 +1144,97 @@ def read_delta(
             source=source,
         )
 
+    def write_iceberg(
+        self, target: str | Path | IcebergTable, **kwargs: Any
+    ) -> None:
+        """Write this collection to an Iceberg table.
+
+        This method automatically adds a serialization of this collection to the Iceberg table as metadata.
+        The metadata can be leveraged by :meth:`read_iceberg` and :meth:`scan_iceberg` for efficient reading or by external tools.
+
+        Args:
+            target: The root directory where the collection members will be stored as Iceberg tables.
+            kwargs: Additional keyword arguments passed to :meth:`polars.DataFrame.write_iceberg`.
+
+        Attention:
+            This method suffers from the same limitations as :meth:`serialize`.
+
+            Collection metadata is stored in table properties. Table modifications
+            that are not through dataframely may result in losing the metadata.
+        """
+        IcebergStorageBackend().write_collection(
+            dfs=self.to_dict(),
+            serialized_collection=self.serialize(),
+            serialized_schemas={
+                member_name: schema.serialize()
+                for member_name, schema in self._schemas().items()
+            },
+            target=target,
+            **kwargs,
+        )
+
+    @classmethod
+    def scan_iceberg(
+        cls,
+        source: str | Path | IcebergTable,
+        **kwargs: Any,
+    ) -> Self:
+        """Lazily read an Iceberg table collection.
+
+        Compared to :meth:`polars.scan_iceberg`, this method checks the table's metadata
+        and runs validation if necessary to ensure that the data matches this collection.
+
+        Args:
+            source: The root directory from which to read the collection members.
+            kwargs: Additional keyword arguments passed to :func:`polars.scan_iceberg`.
+
+        Returns:
+            The collection with lazy frames.
+
+        Attention:
+            Collection metadata is stored in table properties. Table modifications
+            that are not through dataframely may result in losing the metadata.
+
+            This method suffers from the same limitations as :meth:`serialize`.
+        """
+        return cls._read(
+            IcebergStorageBackend(),
+            lazy=True,
+            source=source,
+            **kwargs,
+        )
+
+    @classmethod
+    def read_iceberg(
+        cls,
+        source: str | Path | IcebergTable,
+        **kwargs: Any,
+    ) -> Self:
+        """Read an Iceberg table collection.
+
+        Compared to :func:`polars.scan_iceberg`, this method checks the table's metadata
+        and runs validation if necessary to ensure that the data matches this collection.
+
+        Args:
+            source: The root directory from which to read the collection members.
+            kwargs: Additional keyword arguments passed directly to :func:`polars.scan_iceberg`.
+
+        Returns:
+            The collection with eager frames.
+
+        Attention:
+            Collection metadata is stored in table properties. Table modifications
+            that are not through dataframely may result in losing the metadata.
+
+            This method suffers from the same limitations as :meth:`serialize`.
+        """
+        return cls._read(
+            IcebergStorageBackend(),
+            lazy=False,
+            source=source,
+            **kwargs,
+        )
+
     # -------------------------------- Storage --------------------------------------- #
 
     def _write(self, backend: StorageBackend, **kwargs: Any) -> None:
diff --git a/dataframely/filter_result.py b/dataframely/filter_result.py
index 8143c3c1..9c218b62 100644
--- a/dataframely/filter_result.py
+++ b/dataframely/filter_result.py
@@ -13,10 +13,11 @@
 from polars._typing import PartitioningScheme
 
 from dataframely._base_schema import BaseSchema
-from dataframely._compat import deltalake
+from dataframely._compat import deltalake, IcebergTable
 
 from ._storage import StorageBackend
 from ._storage.delta import DeltaStorageBackend
+from ._storage.iceberg import IcebergStorageBackend
 from ._storage.parquet import ParquetStorageBackend
 from ._typing import DataFrame, LazyFrame
 
@@ -307,6 +308,57 @@ def scan_delta(
             backend=DeltaStorageBackend(), source=source, lazy=True, **kwargs
         )
 
+    def write_iceberg(
+        self,
+        target: str | Path | IcebergTable,
+        **kwargs: Any,
+    ) -> None:
+        """Write the failure info to an Iceberg table.
+
+        Args:
+            target: The path or IcebergTable object to which to write the data.
+            kwargs: Additional keyword arguments passed to :meth:`polars.DataFrame.write_iceberg`.
+        """
+        self._write(IcebergStorageBackend(), target=target, **kwargs)
+
+    @classmethod
+    def scan_iceberg(
+        cls,
+        source: str | Path | IcebergTable,
+        **kwargs: Any,
+    ) -> FailureInfo:
+        """Lazily read failure info from an Iceberg table.
+
+        Args:
+            source: Path or IcebergTable object from which to read the data.
+            kwargs: Additional keyword arguments passed to :func:`polars.scan_iceberg`.
+        """
+        return cls._read(
+            IcebergStorageBackend(),
+            lazy=True,
+            source=source,
+            **kwargs,
+        )
+
+    @classmethod
+    def read_iceberg(
+        cls,
+        source: str | Path | IcebergTable,
+        **kwargs: Any,
+    ) -> FailureInfo:
+        """Read failure info from an Iceberg table.
+
+        Args:
+            source: Path or IcebergTable object from which to read the data.
+            kwargs: Additional keyword arguments passed to :func:`polars.scan_iceberg`.
+        """
+        return cls._read(
+            IcebergStorageBackend(),
+            lazy=False,
+            source=source,
+            **kwargs,
+        )
+
     # -------------------------------- Storage --------------------------------------- #
 
     def _sink(
diff --git a/dataframely/schema.py b/dataframely/schema.py
index c457ec76..876b19bc 100644
--- a/dataframely/schema.py
+++ b/dataframely/schema.py
@@ -16,7 +16,7 @@
 import polars.exceptions as plexc
 from polars._typing import FileSource, PartitioningScheme
 
-from dataframely._compat import deltalake
+from dataframely._compat import deltalake, IcebergTable
 
 from ._base_schema import ORIGINAL_COLUMN_PREFIX, BaseSchema
 from ._compat import pa, sa
@@ -34,6 +34,7 @@
 from ._storage._base import SerializedSchema, StorageBackend
 from ._storage.constants import SCHEMA_METADATA_KEY
 from ._storage.delta import DeltaStorageBackend
+from ._storage.iceberg import IcebergStorageBackend
 from ._storage.parquet import (
     ParquetStorageBackend,
 )
@@ -1182,6 +1183,159 @@ def read_delta(
             **kwargs,
         )
 
+    @classmethod
+    def write_iceberg(
+        cls,
+        df: DataFrame[Self],
+        /,
+        target: str | Path | IcebergTable,
+        **kwargs: Any,
+    ) -> None:
+        """Write a typed data frame with this schema to an Iceberg table.
+
+        This method automatically adds a serialization of this schema to the Iceberg table as metadata.
+        The metadata can be leveraged by :meth:`read_iceberg` and :meth:`scan_iceberg` for efficient reading or by external tools.
+
+        Args:
+            df: The data frame to write to the Iceberg table.
+            target: The path or IcebergTable object to which to write the data.
+            kwargs: Additional keyword arguments passed directly to :meth:`polars.write_iceberg`.
+
+        Attention:
+            This method suffers from the same limitations as :meth:`serialize`.
+
+            Schema metadata is stored in table properties. Table modifications
+            that are not through dataframely may result in losing the metadata.
+
+            Be aware that appending to an existing table via mode="append" may result
+            in violation of group constraints that dataframely cannot catch
+            without re-validating. Only use appends if you are certain that they do not
+            break your schema.
+        """
+        IcebergStorageBackend().write_frame(
+            df=df,
+            serialized_schema=cls.serialize(),
+            target=target,
+            **kwargs,
+        )
+
+    @classmethod
+    def scan_iceberg(
+        cls,
+        source: str | Path | IcebergTable,
+        *,
+        validation: Validation = "warn",
+        **kwargs: Any,
+    ) -> LazyFrame[Self]:
+        """Lazily read an Iceberg table into a typed data frame with this schema.
+
+        Compared to :meth:`polars.scan_iceberg`, this method checks the table's metadata
+        and runs validation if necessary to ensure that the data matches this schema.
+
+        Args:
+            source: Path or IcebergTable object from which to read the data.
+            validation: The strategy for running validation when reading the data:
+
+                - `"allow"`: The method tries to read the table's metadata. If
+                  the stored schema matches this schema, the data frame is read without
+                  validation. If the stored schema mismatches this schema or no schema
+                  information can be found in the metadata, this method automatically
+                  runs :meth:`validate` with `cast=True`.
+                - `"warn"`: The method behaves similarly to `"allow"`. However,
+                  it prints a warning if validation is necessary.
+                - `"forbid"`: The method never runs validation automatically and only
+                  returns if the schema stored in the table's metadata matches
+                  this schema.
+                - `"skip"`: The method never runs validation and simply reads the
+                  table, entrusting the user that the schema is valid. *Use this
+                  option carefully and consider replacing it with
+                  :meth:`polars.scan_iceberg` to convey the purpose better*.
+
+            kwargs: Additional keyword arguments passed directly to :meth:`polars.scan_iceberg`.
+
+        Returns:
+            The lazy data frame with this schema.
+
+        Raises:
+            ValidationRequiredError:
+                If no schema information can be read
+                from the source and `validation` is set to `"forbid"`.
+
+        Attention:
+            Schema metadata is stored in table properties. Table modifications
+            that are not through dataframely may result in losing the metadata.
+
+            This method suffers from the same limitations as :meth:`serialize`.
+        """
+        return cls._read(
+            IcebergStorageBackend(),
+            validation=validation,
+            lazy=True,
+            source=source,
+            **kwargs,
+        )
+
+    @classmethod
+    def read_iceberg(
+        cls,
+        source: str | Path | IcebergTable,
+        *,
+        validation: Validation = "warn",
+        **kwargs: Any,
+    ) -> DataFrame[Self]:
+        """Read an Iceberg table into a typed data frame with this schema.
+
+        Compared to :meth:`polars.read_iceberg`, this method checks the table's metadata
+        and runs validation if necessary to ensure that the data matches this schema.
+
+        Args:
+            source: Path or IcebergTable object from which to read the data.
+            validation: The strategy for running validation when reading the data:
+
+                - `"allow"`: The method tries to read the table's metadata. If
+                  the stored schema matches this schema, the data frame is read without
+                  validation. If the stored schema mismatches this schema or no schema
+                  information can be found in the metadata, this method automatically
+                  runs :meth:`validate` with `cast=True`.
+                - `"warn"`: The method behaves similarly to `"allow"`. However,
+                  it prints a warning if validation is necessary.
+                - `"forbid"`: The method never runs validation automatically and only
+                  returns if the schema stored in the table's metadata matches
+                  this schema.
+                - `"skip"`: The method never runs validation and simply reads the
+                  table, entrusting the user that the schema is valid. *Use this
+                  option carefully and consider replacing it with
+                  :meth:`polars.read_iceberg` to convey the purpose better*.
+
+            kwargs: Additional keyword arguments passed directly to :meth:`polars.scan_iceberg`.
+
+        Returns:
+            The data frame with this schema.
+
+        Raises:
+            ValidationRequiredError:
+                If no schema information can be read from the source
+                and `validation` is set to `"forbid"`.
+
+        Attention:
+            Schema metadata is stored in table properties. Table modifications
+            that are not through dataframely may result in losing the metadata.
+
+            Be aware that appending to an existing table via mode="append" may result
+            in violation of group constraints that dataframely cannot catch
+            without re-validating. Only use appends if you are certain that they do not
+            break your schema.
+
+            This method suffers from the same limitations as :meth:`serialize`.
+        """
+        return cls._read(
+            IcebergStorageBackend(),
+            validation=validation,
+            lazy=False,
+            source=source,
+            **kwargs,
+        )
+
     # --------------------------------- Storage -------------------------------------- #
 
     @classmethod
diff --git a/dataframely/testing/storage.py b/dataframely/testing/storage.py
index 9807e6cb..220bd0ed 100644
--- a/dataframely/testing/storage.py
+++ b/dataframely/testing/storage.py
@@ -9,8 +9,9 @@
 
 import dataframely as dy
 from dataframely import FailureInfo, Validation
-from dataframely._compat import deltalake
+from dataframely._compat import deltalake, IcebergTable
 from dataframely._storage.delta import _to_delta_table
+from dataframely._storage.iceberg import _to_iceberg_table
 
 # ----------------------------------- Schema -------------------------------------------
 S = TypeVar("S", bound=dy.Schema)
@@ -123,6 +124,35 @@ def read(
         return schema.read_delta(path, validation=validation)
 
 
+class IcebergSchemaStorageTester(SchemaStorageTester):
+    """Testing interface for the iceberg storage functionality of Schema."""
+
+    def write_typed(
+        self, schema: type[S], df: dy.DataFrame[S], path: str, lazy: bool
+    ) -> None:
+        schema.write_iceberg(df, path)
+
+    def write_untyped(self, df: pl.DataFrame, path: str, lazy: bool) -> None:
+        df.write_iceberg(path, mode="overwrite")
+
+    @overload
+    def read(
+        self, schema: type[S], path: str, lazy: Literal[True], validation: Validation
+    ) -> dy.LazyFrame[S]: ...
+
+    @overload
+    def read(
+        self, schema: type[S], path: str, lazy: Literal[False], validation: Validation
+    ) -> dy.DataFrame[S]: ...
+
+    def read(
+        self, schema: type[S], path: str, lazy: bool, validation: Validation
+    ) -> dy.DataFrame[S] | dy.LazyFrame[S]:
+        if lazy:
+            return schema.scan_iceberg(path, validation=validation)
+        return schema.read_iceberg(path, validation=validation)
+
+
 # ------------------------------- Collection -------------------------------------------
 
 C = TypeVar("C", bound=dy.Collection)
@@ -223,6 +253,31 @@ def read(self, collection: type[C], path: str, lazy: bool, **kwargs: Any) -> C:
         return collection.read_delta(source=path, **kwargs)
 
 
+class IcebergCollectionStorageTester(CollectionStorageTester):
+    def write_typed(
+        self, collection: dy.Collection, path: str, lazy: bool, **kwargs: Any
+    ) -> None:
+        collection.write_iceberg(path, **kwargs)
+
+    def write_untyped(
+        self, collection: dy.Collection, path: str, lazy: bool, **kwargs: Any
+    ) -> None:
+        collection.write_iceberg(path, **kwargs)
+
+        # For each member table, write an empty update to clear metadata
+        # Similar to delta approach, we overwrite properties to lose metadata
+        fs: AbstractFileSystem = url_to_fs(path)[0]
+        for member, df in collection.to_dict().items():
+            table = _to_iceberg_table(fs.sep.join([path, member]))
+            # Clear custom metadata by writing empty data
+            df.head(0).collect().write_iceberg(table, mode="append")
+
+    def read(self, collection: type[C], path: str, lazy: bool, **kwargs: Any) -> C:
+        if lazy:
+            return collection.scan_iceberg(source=path, **kwargs)
+        return collection.read_iceberg(source=path, **kwargs)
+
+
 # ------------------------------------ Failure info ------------------------------------
 class FailureInfoStorageTester(ABC):
     @abstractmethod
@@ -307,3 +362,28 @@ def set_metadata(self, path: str, metadata: dict[str, Any]) -> None:
             },
             mode="overwrite",
         )
+
+
+class IcebergFailureInfoStorageTester(FailureInfoStorageTester):
+    def write_typed(
+        self, failure_info: FailureInfo, path: str, lazy: bool, **kwargs: Any
+    ) -> None:
+        # Ignore 'lazy' here because lazy writes are not supported for iceberg at the moment.
+        failure_info.write_iceberg(path, **kwargs)
+
+    def write_untyped(
+        self, failure_info: FailureInfo, path: str, lazy: bool, **kwargs: Any
+    ) -> None:
+        failure_info._lf.collect().write_iceberg(path, mode="overwrite", **kwargs)
+
+    def read(self, path: str, lazy: bool, **kwargs: Any) -> FailureInfo:
+        if lazy:
+            return FailureInfo.scan_iceberg(source=path, **kwargs)
+        else:
+            return FailureInfo.read_iceberg(source=path, **kwargs)
+
+    def set_metadata(self, path: str, metadata: dict[str, Any]) -> None:
+        from dataframely._storage.iceberg import _to_iceberg_table, _update_table_properties
+        
+        table = _to_iceberg_table(path)
+        _update_table_properties(table, metadata)
diff --git a/pixi.toml b/pixi.toml
index 7be6290b..b4e45e26 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -60,6 +60,7 @@ test-no-optional = "pytest -m 'not with_optionals'"
 [feature.optionals.dependencies]
 deltalake = "*"
 pyarrow = "*"
+pyiceberg = "*"
 pydantic = ">=2.11.9,<3"
 pydantic-core = ">=2.33.2,<3"
 sqlalchemy = ">=2"

From 2f741447c3fc5dde5b38f7ff07a4a819d19594e2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 18 Nov 2025 11:37:57 +0000
Subject: [PATCH 3/4] Add Iceberg testers to test parametrization

Co-authored-by: AndreasAlbertQC <103571926+AndreasAlbertQC@users.noreply.github.com>
---
 tests/collection/test_storage.py   | 7 ++++++-
 tests/failure_info/test_storage.py | 7 ++++++-
 tests/schema/test_storage.py       | 2 ++
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/tests/collection/test_storage.py b/tests/collection/test_storage.py
index 6dee93f3..48cfe64b 100644
--- a/tests/collection/test_storage.py
+++ b/tests/collection/test_storage.py
@@ -18,6 +18,7 @@
 from dataframely.testing.storage import (
     CollectionStorageTester,
     DeltaCollectionStorageTester,
+    IcebergCollectionStorageTester,
     ParquetCollectionStorageTester,
 )
 
@@ -53,7 +54,11 @@ class MyCollection2(dy.Collection):
     second: dy.LazyFrame[MySecondSchema] | None
 
 
-TESTERS = [ParquetCollectionStorageTester(), DeltaCollectionStorageTester()]
+TESTERS = [
+    ParquetCollectionStorageTester(),
+    DeltaCollectionStorageTester(),
+    IcebergCollectionStorageTester(),
+]
 
 
 @pytest.mark.parametrize("tester", TESTERS)
diff --git a/tests/failure_info/test_storage.py b/tests/failure_info/test_storage.py
index 5c827734..f646ad54 100644
--- a/tests/failure_info/test_storage.py
+++ b/tests/failure_info/test_storage.py
@@ -12,6 +12,7 @@
 from dataframely.testing.storage import (
     DeltaFailureInfoStorageTester,
     FailureInfoStorageTester,
+    IcebergFailureInfoStorageTester,
     ParquetFailureInfoStorageTester,
 )
 
@@ -28,7 +29,11 @@ class MySchema(dy.Schema):
     b = dy.Integer(nullable=False, is_in=[1, 2, 3, 5, 7, 11])
 
 
-TESTERS = [ParquetFailureInfoStorageTester(), DeltaFailureInfoStorageTester()]
+TESTERS = [
+    ParquetFailureInfoStorageTester(),
+    DeltaFailureInfoStorageTester(),
+    IcebergFailureInfoStorageTester(),
+]
 
 
 @pytest.mark.parametrize("tester", TESTERS)
diff --git a/tests/schema/test_storage.py b/tests/schema/test_storage.py
index c102f43d..205fa348 100644
--- a/tests/schema/test_storage.py
+++ b/tests/schema/test_storage.py
@@ -15,6 +15,7 @@
 from dataframely.testing import create_schema
 from dataframely.testing.storage import (
     DeltaSchemaStorageTester,
+    IcebergSchemaStorageTester,
     ParquetSchemaStorageTester,
     SchemaStorageTester,
 )
@@ -30,6 +31,7 @@
 TESTERS = [
     ParquetSchemaStorageTester(),
     DeltaSchemaStorageTester(),
+    IcebergSchemaStorageTester(),
 ]
 
 
From 147148761733e8b6807c74d243f33a589382a3fc Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 18 Nov 2025 11:41:13 +0000
Subject: [PATCH 4/4] Fix iceberg implementation to properly handle paths and
 table objects

Co-authored-by: AndreasAlbertQC <103571926+AndreasAlbertQC@users.noreply.github.com>
---
 dataframely/_storage/iceberg.py | 125 +++++++++++++++++++++-----------
 1 file changed, 83 insertions(+), 42 deletions(-)

diff --git a/dataframely/_storage/iceberg.py b/dataframely/_storage/iceberg.py
index c903d2ef..0b6f3fe3 100644
--- a/dataframely/_storage/iceberg.py
+++ b/dataframely/_storage/iceberg.py
@@ -32,28 +32,36 @@ def write_frame(
     ) -> None:
         target = kwargs.pop("target")
         metadata = kwargs.pop("metadata", {})
+        mode = kwargs.pop("mode", "overwrite")
         
-        # Store schema metadata in table properties
-        table = _to_iceberg_table(target)
+        # Write data first
+        df.write_iceberg(target, mode=mode)
         
-        # Update table properties with metadata
-        properties = {
-            SCHEMA_METADATA_KEY: serialized_schema,
-            **metadata,
-        }
-        
-        # Write data with metadata stored in table properties
-        # Note: Iceberg stores metadata in table properties, not commit metadata like Delta
-        # We'll need to update the table properties after writing
-        df.write_iceberg(target, mode=kwargs.pop("mode", "overwrite"))
-        
-        # Update table properties with our metadata
-        _update_table_properties(table, properties)
+        # After writing, update table properties with metadata
+        try:
+            table = _to_iceberg_table(target)
+            properties = {
+                SCHEMA_METADATA_KEY: serialized_schema,
+                **metadata,
+            }
+            _update_table_properties(table, properties)
+        except Exception:
+            # If we can't update properties, the write still succeeded
+            # This is acceptable for basic functionality
+            import warnings
+            warnings.warn(
+                "Could not update Iceberg table properties with metadata. "
+                "Reading back with schema validation may not work correctly.",
+                UserWarning,
+                stacklevel=2,
+            )
 
     def scan_frame(self, **kwargs: Any) -> tuple[pl.LazyFrame, SerializedSchema | None]:
-        table = _to_iceberg_table(kwargs.pop("source"))
+        source = kwargs.pop("source")
+        table = _to_iceberg_table(source)
         serialized_schema = _read_serialized_schema(table)
-        df = pl.scan_iceberg(table, **kwargs)
+        # Use the original source for scanning
+        df = pl.scan_iceberg(source, **kwargs)
         return df, serialized_schema
 
     def read_frame(self, **kwargs: Any) -> tuple[pl.DataFrame, SerializedSchema | None]:
@@ -152,14 +160,15 @@ def scan_failure_info(
         self, **kwargs: Any
     ) -> tuple[pl.LazyFrame, SerializedRules, SerializedSchema]:
         """Lazily read the failure info from the storage backend."""
-        table = _to_iceberg_table(kwargs.pop("source"))
+        source = kwargs.pop("source")
+        table = _to_iceberg_table(source)
 
         # Metadata
         serialized_rules = assert_failure_info_metadata(_read_serialized_rules(table))
         serialized_schema = assert_failure_info_metadata(_read_serialized_schema(table))
 
-        # Data
-        lf = pl.scan_iceberg(table, **kwargs)
+        # Data - use original source for scanning
+        lf = pl.scan_iceberg(source, **kwargs)
 
         return lf, serialized_rules, serialized_schema
 
@@ -168,8 +177,18 @@ def _raise_on_lazy_write() -> None:
     raise NotImplementedError("Lazy writes are not currently supported for iceberg.")
 
 
-def _read_serialized_schema(table: IcebergTable) -> SerializedSchema | None:
+def _read_serialized_schema(table: IcebergTable | str) -> SerializedSchema | None:
     """Read schema metadata from Iceberg table properties."""
+    if isinstance(table, str):
+        # For file paths, we need to load the table to access properties
+        try:
+            from pyiceberg.catalog import load_catalog
+            catalog = load_catalog("default")
+            table = catalog.load_table(table)
+        except Exception:
+            # If we can't load the table, return None
+            return None
+    
     try:
         return table.properties.get(SCHEMA_METADATA_KEY, None)
     except AttributeError:
@@ -177,9 +196,17 @@ def _read_serialized_schema(table: IcebergTable) -> SerializedSchema | None:
 
 
 def _read_serialized_collection(
-    table: IcebergTable,
+    table: IcebergTable | str,
 ) -> SerializedCollection | None:
     """Read collection metadata from Iceberg table properties."""
+    if isinstance(table, str):
+        try:
+            from pyiceberg.catalog import load_catalog
+            catalog = load_catalog("default")
+            table = catalog.load_table(table)
+        except Exception:
+            return None
+    
     try:
         return table.properties.get(COLLECTION_METADATA_KEY, None)
     except AttributeError:
@@ -187,9 +214,17 @@ def _read_serialized_collection(
 
 
 def _read_serialized_rules(
-    table: IcebergTable,
+    table: IcebergTable | str,
 ) -> SerializedRules | None:
     """Read rules metadata from Iceberg table properties."""
+    if isinstance(table, str):
+        try:
+            from pyiceberg.catalog import load_catalog
+            catalog = load_catalog("default")
+            table = catalog.load_table(table)
+        except Exception:
+            return None
+    
     try:
         return table.properties.get(RULE_METADATA_KEY, None)
     except AttributeError:
@@ -198,34 +233,40 @@ def _read_serialized_rules(
 
 def _to_iceberg_table(
     table: Path | str | IcebergTable,
-) -> IcebergTable:
-    """Convert path or string to IcebergTable object."""
-    from pyiceberg.catalog import load_catalog
-
+) -> IcebergTable | str:
+    """Convert to appropriate type for Iceberg operations.
+    
+    Returns either an IcebergTable object if one is passed,
+    or a string/Path for file-based operations.
+    """
     match table:
         case IcebergTable():
             return table
         case str() | Path():
-            # For string/path, we need to load the table
-            # This assumes a local file catalog for simplicity
-            # In production, users would pass proper catalog configuration
-            try:
-                # Try to load as a table path directly
-                catalog = load_catalog("default")
-                return catalog.load_table(str(table))
-            except Exception:
-                # If that fails, try loading from the table path as a namespace/table
-                # This is a simplified approach; real usage would need proper catalog config
-                raise ValueError(
-                    f"Cannot load Iceberg table from {table}. "
-                    "Please pass an IcebergTable object or configure your catalog properly."
-                )
+            # For paths, return as-is for polars to handle
+            # polars can work with file paths directly
+            return str(table)
         case _:
             raise TypeError(f"Unsupported type {table!r}")
 
 
-def _update_table_properties(table: IcebergTable, properties: dict[str, str]) -> None:
+def _update_table_properties(table: IcebergTable | str, properties: dict[str, str]) -> None:
     """Update Iceberg table properties with metadata."""
+    if isinstance(table, str):
+        try:
+            from pyiceberg.catalog import load_catalog
+            catalog = load_catalog("default")
+            table = catalog.load_table(table)
+        except Exception:
+            import warnings
+            warnings.warn(
+                "Could not load Iceberg table to update properties. "
+                "Reading back with schema validation may not work correctly.",
+                UserWarning,
+                stacklevel=2,
+            )
+            return
+    
     try:
         # Update table properties using pyiceberg API
         # This requires a transaction