apache · jayceslesar · Apr 29, 2025 · Apr 29, 2025 · Apr 29, 2025 · May 2, 2025
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
@@ -17,6 +17,7 @@
 from __future__ import annotations
 
 import itertools
+import logging
 import os
 import uuid
 import warnings
@@ -31,6 +32,7 @@
     Callable,
     Dict,
     Iterable,
+    Iterator,
     List,
     Optional,
     Set,
@@ -62,7 +64,7 @@
     inclusive_projection,
     manifest_evaluator,
 )
-from pyiceberg.io import FileIO, load_file_io
+from pyiceberg.io import FileIO, _parse_location, load_file_io
 from pyiceberg.manifest import (
     POSITIONAL_DELETE_SCHEMA,
     DataFile,
@@ -150,6 +152,8 @@
 
     from pyiceberg.catalog import Catalog
 
+logger = logging.getLogger(__name__)
+
 ALWAYS_TRUE = AlwaysTrue()
 DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE = "downcast-ns-timestamp-to-us-on-write"
 
@@ -1371,6 +1375,70 @@ def to_polars(self) -> pl.LazyFrame:
 
         return pl.scan_iceberg(self)
 
+    def delete_orphaned_files(self) -> None:
+        """Delete files under the table location that the table metadata does not reference.
+
+        A file is treated as referenced when it is one of:
+
+        - the current table metadata file or one listed in the metadata log,
+        - a statistics file registered in the table metadata,
+        - the manifest list or a manifest of any snapshot,
+        - a file that ``inspect.files`` reports for any snapshot.
+
+        Every other regular file found by recursively listing ``self.location()`` is deleted.
+
+        Warning:
+            Files written by a commit that is still in flight are not referenced by any
+            snapshot yet and are removed as well, so do not run this concurrently with
+            writers.
+
+        Raises:
+            ModuleNotFoundError: If PyArrow is not installed.
+        """
+        try:
+            from pyarrow.fs import FileSelector, FileType
+        except ModuleNotFoundError as e:
+            raise ModuleNotFoundError("For deleting orphaned files PyArrow needs to be installed") from e
+
+        from pyiceberg.io.pyarrow import _fs_from_file_path
+
+        location = self.location()
+        snapshots = self.snapshots()
+
+        # Metadata-layer files must be kept too: without them the table can no longer be loaded.
+        known_files: Set[str] = {self.metadata_location}
+        known_files.update(entry.metadata_file for entry in self.metadata.metadata_log)
+        known_files.update(statistics.statistics_path for statistics in self.metadata.statistics)
+        known_files.update(snapshot.manifest_list for snapshot in snapshots if snapshot.manifest_list)
+        known_files.update(self.inspect.all_manifests(snapshots)["path"].to_pylist())
+
+        # Fold the per-snapshot paths into the set one by one; concatenating the per-snapshot
+        # tables would raise for a table that has no snapshots yet.
+        executor = ExecutorFactory.get_or_create()
+        paths_by_snapshot: Iterator[List[str]] = executor.map(
+            lambda snapshot: self.inspect.files(snapshot.snapshot_id)["file_path"].to_pylist(), snapshots
+        )
+        for paths in paths_by_snapshot:
+            known_files.update(paths)
+
+        fs = _fs_from_file_path(self.io, location)
+        _, _, root = _parse_location(location)
+
+        # The filesystem reports paths without the URI scheme, so bring the known files into
+        # that same form before comparing; otherwise every file under e.g. ``s3://`` looks orphaned.
+        known_paths = {_parse_location(known_file)[2] for known_file in known_files}
+
+        # The recursive listing also yields directories; only regular files are candidates.
+        listed = fs.get_file_info(FileSelector(root, recursive=True))
+        orphaned_files = {info.path for info in listed if info.type == FileType.File} - known_paths
+        logger.info("Found %d orphaned files at %s!", len(orphaned_files), location)
+
+        if orphaned_files:
+            # Delete through the filesystem that produced the listing so the paths resolve
+            # unchanged; exhaust the lazy map so every delete runs and errors surface here.
+            list(executor.map(fs.delete_file, orphaned_files))
+            logger.info("Deleted %d orphaned files at %s!", len(orphaned_files), location)
+
 
 class StaticTable(Table):
     """Load a table directly from a metadata file (i.e., without using a catalog)."""

diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py
@@ -645,10 +645,10 @@ def data_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
     def delete_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
         return self._files(snapshot_id, {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES})
 
-    def all_manifests(self) -> "pa.Table":
+    def all_manifests(self, snapshots: Optional[list[Snapshot]] = None) -> "pa.Table":
         import pyarrow as pa
 
-        snapshots = self.tbl.snapshots()
+        snapshots = snapshots or self.tbl.snapshots()
         if not snapshots:
             return pa.Table.from_pylist([], schema=self._get_all_manifests_schema())
 

diff --git a/tests/table/test_delete_orphans.py b/tests/table/test_delete_orphans.py
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from pathlib import Path, PosixPath
+
+import pyarrow as pa
+import pytest
+
+from pyiceberg.catalog import Catalog
+from pyiceberg.schema import Schema
+from pyiceberg.types import IntegerType, NestedField, StringType
+from tests.catalog.test_base import InMemoryCatalog
+
+
+@pytest.fixture
+def catalog(tmp_path: PosixPath) -> InMemoryCatalog:
+    """Provide an in-memory catalog with a ``default`` namespace, warehoused under pytest's ``tmp_path``."""
+    warehouse = tmp_path.absolute().as_posix()
+    in_memory_catalog = InMemoryCatalog("test.in_memory.catalog", warehouse=warehouse)
+    in_memory_catalog.create_namespace("default")
+    return in_memory_catalog
+
+
+def test_delete_orphaned_files(catalog: Catalog) -> None:
+    """A stray file placed under the table location is gone after ``delete_orphaned_files``."""
+    iceberg_schema = Schema(
+        NestedField(1, "city", StringType(), required=True),
+        NestedField(2, "inhabitants", IntegerType(), required=True),
+        # City is the identifier (primary-key) field
+        identifier_field_ids=[1],
+    )
+    table = catalog.create_table("default.test_delete_orphaned_files", schema=iceberg_schema)
+
+    # Write one snapshot so the table has real data and manifest files next to the stray file.
+    row = {"city": "Drachten", "inhabitants": 45019}
+    arrow_fields = [
+        pa.field("city", pa.string(), nullable=False),
+        pa.field("inhabitants", pa.int32(), nullable=False),
+    ]
+    table.append(pa.Table.from_pylist([row, row], schema=pa.schema(arrow_fields)))
+
+    stray = Path(table.location()) / "orphan.txt"
+    stray.touch()
+    assert stray.exists()
+
+    table.delete_orphaned_files()
+    assert not stray.exists()