@@ -46,6 +46,15 @@ class InspectTable:
4646 tbl : Table
4747
4848 def __init__ (self , tbl : Table ) -> None :
49+ """Initialize the InspectTable helper.
50+
51+ Args:
52+ tbl: The Iceberg :class:`~pyiceberg.table.Table` whose metadata
53+ will be inspected.
54+
55+ Raises:
56+ ModuleNotFoundError: If PyArrow is not installed.
57+ """
4958 self .tbl = tbl
5059
5160 try :
@@ -54,6 +63,19 @@ def __init__(self, tbl: Table) -> None:
5463 raise ModuleNotFoundError ("For metadata operations PyArrow needs to be installed" ) from e
5564
5665 def _get_snapshot (self , snapshot_id : int | None = None ) -> Snapshot :
66+ """Return the snapshot identified by *snapshot_id*, or the current snapshot.
67+
68+ Args:
69+ snapshot_id: The snapshot ID to look up. When ``None`` the current
70+ table snapshot is returned.
71+
72+ Returns:
73+ The requested :class:`~pyiceberg.table.snapshots.Snapshot`.
74+
75+ Raises:
76+ ValueError: If *snapshot_id* does not exist, or if the table has no
77+ snapshots at all.
78+ """
5779 if snapshot_id is not None :
5880 if snapshot := self .tbl .metadata .snapshot_by_id (snapshot_id ):
5981 return snapshot
@@ -66,6 +88,26 @@ def _get_snapshot(self, snapshot_id: int | None = None) -> Snapshot:
6688 raise ValueError ("Cannot get a snapshot as the table does not have any." )
6789
6890 def snapshots (self ) -> pa .Table :
91+ """Return all snapshots of the table as a PyArrow table.
92+
93+ Each row represents one snapshot and includes the commit timestamp,
94+ snapshot ID, parent snapshot ID, the write operation that produced the
95+ snapshot (e.g. ``append``, ``overwrite``, ``delete``), the path to the
96+ manifest list file, and a map of additional summary properties.
97+
98+ Returns:
99+ A :class:`pyarrow.Table` with schema::
100+
101+ committed_at timestamp[ms, tz=UTC] not null
102+ snapshot_id int64 not null
103+ parent_id int64 nullable
104+ operation string nullable
105+ manifest_list string not null
106+ summary map<string, string> nullable
107+
108+ Example:
109+ >>> tbl.inspect.snapshots().to_pandas()
110+ """
69111 import pyarrow as pa
70112
71113 snapshots_schema = pa .schema (
@@ -104,9 +146,33 @@ def snapshots(self) -> pa.Table:
104146 )
105147
def entries(self, snapshot_id: int | None = None) -> pa.Table:
    """Return all manifest entries for a snapshot as a PyArrow table.

    Each row represents one manifest entry (a single data or delete file
    tracked within a manifest). Readable per-column metrics (lower/upper
    bounds, null counts, etc.) are decoded from their binary representation
    into native Python types.

    Args:
        snapshot_id: The snapshot to inspect. Defaults to the current
            snapshot when ``None``.

    Returns:
        A :class:`pyarrow.Table` with schema::

            status                int8    not null  (0=EXISTING, 1=ADDED, 2=DELETED)
            snapshot_id           int64   not null
            sequence_number       int64   not null
            file_sequence_number  int64   not null
            data_file             struct  not null  (content, file_path, format, partition, metrics…)
            readable_metrics      struct  nullable  (one sub-struct per column)

    Raises:
        ValueError: If *snapshot_id* does not exist or the table has no snapshots.

    Example:
        >>> tbl.inspect.entries().to_pandas()
    """
    # These imports must stay inside the method: PyArrow is an optional
    # dependency that the sibling methods of this class also import lazily,
    # so dropping them would leave `pa` / `schema_to_pyarrow` unbound at
    # call time instead of at import time.
    import pyarrow as pa

    from pyiceberg.io.pyarrow import schema_to_pyarrow

    schema = self.tbl.metadata.schema()
112178
@@ -231,6 +297,21 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
231297 )
232298
233299 def refs (self ) -> pa .Table :
300+ """Return all named references (branches and tags) of the table.
301+
302+ Returns:
303+ A :class:`pyarrow.Table` with schema::
304+
305+ name string not null
306+ type dictionary<int32,str> not null ("branch" or "tag")
307+ snapshot_id int64 not null
308+ max_reference_age_in_ms int64 nullable
309+ min_snapshots_to_keep int32 nullable
310+ max_snapshot_age_in_ms int64 nullable
311+
312+ Example:
313+ >>> tbl.inspect.refs().to_pandas()
314+ """
234315 import pyarrow as pa
235316
236317 ref_schema = pa .schema (
@@ -266,10 +347,38 @@ def partitions(
266347 row_filter : str | BooleanExpression = ALWAYS_TRUE ,
267348 case_sensitive : bool = True ,
268349 ) -> pa .Table :
269- import pyarrow as pa
270-
271- from pyiceberg .io .pyarrow import schema_to_pyarrow
272- from pyiceberg .table import DataScan
350+ """Return partition-level statistics for a snapshot.
351+
352+ Aggregates record counts, file counts, total data file sizes and delete
353+ file information per distinct partition value. When a *row_filter* is
354+ supplied only the partitions that could contain matching rows are
355+ included.
356+
357+ Args:
358+ snapshot_id: The snapshot to inspect. Defaults to the current
359+ snapshot when ``None``.
360+ row_filter: A predicate (as a string expression or
361+ :class:`~pyiceberg.expressions.BooleanExpression`) used to
362+ prune which partitions are returned. Defaults to
363+ :data:`ALWAYS_TRUE` (all partitions).
364+ case_sensitive: Whether column name matching in *row_filter* is
365+ case-sensitive. Defaults to ``True``.
366+
367+ Returns:
368+ A :class:`pyarrow.Table` with one row per distinct partition.
369+ The schema includes ``partition``, ``spec_id``, ``record_count``,
370+ ``file_count``, ``total_data_file_size_in_bytes``,
371+ ``position_delete_record_count``, ``position_delete_file_count``,
372+ ``equality_delete_record_count``, ``equality_delete_file_count``,
373+ ``last_updated_at``, and ``last_updated_snapshot_id``.
374+
375+ Raises:
376+ ValueError: If *snapshot_id* does not exist or the table has no
377+ snapshots.
378+
379+ Example:
380+ >>> tbl.inspect.partitions().to_pandas()
381+ """
273382
274383 table_schema = pa .schema (
275384 [
@@ -476,12 +585,41 @@ def _partition_summaries_to_rows(
476585 )
477586
def manifests(self) -> pa.Table:
    """Build the manifests metadata table for the table's current snapshot.

    Fetches the current snapshot through ``self.tbl.current_snapshot()`` and
    hands it to :meth:`_generate_manifests_table`, which produces the
    result. Use :meth:`all_manifests` to cover every snapshot instead.

    Returns:
        The :class:`pyarrow.Table` built by
        :meth:`_generate_manifests_table`; the column layout is defined
        by that helper.

    Example:
        >>> tbl.inspect.manifests().to_pandas()
    """
    current = self.tbl.current_snapshot()
    return self._generate_manifests_table(current)
480602
def metadata_log_entries(self) -> pa.Table:
    """Return the metadata log for the table.

    The metadata log records every metadata file that has been the current
    metadata for the table, along with the snapshot that was current when
    each metadata file was written. The most recent entry corresponds to
    the current metadata location.

    Returns:
        A :class:`pyarrow.Table` with schema::

            timestamp               timestamp[ms]  not null
            file                    string         not null
            latest_snapshot_id      int64          nullable
            latest_schema_id        int32          nullable
            latest_sequence_number  int64          nullable

    Example:
        >>> tbl.inspect.metadata_log_entries().to_pandas()
    """
    # Keep these imports local to the method: the statement right below
    # builds the result schema with `pa.schema(...)`, and PyArrow is imported
    # lazily by the other methods of this class as well. Removing them
    # would turn every call into a NameError.
    import pyarrow as pa

    from pyiceberg.table.snapshots import MetadataLogEntry
485623
486624 table_schema = pa .schema (
487625 [
@@ -515,6 +653,25 @@ def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> dict[str, Any
515653 )
516654
517655 def history (self ) -> pa .Table :
656+ """Return the snapshot history of the table.
657+
658+ Each row in the result corresponds to a snapshot-log entry, i.e. a
659+ point in time when a snapshot became the current snapshot of the table.
660+ The ``is_current_ancestor`` column is ``True`` when the snapshot is on
661+ the ancestry chain of the current snapshot (useful for detecting
662+ expired or replaced snapshots after branch operations).
663+
664+ Returns:
665+ A :class:`pyarrow.Table` with schema::
666+
667+ made_current_at timestamp[ms, tz=UTC] not null
668+ snapshot_id int64 not null
669+ parent_id int64 nullable
670+ is_current_ancestor bool not null
671+
672+ Example:
673+ >>> tbl.inspect.history().to_pandas()
674+ """
518675 import pyarrow as pa
519676
520677 history_schema = pa .schema (
@@ -678,15 +835,88 @@ def _files(self, snapshot_id: int | None = None, data_file_filter: set[DataFileC
678835 return pa .concat_tables (results )
679836
def files(self, snapshot_id: int | None = None) -> pa.Table:
    """List every file (data and delete) tracked by a snapshot.

    Thin public entry point over :meth:`_files`, called without a content
    filter so that no file kind is excluded.

    Args:
        snapshot_id: Identifier of the snapshot to inspect, forwarded
            unchanged to :meth:`_files`. ``None`` (the default) leaves the
            choice of snapshot to that helper.

    Returns:
        The :class:`pyarrow.Table` produced by :meth:`_files`.

    Raises:
        Any exception raised by :meth:`_files` propagates unchanged.

    Example:
        >>> tbl.inspect.files().to_pandas()
    """
    unfiltered = self._files(snapshot_id)
    return unfiltered
682858
def data_files(self, snapshot_id: int | None = None) -> pa.Table:
    """List only the data files tracked by a snapshot.

    Delegates to :meth:`_files` with a content filter containing just
    ``DataFileContent.DATA``.

    Args:
        snapshot_id: Identifier of the snapshot to inspect, forwarded
            unchanged to :meth:`_files`. ``None`` (the default) leaves the
            choice of snapshot to that helper.

    Returns:
        The :class:`pyarrow.Table` produced by :meth:`_files` for the
        ``DATA`` content type.

    Raises:
        Any exception raised by :meth:`_files` propagates unchanged.

    Example:
        >>> tbl.inspect.data_files().to_pandas()
    """
    wanted_content = {DataFileContent.DATA}
    return self._files(snapshot_id, wanted_content)
685880
def delete_files(self, snapshot_id: int | None = None) -> pa.Table:
    """List only the delete files tracked by a snapshot.

    Delegates to :meth:`_files` with a content filter made of
    ``DataFileContent.POSITION_DELETES`` and
    ``DataFileContent.EQUALITY_DELETES``.

    Args:
        snapshot_id: Identifier of the snapshot to inspect, forwarded
            unchanged to :meth:`_files`. ``None`` (the default) leaves the
            choice of snapshot to that helper.

    Returns:
        The :class:`pyarrow.Table` produced by :meth:`_files` for the two
        delete content types.

    Raises:
        Any exception raised by :meth:`_files` propagates unchanged.

    Example:
        >>> tbl.inspect.delete_files().to_pandas()
    """
    delete_kinds = {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES}
    return self._files(snapshot_id, delete_kinds)
688903
689904 def all_manifests (self ) -> pa .Table :
905+ """Return manifest files across *all* snapshots of the table.
906+
907+ Unlike :meth:`manifests`, which only covers the current snapshot, this
908+ method collects manifests from every snapshot and adds a
909+ ``reference_snapshot_id`` column so that each row can be traced back to
910+ the snapshot it belongs to.
911+
912+ Returns:
913+ A :class:`pyarrow.Table` with the same columns as :meth:`manifests`
914+ plus ``reference_snapshot_id int64 not null``. Returns an empty
915+ table when the table has no snapshots.
916+
917+ Example:
918+ >>> tbl.inspect.all_manifests().to_pandas()
919+ """
690920 import pyarrow as pa
691921
692922 snapshots = self .tbl .snapshots ()
@@ -718,10 +948,45 @@ def _all_files(self, data_file_filter: set[DataFileContent] | None = None) -> pa
718948 return pa .concat_tables (file_lists )
719949
def all_files(self) -> pa.Table:
    """List files (data and delete) without restricting to one snapshot.

    Thin public entry point over :meth:`_all_files`, called without a
    content filter so that no file kind is excluded. Use :meth:`files` to
    inspect a single snapshot instead.

    Returns:
        The :class:`pyarrow.Table` produced by :meth:`_all_files`.

    Example:
        >>> tbl.inspect.all_files().to_pandas()
    """
    everything = self._all_files()
    return everything
722965
def all_data_files(self) -> pa.Table:
    """List data files without restricting to one snapshot.

    Delegates to :meth:`_all_files` with a content filter containing just
    ``DataFileContent.DATA``.

    Returns:
        The :class:`pyarrow.Table` produced by :meth:`_all_files` for the
        ``DATA`` content type.

    Example:
        >>> tbl.inspect.all_data_files().to_pandas()
    """
    wanted_content = {DataFileContent.DATA}
    return self._all_files(wanted_content)
725979
def all_delete_files(self) -> pa.Table:
    """List delete files without restricting to one snapshot.

    Delegates to :meth:`_all_files` with a content filter made of
    ``DataFileContent.POSITION_DELETES`` and
    ``DataFileContent.EQUALITY_DELETES``.

    Returns:
        The :class:`pyarrow.Table` produced by :meth:`_all_files` for the
        two delete content types.

    Example:
        >>> tbl.inspect.all_delete_files().to_pandas()
    """
    delete_kinds = {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES}
    return self._all_files(delete_kinds)
0 commit comments