Skip to content

Commit b7f3c49

Browse files
author
Ritesh Chandra
committed
docs: add docstrings to InspectTable public methods
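Closes #1191.

Added Google-style docstrings to all public methods of InspectTable in pyiceberg/table/inspect.py:
- __init__: parameters and ModuleNotFoundError
- snapshots, entries, refs, partitions: full schema descriptions + examples
- manifests, metadata_log_entries, history: schema + examples
- files, data_files, delete_files: content column semantics + examples
- all_manifests, all_files, all_data_files, all_delete_files: cross-snapshot de-duplication behaviour documented

The private helper _get_snapshot also received a brief docstring for maintainability; _get_files_from_manifest and _generate_manifests_table are not touched by this diff.

Review note: the 8 deletions in this diff are the function-local imports at the top of entries, partitions and metadata_log_entries (import pyarrow as pa, schema_to_pyarrow, DataScan, MetadataLogEntry). The other methods keep their local `import pyarrow as pa` after the docstring, so these imports should be restored below the new docstrings rather than replaced by them.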
1 parent b8ffdd3 commit b7f3c49

1 file changed

Lines changed: 273 additions & 8 deletions

File tree

pyiceberg/table/inspect.py

Lines changed: 273 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,15 @@ class InspectTable:
4646
tbl: Table
4747

4848
def __init__(self, tbl: Table) -> None:
49+
"""Initialize the InspectTable helper.
50+
51+
Args:
52+
tbl: The Iceberg :class:`~pyiceberg.table.Table` whose metadata
53+
will be inspected.
54+
55+
Raises:
56+
ModuleNotFoundError: If PyArrow is not installed.
57+
"""
4958
self.tbl = tbl
5059

5160
try:
@@ -54,6 +63,19 @@ def __init__(self, tbl: Table) -> None:
5463
raise ModuleNotFoundError("For metadata operations PyArrow needs to be installed") from e
5564

5665
def _get_snapshot(self, snapshot_id: int | None = None) -> Snapshot:
66+
"""Return the snapshot identified by *snapshot_id*, or the current snapshot.
67+
68+
Args:
69+
snapshot_id: The snapshot ID to look up. When ``None`` the current
70+
table snapshot is returned.
71+
72+
Returns:
73+
The requested :class:`~pyiceberg.table.snapshots.Snapshot`.
74+
75+
Raises:
76+
ValueError: If *snapshot_id* does not exist, or if the table has no
77+
snapshots at all.
78+
"""
5779
if snapshot_id is not None:
5880
if snapshot := self.tbl.metadata.snapshot_by_id(snapshot_id):
5981
return snapshot
@@ -66,6 +88,26 @@ def _get_snapshot(self, snapshot_id: int | None = None) -> Snapshot:
6688
raise ValueError("Cannot get a snapshot as the table does not have any.")
6789

6890
def snapshots(self) -> pa.Table:
91+
"""Return all snapshots of the table as a PyArrow table.
92+
93+
Each row represents one snapshot and includes the commit timestamp,
94+
snapshot ID, parent snapshot ID, the write operation that produced the
95+
snapshot (e.g. ``append``, ``overwrite``, ``delete``), the path to the
96+
manifest list file, and a map of additional summary properties.
97+
98+
Returns:
99+
A :class:`pyarrow.Table` with schema::
100+
101+
committed_at timestamp[ms, tz=UTC] not null
102+
snapshot_id int64 not null
103+
parent_id int64 nullable
104+
operation string nullable
105+
manifest_list string not null
106+
summary map<string, string> nullable
107+
108+
Example:
109+
>>> tbl.inspect.snapshots().to_pandas()
110+
"""
69111
import pyarrow as pa
70112

71113
snapshots_schema = pa.schema(
@@ -104,9 +146,33 @@ def snapshots(self) -> pa.Table:
104146
)
105147

106148
def entries(self, snapshot_id: int | None = None) -> pa.Table:
107-
import pyarrow as pa
149+
"""Return all manifest entries for a snapshot as a PyArrow table.
108150
109-
from pyiceberg.io.pyarrow import schema_to_pyarrow
151+
Each row represents one manifest entry (a single data or delete file
152+
tracked within a manifest). Readable per-column metrics (lower/upper
153+
bounds, null counts, etc.) are decoded from their binary representation
154+
into native Python types.
155+
156+
Args:
157+
snapshot_id: The snapshot to inspect. Defaults to the current
158+
snapshot when ``None``.
159+
160+
Returns:
161+
A :class:`pyarrow.Table` with schema::
162+
163+
status int8 not null (0=EXISTING, 1=ADDED, 2=DELETED)
164+
snapshot_id int64 not null
165+
sequence_number int64 not null
166+
file_sequence_number int64 not null
167+
data_file struct not null (content, file_path, format, partition, metrics…)
168+
readable_metrics struct nullable (one sub-struct per column)
169+
170+
Raises:
171+
ValueError: If *snapshot_id* does not exist or the table has no snapshots.
172+
173+
Example:
174+
>>> tbl.inspect.entries().to_pandas()
175+
"""
110176

111177
schema = self.tbl.metadata.schema()
112178

@@ -231,6 +297,21 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
231297
)
232298

233299
def refs(self) -> pa.Table:
300+
"""Return all named references (branches and tags) of the table.
301+
302+
Returns:
303+
A :class:`pyarrow.Table` with schema::
304+
305+
name string not null
306+
type dictionary<int32,str> not null ("branch" or "tag")
307+
snapshot_id int64 not null
308+
max_reference_age_in_ms int64 nullable
309+
min_snapshots_to_keep int32 nullable
310+
max_snapshot_age_in_ms int64 nullable
311+
312+
Example:
313+
>>> tbl.inspect.refs().to_pandas()
314+
"""
234315
import pyarrow as pa
235316

236317
ref_schema = pa.schema(
@@ -266,10 +347,38 @@ def partitions(
266347
row_filter: str | BooleanExpression = ALWAYS_TRUE,
267348
case_sensitive: bool = True,
268349
) -> pa.Table:
269-
import pyarrow as pa
270-
271-
from pyiceberg.io.pyarrow import schema_to_pyarrow
272-
from pyiceberg.table import DataScan
350+
"""Return partition-level statistics for a snapshot.
351+
352+
Aggregates record counts, file counts, total data file sizes and delete
353+
file information per distinct partition value. When a *row_filter* is
354+
supplied only the partitions that could contain matching rows are
355+
included.
356+
357+
Args:
358+
snapshot_id: The snapshot to inspect. Defaults to the current
359+
snapshot when ``None``.
360+
row_filter: A predicate (as a string expression or
361+
:class:`~pyiceberg.expressions.BooleanExpression`) used to
362+
prune which partitions are returned. Defaults to
363+
:data:`ALWAYS_TRUE` (all partitions).
364+
case_sensitive: Whether column name matching in *row_filter* is
365+
case-sensitive. Defaults to ``True``.
366+
367+
Returns:
368+
A :class:`pyarrow.Table` with one row per distinct partition.
369+
The schema includes ``partition``, ``spec_id``, ``record_count``,
370+
``file_count``, ``total_data_file_size_in_bytes``,
371+
``position_delete_record_count``, ``position_delete_file_count``,
372+
``equality_delete_record_count``, ``equality_delete_file_count``,
373+
``last_updated_at``, and ``last_updated_snapshot_id``.
374+
375+
Raises:
376+
ValueError: If *snapshot_id* does not exist or the table has no
377+
snapshots.
378+
379+
Example:
380+
>>> tbl.inspect.partitions().to_pandas()
381+
"""
273382

274383
table_schema = pa.schema(
275384
[
@@ -476,12 +585,41 @@ def _partition_summaries_to_rows(
476585
)
477586

478587
def manifests(self) -> pa.Table:
588+
"""Return the manifest files for the current snapshot.
589+
590+
Each row describes one manifest file referenced by the current snapshot,
591+
including file counts (added, existing, deleted) for both data files
592+
and delete files, as well as per-partition bound summaries.
593+
594+
Returns:
595+
A :class:`pyarrow.Table` — see :meth:`_get_manifests_schema` for
596+
the full column list.
597+
598+
Example:
599+
>>> tbl.inspect.manifests().to_pandas()
600+
"""
479601
return self._generate_manifests_table(self.tbl.current_snapshot())
480602

481603
def metadata_log_entries(self) -> pa.Table:
482-
import pyarrow as pa
604+
"""Return the metadata log for the table.
605+
606+
The metadata log records every metadata file that has been the current
607+
metadata for the table, along with the snapshot that was current when
608+
each metadata file was written. The most recent entry corresponds to
609+
the current metadata location.
610+
611+
Returns:
612+
A :class:`pyarrow.Table` with schema::
613+
614+
timestamp timestamp[ms] not null
615+
file string not null
616+
latest_snapshot_id int64 nullable
617+
latest_schema_id int32 nullable
618+
latest_sequence_number int64 nullable
483619
484-
from pyiceberg.table.snapshots import MetadataLogEntry
620+
Example:
621+
>>> tbl.inspect.metadata_log_entries().to_pandas()
622+
"""
485623

486624
table_schema = pa.schema(
487625
[
@@ -515,6 +653,25 @@ def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> dict[str, Any
515653
)
516654

517655
def history(self) -> pa.Table:
656+
"""Return the snapshot history of the table.
657+
658+
Each row in the result corresponds to a snapshot-log entry, i.e. a
659+
point in time when a snapshot became the current snapshot of the table.
660+
The ``is_current_ancestor`` column is ``True`` when the snapshot is on
661+
the ancestry chain of the current snapshot (useful for detecting
662+
expired or replaced snapshots after branch operations).
663+
664+
Returns:
665+
A :class:`pyarrow.Table` with schema::
666+
667+
made_current_at timestamp[ms, tz=UTC] not null
668+
snapshot_id int64 not null
669+
parent_id int64 nullable
670+
is_current_ancestor bool not null
671+
672+
Example:
673+
>>> tbl.inspect.history().to_pandas()
674+
"""
518675
import pyarrow as pa
519676

520677
history_schema = pa.schema(
@@ -678,15 +835,88 @@ def _files(self, snapshot_id: int | None = None, data_file_filter: set[DataFileC
678835
return pa.concat_tables(results)
679836

680837
def files(self, snapshot_id: int | None = None) -> pa.Table:
838+
"""Return all data and delete files tracked by a snapshot.
839+
840+
Args:
841+
snapshot_id: The snapshot to inspect. Defaults to the current
842+
snapshot when ``None``.
843+
844+
Returns:
845+
A :class:`pyarrow.Table` with one row per file — see
846+
:meth:`_get_files_schema` for the full schema. The ``content``
847+
column distinguishes data files (``0``) from position-delete files
848+
(``1``) and equality-delete files (``2``).
849+
850+
Raises:
851+
ValueError: If *snapshot_id* does not exist or the table has no
852+
snapshots.
853+
854+
Example:
855+
>>> tbl.inspect.files().to_pandas()
856+
"""
681857
return self._files(snapshot_id)
682858

683859
def data_files(self, snapshot_id: int | None = None) -> pa.Table:
860+
"""Return only data files tracked by a snapshot.
861+
862+
Delegates to :meth:`_files` with a content filter, keeping only rows where
863+
``content == 0`` (``DATA``).
864+
865+
Args:
866+
snapshot_id: The snapshot to inspect. Defaults to the current
867+
snapshot when ``None``.
868+
869+
Returns:
870+
A :class:`pyarrow.Table` — same schema as :meth:`files`.
871+
872+
Raises:
873+
ValueError: If *snapshot_id* does not exist or the table has no
874+
snapshots.
875+
876+
Example:
877+
>>> tbl.inspect.data_files().to_pandas()
878+
"""
684879
return self._files(snapshot_id, {DataFileContent.DATA})
685880

686881
def delete_files(self, snapshot_id: int | None = None) -> pa.Table:
882+
"""Return only delete files (position and equality) tracked by a snapshot.
883+
884+
Delegates to :meth:`_files` with a content filter, keeping only rows where
885+
``content`` is ``1`` (``POSITION_DELETES``) or ``2``
886+
(``EQUALITY_DELETES``).
887+
888+
Args:
889+
snapshot_id: The snapshot to inspect. Defaults to the current
890+
snapshot when ``None``.
891+
892+
Returns:
893+
A :class:`pyarrow.Table` — same schema as :meth:`files`.
894+
895+
Raises:
896+
ValueError: If *snapshot_id* does not exist or the table has no
897+
snapshots.
898+
899+
Example:
900+
>>> tbl.inspect.delete_files().to_pandas()
901+
"""
687902
return self._files(snapshot_id, {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES})
688903

689904
def all_manifests(self) -> pa.Table:
905+
"""Return manifest files across *all* snapshots of the table.
906+
907+
Unlike :meth:`manifests`, which only covers the current snapshot, this
908+
method collects manifests from every snapshot and adds a
909+
``reference_snapshot_id`` column so that each row can be traced back to
910+
the snapshot it belongs to.
911+
912+
Returns:
913+
A :class:`pyarrow.Table` with the same columns as :meth:`manifests`
914+
plus ``reference_snapshot_id int64 not null``. Returns an empty
915+
table when the table has no snapshots.
916+
917+
Example:
918+
>>> tbl.inspect.all_manifests().to_pandas()
919+
"""
690920
import pyarrow as pa
691921

692922
snapshots = self.tbl.snapshots()
@@ -718,10 +948,45 @@ def _all_files(self, data_file_filter: set[DataFileContent] | None = None) -> pa
718948
return pa.concat_tables(file_lists)
719949

720950
def all_files(self) -> pa.Table:
951+
"""Return all files (data and delete) across *all* snapshots.
952+
953+
De-duplicates manifests by path so that files shared between snapshots
954+
are only emitted once. Use :meth:`files` when you only need the
955+
current snapshot.
956+
957+
Returns:
958+
A :class:`pyarrow.Table` — same schema as :meth:`files`. Returns
959+
an empty table when the table has no snapshots.
960+
961+
Example:
962+
>>> tbl.inspect.all_files().to_pandas()
963+
"""
721964
return self._all_files()
722965

723966
def all_data_files(self) -> pa.Table:
967+
"""Return all data files across *all* snapshots.
968+
969+
Delegates to :meth:`_all_files` with a content filter, keeping data files
970+
only (``content == 0``).
971+
972+
Returns:
973+
A :class:`pyarrow.Table` — same schema as :meth:`files`.
974+
975+
Example:
976+
>>> tbl.inspect.all_data_files().to_pandas()
977+
"""
724978
return self._all_files({DataFileContent.DATA})
725979

726980
def all_delete_files(self) -> pa.Table:
981+
"""Return all delete files (position and equality) across *all* snapshots.
982+
983+
Delegates to :meth:`_all_files` with a content filter, keeping delete
984+
files only (``content`` is ``1`` or ``2``).
985+
986+
Returns:
987+
A :class:`pyarrow.Table` — same schema as :meth:`files`.
988+
989+
Example:
990+
>>> tbl.inspect.all_delete_files().to_pandas()
991+
"""
727992
return self._all_files({DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES})

0 commit comments

Comments
 (0)