2525from typing import (
2626 TYPE_CHECKING ,
2727 Any ,
28- AsyncIterator ,
2928 Iterable ,
30- Iterator ,
3129 Literal ,
3230 Optional ,
3331 Union ,
4442from datafusion ._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
4543from datafusion .expr import Expr , SortExpr , sort_or_default
4644from datafusion .plan import ExecutionPlan , LogicalPlan
47- from datafusion .record_batch import RecordBatch , RecordBatchStream
45+ from datafusion .record_batch import RecordBatchStream
4846
4947if TYPE_CHECKING :
5048 import pathlib
@@ -291,9 +289,6 @@ def __init__(
291289class DataFrame :
292290 """Two dimensional table representation of data.
293291
294- DataFrame objects are iterable; iterating over a DataFrame yields
295- :class:`pyarrow.RecordBatch` instances lazily.
296-
297292 See :ref:`user_guide_concepts` in the online documentation for more information.
298293 """
299294
@@ -310,7 +305,7 @@ def into_view(self) -> pa.Table:
310305 return self .df .into_view ()
311306
312307 def __getitem__ (self , key : str | list [str ]) -> DataFrame :
313- """Return a new :py:class: `DataFrame` with the specified column or columns.
        """Return a new :py:class:`DataFrame` with the specified column or columns.
314309
315310 Args:
316311 key: Column name or list of column names to select.
@@ -1031,10 +1026,6 @@ def execute_stream(self) -> RecordBatchStream:
10311026 """
10321027 return RecordBatchStream (self .df .execute_stream ())
10331028
1034- def to_record_batch_stream (self ) -> RecordBatchStream :
1035- """Return a :class:`RecordBatchStream` executing this DataFrame."""
1036- return self .execute_stream ()
1037-
10381029 def execute_stream_partitioned (self ) -> list [RecordBatchStream ]:
10391030 """Executes this DataFrame and returns a stream for each partition.
10401031
@@ -1044,15 +1035,6 @@ def execute_stream_partitioned(self) -> list[RecordBatchStream]:
10441035 streams = self .df .execute_stream_partitioned ()
10451036 return [RecordBatchStream (rbs ) for rbs in streams ]
10461037
1047- def to_record_batch_stream (self ) -> RecordBatchStream :
1048- """Return a :py:class:`RecordBatchStream` over this DataFrame's results.
1049-
1050- Returns:
1051- A ``RecordBatchStream`` representing the lazily generated record
1052- batches for this DataFrame.
1053- """
1054- return self .execute_stream ()
1055-
10561038 def to_pandas (self ) -> pd .DataFrame :
10571039 """Execute the :py:class:`DataFrame` and convert it into a Pandas DataFrame.
10581040
@@ -1116,33 +1098,21 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram
11161098 return DataFrame (self .df .unnest_columns (columns , preserve_nulls = preserve_nulls ))
11171099
11181100 def __arrow_c_stream__ (self , requested_schema : object | None = None ) -> object :
1119- """Export the DataFrame as an Arrow C Stream.
1101+ """Export an Arrow PyCapsule Stream.
11201102
1121- The DataFrame is executed using DataFusion's streaming APIs and exposed via
1122- Arrow's C Stream interface. Record batches are produced incrementally, so the
1123- full result set is never materialized in memory. When ``requested_schema`` is
1124- provided, only straightforward projections such as column selection or
1125- reordering are applied.
1103+ This will execute and collect the DataFrame. We will attempt to respect the
1104+ requested schema, but only trivial transformations will be applied such as only
1105+ returning the fields listed in the requested schema if their data types match
1106+ those in the DataFrame.
11261107
11271108 Args:
11281109 requested_schema: Attempt to provide the DataFrame using this schema.
11291110
11301111 Returns:
1131- Arrow PyCapsule object representing an ``ArrowArrayStream`` .
1112+ Arrow PyCapsule object.
11321113 """
1133- # ``DataFrame.__arrow_c_stream__`` in the Rust extension leverages
1134- # ``execute_stream_partitioned`` under the hood to stream batches while
1135- # preserving the original partition order.
11361114 return self .df .__arrow_c_stream__ (requested_schema )
11371115
1138- def __iter__ (self ) -> Iterator [RecordBatch ]:
1139- """Return an iterator over this DataFrame's record batches."""
1140- return iter (self .to_record_batch_stream ())
1141-
1142- def __aiter__ (self ) -> AsyncIterator [RecordBatch ]:
1143- """Return an async iterator over this DataFrame's record batches."""
1144- return self .to_record_batch_stream ().__aiter__ ()
1145-
11461116 def transform (self , func : Callable [..., DataFrame ], * args : Any ) -> DataFrame :
11471117 """Apply a function to the current DataFrame which returns another DataFrame.
11481118
0 commit comments