Skip to content

Commit b45ac7f

Browse files
committed
Merge remote-tracking branch 'apache/main' into arrow-avro
2 parents bb0325f + f30b85c commit b45ac7f

105 files changed

Lines changed: 3614 additions & 666 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Cargo.lock

Lines changed: 19 additions & 43 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion-examples/examples/query_planning/expr_api.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,10 @@ fn simplify_demo() -> Result<()> {
175175
// the ExecutionProps carries information needed to simplify
176176
// expressions, such as the current time (to evaluate `now()`
177177
// correctly)
178-
let context = SimplifyContext::default()
178+
let context = SimplifyContext::builder()
179179
.with_schema(schema)
180-
.with_current_time();
180+
.with_current_time()
181+
.build();
181182
let simplifier = ExprSimplifier::new(context);
182183

183184
// And then call the simplify_expr function:
@@ -192,9 +193,10 @@ fn simplify_demo() -> Result<()> {
192193

193194
// here are some other examples of what DataFusion is capable of
194195
let schema = Schema::new(vec![make_field("i", DataType::Int64)]).to_dfschema_ref()?;
195-
let context = SimplifyContext::default()
196+
let context = SimplifyContext::builder()
196197
.with_schema(Arc::clone(&schema))
197-
.with_current_time();
198+
.with_current_time()
199+
.build();
198200
let simplifier = ExprSimplifier::new(context);
199201

200202
// basic arithmetic simplification
@@ -554,9 +556,10 @@ fn type_coercion_demo() -> Result<()> {
554556
assert!(physical_expr.evaluate(&batch).is_ok());
555557

556558
// 2. Type coercion with `ExprSimplifier::coerce`.
557-
let context = SimplifyContext::default()
559+
let context = SimplifyContext::builder()
558560
.with_schema(Arc::new(df_schema.clone()))
559-
.with_current_time();
561+
.with_current_time()
562+
.build();
560563
let simplifier = ExprSimplifier::new(context);
561564
let coerced_expr = simplifier.coerce(expr.clone(), &df_schema)?;
562565
let physical_expr = datafusion::physical_expr::create_physical_expr(

datafusion/catalog/src/table.rs

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,10 @@ pub trait TableProvider: Debug + Sync + Send {
8484
None
8585
}
8686

87-
/// Create an [`ExecutionPlan`] for scanning the table with optionally
88-
/// specified `projection`, `filter` and `limit`, described below.
87+
/// Create an [`ExecutionPlan`] for scanning the table with optional
88+
/// `projection`, `filter`, and `limit`, described below.
8989
///
90-
/// The `ExecutionPlan` is responsible scanning the datasource's
90+
/// The returned `ExecutionPlan` is responsible for scanning the datasource's
9191
/// partitions in a streaming, parallelized fashion.
9292
///
9393
/// # Projection
@@ -96,33 +96,30 @@ pub trait TableProvider: Debug + Sync + Send {
9696
/// specified. The projection is a set of indexes of the fields in
9797
/// [`Self::schema`].
9898
///
99-
/// DataFusion provides the projection to scan only the columns actually
100-
/// used in the query to improve performance, an optimization called
101-
/// "Projection Pushdown". Some datasources, such as Parquet, can use this
102-
/// information to go significantly faster when only a subset of columns is
103-
/// required.
99+
/// DataFusion provides the projection so the scan reads only the columns
100+
/// actually used in the query, an optimization called "Projection
101+
/// Pushdown". Some datasources, such as Parquet, can use this information
102+
/// to go significantly faster when only a subset of columns is required.
104103
///
105104
/// # Filters
106105
///
107106
/// A list of boolean filter [`Expr`]s to evaluate *during* the scan, in the
108107
/// manner specified by [`Self::supports_filters_pushdown`]. Only rows for
109-
/// which *all* of the `Expr`s evaluate to `true` must be returned (aka the
110-
/// expressions are `AND`ed together).
108+
/// which *all* of the `Expr`s evaluate to `true` must be returned (that is,
109+
/// the expressions are `AND`ed together).
111110
///
112-
/// To enable filter pushdown you must override
113-
/// [`Self::supports_filters_pushdown`] as the default implementation does
114-
/// not and `filters` will be empty.
111+
/// To enable filter pushdown, override
112+
/// [`Self::supports_filters_pushdown`]. The default implementation does not
113+
/// push down filters, and `filters` will be empty.
115114
///
116-
/// DataFusion pushes filtering into the scans whenever possible
117-
/// ("Filter Pushdown"), and depending on the format and the
118-
/// implementation of the format, evaluating the predicate during the scan
119-
/// can increase performance significantly.
115+
/// DataFusion pushes filters into scans whenever possible ("Filter
116+
/// Pushdown"). Depending on the data format and implementation, evaluating
117+
/// predicates during the scan can significantly improve performance.
120118
///
121119
/// ## Note: Some columns may appear *only* in Filters
122120
///
123-
/// In certain cases, a query may only use a certain column in a Filter that
124-
/// has been completely pushed down to the scan. In this case, the
125-
/// projection will not contain all the columns found in the filter
121+
/// In some cases, a query may use a column only in a filter and the
122+
/// projection will not contain all columns referenced by the filter
126123
/// expressions.
127124
///
128125
/// For example, given the query `SELECT t.a FROM t WHERE t.b > 5`,
@@ -154,15 +151,40 @@ pub trait TableProvider: Debug + Sync + Send {
154151
///
155152
/// # Limit
156153
///
157-
/// If `limit` is specified, must only produce *at least* this many rows,
158-
/// (though it may return more). Like Projection Pushdown and Filter
159-
/// Pushdown, DataFusion pushes `LIMIT`s as far down in the plan as
160-
/// possible, called "Limit Pushdown" as some sources can use this
161-
/// information to improve their performance. Note that if there are any
162-
/// Inexact filters pushed down, the LIMIT cannot be pushed down. This is
163-
/// because inexact filters do not guarantee that every filtered row is
164-
/// removed, so applying the limit could lead to too few rows being available
165-
/// to return as a final result.
154+
/// If `limit` is specified, the scan must produce *at least* this many
155+
/// rows, though it may return more. Like Projection Pushdown and Filter
156+
/// Pushdown, DataFusion pushes `LIMIT`s as far down in the plan as
157+
/// possible. This is called "Limit Pushdown", and some sources can use the
158+
/// information to improve performance.
159+
///
160+
/// Note: If any pushed-down filters are `Inexact`, the `LIMIT` cannot be
161+
/// pushed down. Inexact filters do not guarantee that every filtered row is
162+
/// removed, so applying the limit could leave too few rows to return in the
163+
/// final result.
164+
///
165+
/// # Evaluation Order
166+
///
167+
/// The logical evaluation order is `filters`, then `limit`, then
168+
/// `projection`.
169+
///
170+
/// Note that `limit` applies to the filtered result, not to the unfiltered
171+
/// input, and `projection` affects only which columns are returned, not
172+
/// which rows qualify.
173+
///
174+
/// For example, if a scan receives:
175+
///
176+
/// - `projection = [a]`
177+
/// - `filters = [b > 5]`
178+
/// - `limit = Some(3)`
179+
///
180+
/// It must logically produce results equivalent to:
181+
///
182+
/// ```text
183+
/// PROJECTION a (LIMIT 3 (SCAN WHERE b > 5))
184+
/// ```
185+
///
186+
/// As noted above, columns referenced only by pushed-down filters may be
187+
/// absent from `projection`.
166188
async fn scan(
167189
&self,
168190
state: &dyn Session,

datafusion/core/src/execution/context/mod.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1488,12 +1488,13 @@ impl SessionContext {
14881488
})?;
14891489

14901490
let state = self.state.read();
1491-
let context = SimplifyContext::default()
1491+
let context = SimplifyContext::builder()
14921492
.with_schema(Arc::clone(prepared.plan.schema()))
14931493
.with_config_options(Arc::clone(state.config_options()))
14941494
.with_query_execution_start_time(
14951495
state.execution_props().query_execution_start_time,
1496-
);
1496+
)
1497+
.build();
14971498
let simplifier = ExprSimplifier::new(context);
14981499

14991500
// Only allow literals as parameters for now.

datafusion/core/src/execution/session_state.rs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -743,12 +743,13 @@ impl SessionState {
743743
df_schema: &DFSchema,
744744
) -> datafusion_common::Result<Arc<dyn PhysicalExpr>> {
745745
let config_options = self.config_options();
746-
let simplify_context = SimplifyContext::default()
746+
let simplify_context = SimplifyContext::builder()
747747
.with_schema(Arc::new(df_schema.clone()))
748748
.with_config_options(Arc::clone(config_options))
749749
.with_query_execution_start_time(
750750
self.execution_props().query_execution_start_time,
751-
);
751+
)
752+
.build();
752753
let simplifier = ExprSimplifier::new(simplify_context);
753754
// apply type coercion here to ensure types match
754755
let mut expr = simplifier.coerce(expr, df_schema)?;
@@ -1835,11 +1836,12 @@ impl ContextProvider for SessionContextProvider<'_> {
18351836
.get(name)
18361837
.cloned()
18371838
.ok_or_else(|| plan_datafusion_err!("table function '{name}' not found"))?;
1838-
let simplify_context = SimplifyContext::default()
1839+
let simplify_context = SimplifyContext::builder()
18391840
.with_config_options(Arc::clone(self.state.config_options()))
18401841
.with_query_execution_start_time(
18411842
self.state.execution_props().query_execution_start_time,
1842-
);
1843+
)
1844+
.build();
18431845
let simplifier = ExprSimplifier::new(simplify_context);
18441846
let schema = DFSchema::empty();
18451847
let args = args

datafusion/core/src/test_util/parquet.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,9 @@ impl TestParquetFile {
166166
let df_schema = Arc::clone(&self.schema).to_dfschema_ref()?;
167167

168168
// run coercion on the filters to coerce types etc.
169-
let context = SimplifyContext::default().with_schema(Arc::clone(&df_schema));
169+
let context = SimplifyContext::builder()
170+
.with_schema(Arc::clone(&df_schema))
171+
.build();
170172
if let Some(filter) = maybe_filter {
171173
let simplifier = ExprSimplifier::new(context);
172174
let filter = simplifier.coerce(filter, &df_schema).unwrap();

0 commit comments

Comments
 (0)