From 3576d7ee59d8ee886cd041c00a6ea1462786ef9f Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sun, 10 May 2026 16:53:03 +0800 Subject: [PATCH 01/32] perf(parquet): add adaptive row filter fallback --- parquet/benches/arrow_reader_row_filter.rs | 386 ++++++- parquet/src/arrow/arrow_reader/filter.rs | 8 + parquet/src/arrow/arrow_reader/metrics.rs | 351 ++++++- parquet/src/arrow/arrow_reader/mod.rs | 451 +++++++- parquet/src/arrow/arrow_reader/read_plan.rs | 701 ++++++++++++- parquet/src/arrow/arrow_reader/selection.rs | 616 ++++++++++- parquet/src/arrow/push_decoder/mod.rs | 391 ++++++- .../arrow/push_decoder/reader_builder/data.rs | 4 + .../arrow/push_decoder/reader_builder/mod.rs | 988 ++++++++++++++++-- .../tests/arrow_reader/row_filter/async.rs | 346 +++++- 10 files changed, 4048 insertions(+), 194 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 2b5a09eebcb3..49cb1d92349e 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -59,13 +59,13 @@ use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use arrow::record_batch::RecordBatch; use arrow_array::StringViewArray; use arrow_array::builder::{ArrayBuilder, StringViewBuilder}; -use arrow_cast::pretty::pretty_format_batches; use bytes::Bytes; use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; use futures::future::BoxFuture; use futures::{FutureExt, StreamExt}; use parquet::arrow::arrow_reader::{ ArrowPredicateFn, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowFilter, + RowSelectionPolicy, }; use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask}; @@ -76,6 +76,8 @@ use rand::{Rng, SeedableRng, rngs::StdRng}; use std::ops::Range; use std::sync::Arc; +const COLUMN_NAMES: [&str; 4] = ["int64", "float64", "utf8View", "ts"]; + /// Generates a random string. 
Has a 50% chance to generate a short string (3–11 characters) /// or a long string (13–20 characters). fn random_string(rng: &mut StdRng) -> String { @@ -189,11 +191,6 @@ const ROW_GROUP_SIZE: usize = 100_000; /// Writes the RecordBatch to an in memory buffer, returning the buffer fn write_parquet_file() -> Vec { let batch = create_record_batch(TOTAL_ROWS); - println!("Batch created with {TOTAL_ROWS} rows, row group size = {ROW_GROUP_SIZE}"); - println!( - "First 100 rows:\n{}", - pretty_format_batches(&[batch.clone().slice(0, 100)]).unwrap() - ); let schema = batch.schema(); let props = WriterProperties::builder() .set_compression(Compression::SNAPPY) @@ -225,6 +222,44 @@ impl std::fmt::Display for ProjectionCase { } } +#[derive(Clone, Copy)] +enum SyncStrategy { + FullPostFilter, + PushdownAuto, + PushdownSelectors, + PushdownMask, +} + +impl std::fmt::Display for SyncStrategy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SyncStrategy::FullPostFilter => write!(f, "full_post_filter"), + SyncStrategy::PushdownAuto => write!(f, "pushdown_auto"), + SyncStrategy::PushdownSelectors => write!(f, "pushdown_selectors"), + SyncStrategy::PushdownMask => write!(f, "pushdown_mask"), + } + } +} + +#[derive(Clone, Copy)] +enum AsyncStrategy { + FullPostFilter, + PushdownAutoFallback, + PushdownSelectors, + PushdownMask, +} + +impl std::fmt::Display for AsyncStrategy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + AsyncStrategy::FullPostFilter => write!(f, "full_post_filter"), + AsyncStrategy::PushdownAutoFallback => write!(f, "pushdown_auto_fallback"), + AsyncStrategy::PushdownSelectors => write!(f, "pushdown_selectors"), + AsyncStrategy::PushdownMask => write!(f, "pushdown_mask"), + } + } +} + /// FilterType encapsulates the different filter comparisons. /// The variants correspond to the different filter patterns. 
#[derive(Clone, Copy, Debug)] @@ -510,6 +545,251 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { } } +/// Compare full scan plus post-filtering against row-level pushdown strategies. +/// +/// This group is intentionally sync-only and smaller than +/// [`benchmark_filters_and_projections`]. It tracks the cases most likely to +/// inform a future default `Auto` policy: selective random filters, clustered +/// filters, ClickBench-like string filters, and the forced selector strategy +/// that originally motivated apache/arrow-rs#8565. +fn benchmark_sync_strategy_matrix(c: &mut Criterion) { + let parquet_file = Bytes::from(write_parquet_file()); + let filter_types = [ + FilterType::SelectiveUnclustered, + FilterType::ModeratelySelectiveClustered, + FilterType::ModeratelySelectiveUnclustered, + FilterType::Utf8ViewNonEmpty, + ]; + let strategies = [ + SyncStrategy::FullPostFilter, + SyncStrategy::PushdownAuto, + SyncStrategy::PushdownSelectors, + SyncStrategy::PushdownMask, + ]; + + let mut group = c.benchmark_group("arrow_reader_row_filter_strategy_matrix"); + + for filter_type in filter_types { + for projection_case in [ + ProjectionCase::AllColumns, + ProjectionCase::ExcludeFilterColumn, + ] { + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = metadata.file_metadata().schema_descr(); + let output_projection = output_projection_for(filter_type, &projection_case); + let read_projection = full_post_filter_read_projection(filter_type, &output_projection); + let output_column_names = projection_names(&output_projection); + let projection_mask = ProjectionMask::roots(schema_descr, output_projection); + let read_projection_mask = ProjectionMask::roots(schema_descr, read_projection); + let pred_mask = ProjectionMask::roots( + schema_descr, + filter_type.filter_projection().iter().copied(), + ); + + for strategy in strategies { + let bench_id = BenchmarkId::new( + 
format!("{filter_type}/{projection_case}"), + strategy.to_string(), + ); + + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let pred_mask = pred_mask.clone(); + let projection_mask = projection_mask.clone(); + let read_projection_mask = read_projection_mask.clone(); + let output_column_names = output_column_names.clone(); + + match strategy { + SyncStrategy::FullPostFilter => benchmark_sync_reader_post_filter( + reader, + read_projection_mask, + output_column_names, + filter_type, + ), + SyncStrategy::PushdownAuto => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_sync_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::default(), + ) + } + SyncStrategy::PushdownSelectors => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_sync_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Selectors, + ) + } + SyncStrategy::PushdownMask => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_sync_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Mask, + ) + } + } + }); + }); + } + } + } +} + +/// Compare async full scan plus post-filtering against async row-level pushdown +/// strategies. This is the matrix that exercises reader `Auto` fallback because +/// the async stream is backed by the push decoder row-group pipeline. 
+fn benchmark_async_strategy_matrix(c: &mut Criterion) { + let parquet_file = Bytes::from(write_parquet_file()); + let filter_types = [ + FilterType::SelectiveUnclustered, + FilterType::ModeratelySelectiveClustered, + FilterType::ModeratelySelectiveUnclustered, + FilterType::Utf8ViewNonEmpty, + ]; + let strategies = [ + AsyncStrategy::FullPostFilter, + AsyncStrategy::PushdownAutoFallback, + AsyncStrategy::PushdownSelectors, + AsyncStrategy::PushdownMask, + ]; + + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let mut group = c.benchmark_group("arrow_reader_row_filter_async_strategy_matrix"); + + for filter_type in filter_types { + for projection_case in [ + ProjectionCase::AllColumns, + ProjectionCase::ExcludeFilterColumn, + ] { + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = metadata.file_metadata().schema_descr(); + let output_projection = output_projection_for(filter_type, &projection_case); + let read_projection = full_post_filter_read_projection(filter_type, &output_projection); + let output_column_names = projection_names(&output_projection); + let projection_mask = ProjectionMask::roots(schema_descr, output_projection); + let read_projection_mask = ProjectionMask::roots(schema_descr, read_projection); + let pred_mask = ProjectionMask::roots( + schema_descr, + filter_type.filter_projection().iter().copied(), + ); + + for strategy in strategies { + let bench_id = BenchmarkId::new( + format!("{filter_type}/{projection_case}"), + strategy.to_string(), + ); + let rt_captured = rt.handle().clone(); + + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let pred_mask = pred_mask.clone(); + let projection_mask = projection_mask.clone(); + let read_projection_mask = read_projection_mask.clone(); + let output_column_names = output_column_names.clone(); + + rt_captured.block_on(async { + match strategy { 
+ AsyncStrategy::FullPostFilter => { + benchmark_async_reader_post_filter( + reader, + read_projection_mask, + output_column_names, + filter_type, + ) + .await + } + AsyncStrategy::PushdownAutoFallback => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::default(), + ) + .await + } + AsyncStrategy::PushdownSelectors => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Selectors, + ) + .await + } + AsyncStrategy::PushdownMask => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Mask, + ) + .await + } + } + }) + }); + }); + } + } + } +} + +fn output_projection_for(filter_type: FilterType, projection_case: &ProjectionCase) -> Vec { + let filter_columns = filter_type.filter_projection(); + COLUMN_NAMES + .iter() + .enumerate() + .map(|(idx, _)| idx) + .filter(move |idx| { + matches!(projection_case, ProjectionCase::AllColumns) || !filter_columns.contains(idx) + }) + .collect() +} + +fn full_post_filter_read_projection( + filter_type: FilterType, + output_projection: &[usize], +) -> Vec { + let mut read_projection = output_projection.to_vec(); + for filter_idx in filter_type.filter_projection() { + if !read_projection.contains(filter_idx) { + read_projection.push(*filter_idx); + } + } + read_projection.sort_unstable(); + read_projection +} + +fn projection_names(projection: &[usize]) -> Vec<&'static str> { + projection.iter().map(|idx| COLUMN_NAMES[*idx]).collect() +} + +fn row_filter_for(filter_type: FilterType, pred_mask: ProjectionMask) -> RowFilter { + let filter = + ArrowPredicateFn::new( + pred_mask, + move |batch| Ok(filter_type.filter_batch(&batch)?), + ); + RowFilter::new(vec![Box::new(filter)]) +} + /// Use async API async 
fn benchmark_async_reader( reader: InMemoryReader, @@ -529,6 +809,53 @@ async fn benchmark_async_reader( } } +async fn benchmark_async_reader_with_policy( + reader: InMemoryReader, + projection_mask: ProjectionMask, + row_filter: RowFilter, + row_selection_policy: RowSelectionPolicy, +) { + let mut stream = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .with_batch_size(8192) + .with_projection(projection_mask) + .with_row_filter(row_filter) + .with_row_selection_policy(row_selection_policy) + .build() + .unwrap(); + while let Some(b) = stream.next().await { + b.unwrap(); // consume the batches, no buffering + } +} + +async fn benchmark_async_reader_post_filter( + reader: InMemoryReader, + read_projection: ProjectionMask, + output_column_names: Vec<&'static str>, + filter_type: FilterType, +) { + let mut stream = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .with_batch_size(8192) + .with_projection(read_projection) + .build() + .unwrap(); + + while let Some(b) = stream.next().await { + let batch = b.unwrap(); + let filter = filter_type.filter_batch(&batch).unwrap(); + let filtered = arrow_select::filter::filter_record_batch(&batch, &filter).unwrap(); + let output_projection = output_column_names + .iter() + .map(|name| filtered.schema().index_of(name).unwrap()) + .collect::>(); + let output = filtered.project(&output_projection).unwrap(); + std::hint::black_box(output.num_rows()); + } +} + /// Like [`benchmark_async_reader`] but also threads `with_limit(limit)` into /// the stream builder. Used by the `LIMIT` benchmark below. 
async fn benchmark_async_reader_with_limit( @@ -569,6 +896,51 @@ fn benchmark_sync_reader( } } +fn benchmark_sync_reader_with_policy( + reader: InMemoryReader, + projection_mask: ProjectionMask, + row_filter: RowFilter, + row_selection_policy: RowSelectionPolicy, +) { + let stream = ParquetRecordBatchReaderBuilder::try_new(reader.into_inner()) + .unwrap() + .with_batch_size(8192) + .with_projection(projection_mask) + .with_row_filter(row_filter) + .with_row_selection_policy(row_selection_policy) + .build() + .unwrap(); + for b in stream { + b.unwrap(); // consume the batches, no buffering + } +} + +fn benchmark_sync_reader_post_filter( + reader: InMemoryReader, + read_projection: ProjectionMask, + output_column_names: Vec<&'static str>, + filter_type: FilterType, +) { + let stream = ParquetRecordBatchReaderBuilder::try_new(reader.into_inner()) + .unwrap() + .with_batch_size(8192) + .with_projection(read_projection) + .build() + .unwrap(); + + for b in stream { + let batch = b.unwrap(); + let filter = filter_type.filter_batch(&batch).unwrap(); + let filtered = arrow_select::filter::filter_record_batch(&batch, &filter).unwrap(); + let output_projection = output_column_names + .iter() + .map(|name| filtered.schema().index_of(name).unwrap()) + .collect::>(); + let output = filtered.project(&output_projection).unwrap(); + std::hint::black_box(output.num_rows()); + } +} + /// Adapter to read asynchronously from in memory bytes and always loads the /// metadata with page indexes. 
#[derive(Debug, Clone)] @@ -696,6 +1068,8 @@ fn benchmark_filters_with_limit(c: &mut Criterion) { criterion_group!( benches, benchmark_filters_and_projections, + benchmark_sync_strategy_matrix, + benchmark_async_strategy_matrix, benchmark_filters_with_limit, ); criterion_main!(benches); diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs index 3fd5e1d650be..7538fd0ef526 100644 --- a/parquet/src/arrow/arrow_reader/filter.rs +++ b/parquet/src/arrow/arrow_reader/filter.rs @@ -190,6 +190,14 @@ impl RowFilter { pub fn new(predicates: Vec>) -> Self { Self { predicates } } + /// Returns the union of all predicate projections, if there are any predicates + pub(crate) fn union_projection(&self) -> Option { + let mut projection = self.predicates.first()?.projection().clone(); + for predicate in self.predicates.iter().skip(1) { + projection.union(predicate.projection()); + } + Some(projection) + } /// Returns the inner predicates pub fn predicates(&self) -> &Vec> { &self.predicates diff --git a/parquet/src/arrow/arrow_reader/metrics.rs b/parquet/src/arrow/arrow_reader/metrics.rs index b36d79586bb3..506c9b738970 100644 --- a/parquet/src/arrow/arrow_reader/metrics.rs +++ b/parquet/src/arrow/arrow_reader/metrics.rs @@ -17,8 +17,12 @@ //! [ArrowReaderMetrics] for collecting metrics about the Arrow reader +use crate::arrow::arrow_reader::selection::{ + FallbackTriggerReason, RowGroupExecutionMode, RowSelectionStrategyDecision, + RowSelectionStrategyReason, +}; use std::sync::Arc; -use std::sync::atomic::AtomicUsize; +use std::sync::atomic::{AtomicUsize, Ordering}; /// This enum represents the state of Arrow reader metrics collection. 
/// @@ -82,14 +86,145 @@ impl ArrowReaderMetrics { pub fn records_read_from_cache(&self) -> Option { match self { Self::Disabled => None, - Self::Enabled(inner) => Some( - inner - .records_read_from_cache - .load(std::sync::atomic::Ordering::Relaxed), - ), + Self::Enabled(inner) => Some(inner.records_read_from_cache.load(Ordering::Relaxed)), } } + /// Row Selection: number of selected rows recorded in planned selections + pub fn row_selection_selected_rows(&self) -> Option { + self.load(|inner| &inner.row_selection_selected_rows) + } + + /// Row Selection: number of skipped rows recorded in planned selections + pub fn row_selection_skipped_rows(&self) -> Option { + self.load(|inner| &inner.row_selection_skipped_rows) + } + + /// Row Selection: number of non-empty selectors recorded in planned selections + pub fn row_selection_selector_count(&self) -> Option { + self.load(|inner| &inner.row_selection_selector_count) + } + + /// Row Selection: number of selected runs recorded in planned selections + pub fn row_selection_selected_run_count(&self) -> Option { + self.load(|inner| &inner.row_selection_selected_run_count) + } + + /// Row Selection: number of skipped runs recorded in planned selections + pub fn row_selection_skipped_run_count(&self) -> Option { + self.load(|inner| &inner.row_selection_skipped_run_count) + } + + /// Row Selection: number of plans using mask materialization + pub fn row_selection_mask_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_mask_plan_count) + } + + /// Row Selection: number of plans using selector materialization + pub fn row_selection_selector_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_selector_plan_count) + } + + /// Row Selection: number of plans forced to masks + pub fn row_selection_forced_mask_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_forced_mask_plan_count) + } + + /// Row Selection: number of plans forced to selectors + pub fn 
row_selection_forced_selector_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_forced_selector_plan_count) + } + + /// Row Selection: number of Auto plans choosing masks for empty selections + pub fn row_selection_auto_mask_empty_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_mask_empty_plan_count) + } + + /// Row Selection: number of Auto plans choosing masks for short runs + pub fn row_selection_auto_mask_short_run_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_mask_short_run_plan_count) + } + + /// Row Selection: number of Auto plans choosing masks for fragmented selected rows + pub fn row_selection_auto_mask_fragmented_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_mask_fragmented_plan_count) + } + + /// Row Selection: number of Auto plans choosing masks for high selected-row ratio + pub fn row_selection_auto_mask_high_ratio_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_mask_high_ratio_plan_count) + } + + /// Row Selection: number of Auto plans choosing selectors for clustered selected rows + pub fn row_selection_auto_selector_clustered_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_selector_clustered_plan_count) + } + + /// Row Selection: number of Auto plans choosing selectors for long runs + pub fn row_selection_auto_selector_long_run_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_selector_long_run_plan_count) + } + + /// Fallback: number of row groups included in the observation window + pub fn fallback_observed_row_group_count(&self) -> Option { + self.load(|inner| &inner.fallback_observed_row_group_count) + } + + /// Fallback: number of row groups executed with pushdown + pub fn fallback_pushdown_row_group_count(&self) -> Option { + self.load(|inner| &inner.fallback_pushdown_row_group_count) + } + + /// Fallback: number of row groups executed with post-filter + 
pub fn fallback_post_filter_row_group_count(&self) -> Option { + self.load(|inner| &inner.fallback_post_filter_row_group_count) + } + + /// Fallback: number of times fallback was disabled by a forced policy + pub fn fallback_forced_policy_count(&self) -> Option { + self.load(|inner| &inner.fallback_forced_policy_count) + } + + /// Fallback: number of incomplete observation-window decisions + pub fn fallback_observation_incomplete_count(&self) -> Option { + self.load(|inner| &inner.fallback_observation_incomplete_count) + } + + /// Fallback: number of times pushdown remained preferred + pub fn fallback_pushdown_still_preferred_count(&self) -> Option { + self.load(|inner| &inner.fallback_pushdown_still_preferred_count) + } + + /// Fallback: number of high-selectivity no-pruning triggers + pub fn fallback_high_selectivity_no_pruning_count(&self) -> Option { + self.load(|inner| &inner.fallback_high_selectivity_no_pruning_count) + } + + /// Fallback: number of fragmented moderate-selectivity triggers + pub fn fallback_fragmented_moderate_selectivity_count(&self) -> Option { + self.load(|inner| &inner.fallback_fragmented_moderate_selectivity_count) + } + + /// Fallback: number of fragmented high-selectivity materialization triggers + pub fn fallback_fragmented_high_selectivity_materialization_count(&self) -> Option { + self.load(|inner| &inner.fallback_fragmented_high_selectivity_materialization_count) + } + + /// Fallback: number of fragmented high-selectivity output-dominates triggers + pub fn fallback_fragmented_high_selectivity_output_dominates_count(&self) -> Option { + self.load(|inner| &inner.fallback_fragmented_high_selectivity_output_dominates_count) + } + + /// Fallback: number of fragmented high-selectivity cache-miss triggers + pub fn fallback_fragmented_high_selectivity_cache_miss_count(&self) -> Option { + self.load(|inner| &inner.fallback_fragmented_high_selectivity_cache_miss_count) + } + + /// Fallback: number of fragmented high-selectivity 
cache-rejected triggers + pub fn fallback_fragmented_high_selectivity_cache_rejected_count(&self) -> Option { + self.load(|inner| &inner.fallback_fragmented_high_selectivity_cache_rejected_count) + } + /// Increments the count of records read from the inner reader pub(crate) fn increment_inner_reads(&self, count: usize) { let Self::Enabled(inner) = self else { @@ -97,7 +232,7 @@ impl ArrowReaderMetrics { }; inner .records_read_from_inner - .fetch_add(count, std::sync::atomic::Ordering::Relaxed); + .fetch_add(count, Ordering::Relaxed); } /// Increments the count of records read from the cache @@ -108,7 +243,126 @@ impl ArrowReaderMetrics { inner .records_read_from_cache - .fetch_add(count, std::sync::atomic::Ordering::Relaxed); + .fetch_add(count, Ordering::Relaxed); + } + + pub(crate) fn record_row_selection(&self, decision: RowSelectionStrategyDecision) { + let Self::Enabled(inner) = self else { + return; + }; + + let shape = decision.shape; + inner + .row_selection_selected_rows + .fetch_add(shape.selected_rows, Ordering::Relaxed); + inner + .row_selection_skipped_rows + .fetch_add(shape.skipped_rows, Ordering::Relaxed); + inner + .row_selection_selector_count + .fetch_add(shape.selector_count, Ordering::Relaxed); + inner + .row_selection_selected_run_count + .fetch_add(shape.selected_run_count, Ordering::Relaxed); + inner + .row_selection_skipped_run_count + .fetch_add(shape.skipped_run_count, Ordering::Relaxed); + + let strategy_count = if decision.uses_mask() { + &inner.row_selection_mask_plan_count + } else { + &inner.row_selection_selector_plan_count + }; + strategy_count.fetch_add(1, Ordering::Relaxed); + + let decision_count = match decision.reason { + RowSelectionStrategyReason::ForcedMask => &inner.row_selection_forced_mask_plan_count, + RowSelectionStrategyReason::ForcedSelectors => { + &inner.row_selection_forced_selector_plan_count + } + RowSelectionStrategyReason::AutoMaskEmptySelection => { + &inner.row_selection_auto_mask_empty_plan_count + } + 
RowSelectionStrategyReason::AutoMaskShortRuns => { + &inner.row_selection_auto_mask_short_run_plan_count + } + RowSelectionStrategyReason::AutoMaskFragmentedSelection => { + &inner.row_selection_auto_mask_fragmented_plan_count + } + RowSelectionStrategyReason::AutoMaskHighSelectedRatio => { + &inner.row_selection_auto_mask_high_ratio_plan_count + } + RowSelectionStrategyReason::AutoSelectorClusteredSelection => { + &inner.row_selection_auto_selector_clustered_plan_count + } + RowSelectionStrategyReason::AutoSelectorLongRuns => { + &inner.row_selection_auto_selector_long_run_plan_count + } + }; + decision_count.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn record_fallback_observed_row_group(&self) { + let Self::Enabled(inner) = self else { + return; + }; + inner + .fallback_observed_row_group_count + .fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn record_fallback_row_group(&self, mode: RowGroupExecutionMode) { + let Self::Enabled(inner) = self else { + return; + }; + + let counter = match mode { + RowGroupExecutionMode::Pushdown(_) => &inner.fallback_pushdown_row_group_count, + RowGroupExecutionMode::PostFilter => &inner.fallback_post_filter_row_group_count, + }; + counter.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn record_fallback_trigger(&self, reason: FallbackTriggerReason) { + let Self::Enabled(inner) = self else { + return; + }; + + let counter = match reason { + FallbackTriggerReason::HighSelectivityNoPruning => { + &inner.fallback_high_selectivity_no_pruning_count + } + FallbackTriggerReason::FragmentedModerateSelectivity => { + &inner.fallback_fragmented_moderate_selectivity_count + } + FallbackTriggerReason::FragmentedHighSelectivityMaterialization => { + &inner.fallback_fragmented_high_selectivity_materialization_count + } + FallbackTriggerReason::FragmentedHighSelectivityOutputDominates => { + &inner.fallback_fragmented_high_selectivity_output_dominates_count + } + FallbackTriggerReason::FragmentedHighSelectivityCacheMiss => { 
+ &inner.fallback_fragmented_high_selectivity_cache_miss_count + } + FallbackTriggerReason::FragmentedHighSelectivityCacheRejected => { + &inner.fallback_fragmented_high_selectivity_cache_rejected_count + } + FallbackTriggerReason::ObservationIncomplete => { + &inner.fallback_observation_incomplete_count + } + FallbackTriggerReason::PushdownStillPreferred => { + &inner.fallback_pushdown_still_preferred_count + } + FallbackTriggerReason::ForcedPolicy => &inner.fallback_forced_policy_count, + }; + counter.fetch_add(1, Ordering::Relaxed); + } + + fn load(&self, metric: fn(&ArrowReaderMetricsInner) -> &AtomicUsize) -> Option { + match self { + Self::Disabled => None, + Self::Enabled(inner) => Some(metric(inner).load(Ordering::Relaxed)), + } } } @@ -122,6 +376,60 @@ pub struct ArrowReaderMetricsInner { records_read_from_inner: AtomicUsize, /// Total number of records read from previously cached pages records_read_from_cache: AtomicUsize, + /// Total selected rows in planned row selections + row_selection_selected_rows: AtomicUsize, + /// Total skipped rows in planned row selections + row_selection_skipped_rows: AtomicUsize, + /// Total non-empty selectors in planned row selections + row_selection_selector_count: AtomicUsize, + /// Total selected runs in planned row selections + row_selection_selected_run_count: AtomicUsize, + /// Total skipped runs in planned row selections + row_selection_skipped_run_count: AtomicUsize, + /// Number of plans materialized with masks + row_selection_mask_plan_count: AtomicUsize, + /// Number of plans materialized with selectors + row_selection_selector_plan_count: AtomicUsize, + /// Number of plans forced to masks + row_selection_forced_mask_plan_count: AtomicUsize, + /// Number of plans forced to selectors + row_selection_forced_selector_plan_count: AtomicUsize, + /// Number of Auto plans choosing masks for empty selections + row_selection_auto_mask_empty_plan_count: AtomicUsize, + /// Number of Auto plans choosing masks for short runs 
+ row_selection_auto_mask_short_run_plan_count: AtomicUsize, + /// Number of Auto plans using masks for fragmented selected rows + row_selection_auto_mask_fragmented_plan_count: AtomicUsize, + /// Number of Auto plans using masks for high selected-row ratio + row_selection_auto_mask_high_ratio_plan_count: AtomicUsize, + /// Number of Auto plans using selectors for clustered selected rows + row_selection_auto_selector_clustered_plan_count: AtomicUsize, + /// Number of Auto plans choosing selectors for long runs + row_selection_auto_selector_long_run_plan_count: AtomicUsize, + /// Number of row groups included in fallback observation + fallback_observed_row_group_count: AtomicUsize, + /// Number of fallback-capable row groups executed with pushdown + fallback_pushdown_row_group_count: AtomicUsize, + /// Number of row groups executed with post-filter + fallback_post_filter_row_group_count: AtomicUsize, + /// Number of fallback decisions disabled by forced policy + fallback_forced_policy_count: AtomicUsize, + /// Number of incomplete fallback observations + fallback_observation_incomplete_count: AtomicUsize, + /// Number of fallback decisions that kept pushdown + fallback_pushdown_still_preferred_count: AtomicUsize, + /// Number of high-selectivity no-pruning fallback triggers + fallback_high_selectivity_no_pruning_count: AtomicUsize, + /// Number of fragmented moderate-selectivity fallback triggers + fallback_fragmented_moderate_selectivity_count: AtomicUsize, + /// Number of fragmented high-selectivity materialization fallback triggers + fallback_fragmented_high_selectivity_materialization_count: AtomicUsize, + /// Number of fragmented high-selectivity output-dominates fallback triggers + fallback_fragmented_high_selectivity_output_dominates_count: AtomicUsize, + /// Number of fragmented high-selectivity cache-miss fallback triggers + fallback_fragmented_high_selectivity_cache_miss_count: AtomicUsize, + /// Number of fragmented high-selectivity cache-rejected 
fallback triggers + fallback_fragmented_high_selectivity_cache_rejected_count: AtomicUsize, } impl ArrowReaderMetricsInner { @@ -130,6 +438,33 @@ impl ArrowReaderMetricsInner { Self { records_read_from_inner: AtomicUsize::new(0), records_read_from_cache: AtomicUsize::new(0), + row_selection_selected_rows: AtomicUsize::new(0), + row_selection_skipped_rows: AtomicUsize::new(0), + row_selection_selector_count: AtomicUsize::new(0), + row_selection_selected_run_count: AtomicUsize::new(0), + row_selection_skipped_run_count: AtomicUsize::new(0), + row_selection_mask_plan_count: AtomicUsize::new(0), + row_selection_selector_plan_count: AtomicUsize::new(0), + row_selection_forced_mask_plan_count: AtomicUsize::new(0), + row_selection_forced_selector_plan_count: AtomicUsize::new(0), + row_selection_auto_mask_empty_plan_count: AtomicUsize::new(0), + row_selection_auto_mask_short_run_plan_count: AtomicUsize::new(0), + row_selection_auto_mask_fragmented_plan_count: AtomicUsize::new(0), + row_selection_auto_mask_high_ratio_plan_count: AtomicUsize::new(0), + row_selection_auto_selector_clustered_plan_count: AtomicUsize::new(0), + row_selection_auto_selector_long_run_plan_count: AtomicUsize::new(0), + fallback_observed_row_group_count: AtomicUsize::new(0), + fallback_pushdown_row_group_count: AtomicUsize::new(0), + fallback_post_filter_row_group_count: AtomicUsize::new(0), + fallback_forced_policy_count: AtomicUsize::new(0), + fallback_observation_incomplete_count: AtomicUsize::new(0), + fallback_pushdown_still_preferred_count: AtomicUsize::new(0), + fallback_high_selectivity_no_pruning_count: AtomicUsize::new(0), + fallback_fragmented_moderate_selectivity_count: AtomicUsize::new(0), + fallback_fragmented_high_selectivity_materialization_count: AtomicUsize::new(0), + fallback_fragmented_high_selectivity_output_dominates_count: AtomicUsize::new(0), + fallback_fragmented_high_selectivity_cache_miss_count: AtomicUsize::new(0), + 
fallback_fragmented_high_selectivity_cache_rejected_count: AtomicUsize::new(0), } } } diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 70d3ce7cf9a9..4b1a042a828a 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -18,13 +18,16 @@ //! Contains reader which reads parquet data into arrow [`RecordBatch`] use arrow_array::cast::AsArray; -use arrow_array::{Array, RecordBatch, RecordBatchReader}; +use arrow_array::{Array, BooleanArray, RecordBatch, RecordBatchReader}; +use arrow_buffer::BooleanBuffer; use arrow_schema::{ArrowError, DataType as ArrowType, FieldRef, Schema, SchemaRef}; +use arrow_select::concat::concat_batches; use arrow_select::filter::filter_record_batch; pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter}; pub use selection::{RowSelection, RowSelectionCursor, RowSelectionPolicy, RowSelector}; +use std::collections::VecDeque; use std::fmt::{Debug, Formatter}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; pub use crate::arrow::array_reader::RowGroups; use crate::arrow::array_reader::{ArrayReader, ArrayReaderBuilder}; @@ -1238,7 +1241,7 @@ impl ParquetRecordBatchReaderBuilder { .with_offset(offset) .with_limit(limit) .build_limited() - .build(); + .build_with_metrics(&metrics); Ok(ParquetRecordBatchReader::new(array_reader, read_plan)) } @@ -1340,8 +1343,196 @@ impl PageIterator for ReaderPageIterator {} /// [`Bytes`]: bytes::Bytes pub struct ParquetRecordBatchReader { array_reader: Box, + array_reader_position: usize, schema: SchemaRef, read_plan: ReadPlan, + post_filter: Option, + post_selection_filter: Option, + buffered_batches: Option>, +} + +#[derive(Debug)] +struct PostFilterState { + filter: Arc>, + predicate_projection_indices: Vec>, + predicate_projection_schemas: Vec, + output_projection_indices: Vec, + output_schema: SchemaRef, +} + +impl PostFilterState { + fn try_new( + filter: Arc>, + parquet_schema: &SchemaDescriptor, + read_schema: 
&Schema, + read_projection: &ProjectionMask, + output_projection: &ProjectionMask, + ) -> Result { + let filter_guard = filter.lock().map_err(|_| { + ParquetError::General("post-filter predicate state was poisoned".to_string()) + })?; + + let predicate_projection_indices = filter_guard + .predicates + .iter() + .map(|predicate| { + projection_indices(parquet_schema, read_projection, predicate.projection()) + }) + .collect::>>()?; + drop(filter_guard); + + let predicate_projection_schemas = predicate_projection_indices + .iter() + .map(|indices| read_schema.project(indices).map(SchemaRef::new)) + .collect::, _>>()?; + + let output_projection_indices = + projection_indices(parquet_schema, read_projection, output_projection)?; + let output_schema = SchemaRef::new(read_schema.project(&output_projection_indices)?); + + Ok(Self { + filter, + predicate_projection_indices, + predicate_projection_schemas, + output_projection_indices, + output_schema, + }) + } + + fn apply(&mut self, mut batch: RecordBatch) -> Result { + let mut filter = self.filter.lock().map_err(|_| { + ParquetError::General("post-filter predicate state was poisoned".to_string()) + })?; + + for (predicate_idx, (predicate, projection_indices)) in filter + .predicates + .iter_mut() + .zip(self.predicate_projection_indices.iter()) + .enumerate() + { + let input_rows = batch.num_rows(); + let predicate_batch = project_record_batch( + &batch, + projection_indices, + Arc::clone(&self.predicate_projection_schemas[predicate_idx]), + )?; + let predicate_filter = predicate.evaluate(predicate_batch)?; + + if predicate_filter.len() != input_rows { + return Err(general_err!( + "ArrowPredicate predicate returned {} rows, expected {input_rows}", + predicate_filter.len() + )); + } + + batch = filter_record_batch(&batch, &predicate_filter)?; + if batch.num_rows() == 0 { + break; + } + } + + Ok(project_record_batch( + &batch, + &self.output_projection_indices, + Arc::clone(&self.output_schema), + )?) 
+ } +} + +#[inline(always)] +fn project_record_batch( + batch: &RecordBatch, + indices: &[usize], + schema: SchemaRef, +) -> std::result::Result { + if indices.len() == batch.num_columns() && indices.iter().copied().eq(0..batch.num_columns()) { + debug_assert_eq!(batch.schema_ref().as_ref(), schema.as_ref()); + return Ok(batch.clone()); + } + + let columns = indices + .iter() + .map(|idx| { + batch.columns().get(*idx).cloned().ok_or_else(|| { + ArrowError::SchemaError(format!( + "project index {} out of bounds, max field {}", + idx, + batch.num_columns() + )) + }) + }) + .collect::, ArrowError>>()?; + + unsafe { + // The indices and schema are produced from the same valid read schema + // at construction time, and filtering preserves column lengths. + Ok(RecordBatch::new_unchecked( + schema, + columns, + batch.num_rows(), + )) + } +} + +#[derive(Debug)] +struct PostSelectionFilterState { + mask: BooleanBuffer, + position: usize, +} + +impl PostSelectionFilterState { + fn new(selection: RowSelection) -> Self { + Self { + mask: selection.boolean_mask(), + position: 0, + } + } + + fn apply(&mut self, batch: RecordBatch) -> Result { + let input_rows = batch.num_rows(); + let end = self.position.saturating_add(input_rows); + if end > self.mask.len() { + return Err(general_err!( + "post-selection filter exceeded selection length: end {end}, selection length {}", + self.mask.len() + )); + } + + let filter = BooleanArray::from(self.mask.slice(self.position, input_rows)); + self.position = end; + Ok(filter_record_batch(&batch, &filter)?) 
+ } +} + +fn projection_indices( + parquet_schema: &SchemaDescriptor, + read_projection: &ProjectionMask, + target_projection: &ProjectionMask, +) -> Result> { + let mut indices = Vec::new(); + let mut read_idx = 0; + + for leaf_idx in 0..parquet_schema.num_columns() { + if read_projection.leaf_included(leaf_idx) { + let root = parquet_schema.get_column_root(leaf_idx); + if !root.is_primitive() { + return Err(general_err!( + "post-filter fallback does not support nested read column {}", + root.name() + )); + } + if target_projection.leaf_included(leaf_idx) { + indices.push(read_idx); + } + read_idx += 1; + } else if target_projection.leaf_included(leaf_idx) { + return Err(general_err!( + "post-filter target projection includes leaf column {leaf_idx} not present in read projection" + )); + } + } + + Ok(indices) } impl Debug for ParquetRecordBatchReader { @@ -1350,6 +1541,12 @@ impl Debug for ParquetRecordBatchReader { .field("array_reader", &"...") .field("schema", &self.schema) .field("read_plan", &self.read_plan) + .field("post_filter", &self.post_filter) + .field("post_selection_filter", &self.post_selection_filter) + .field( + "buffered_batches", + &self.buffered_batches.as_ref().map(|b| b.len()), + ) .finish() } } @@ -1371,6 +1568,33 @@ impl ParquetRecordBatchReader { /// Returns `Result>` rather than `Option>` to /// simplify error handling with `?` fn next_inner(&mut self) -> Result> { + if let Some(buffered_batches) = self.buffered_batches.as_mut() { + return Ok(buffered_batches.pop_front()); + } + + loop { + let Some(batch) = self.next_inner_decoded()? 
else { + return Ok(None); + }; + + let batch = match self.post_filter.as_mut() { + Some(post_filter) => post_filter.apply(batch)?, + None => batch, + }; + let batch = match self.post_selection_filter.as_mut() { + Some(post_selection_filter) => post_selection_filter.apply(batch)?, + None => batch, + }; + + if batch.num_rows() == 0 { + continue; + } + + return Ok(Some(batch)); + } + } + + fn next_inner_decoded(&mut self) -> Result> { let mut read_records = 0; let batch_size = self.batch_size(); if batch_size == 0 { @@ -1378,6 +1602,102 @@ impl ParquetRecordBatchReader { } match self.read_plan.row_selection_cursor_mut() { RowSelectionCursor::Mask(mask_cursor) => { + if mask_cursor.is_sparse() { + let sparse_cursor = mask_cursor.sparse_mut().unwrap(); + + while !sparse_cursor.is_empty() { + let Some(mask_chunk) = sparse_cursor.next_sparse_mask_chunk(batch_size)? + else { + return Ok(None); + }; + let mut filtered_batches = Vec::new(); + + for segment in mask_chunk.segments { + if segment.row_range.start < self.array_reader_position { + return Err(general_err!( + "sparse mask segment starts before current reader position - segment start {}, current position {}", + segment.row_range.start, + self.array_reader_position + )); + } + + if segment.row_range.start > self.array_reader_position { + let to_skip = segment.row_range.start - self.array_reader_position; + let skipped = self.array_reader.skip_records(to_skip)?; + if skipped != to_skip { + return Err(general_err!( + "failed to skip rows, expected {}, got {}", + to_skip, + skipped + )); + } + self.array_reader_position += skipped; + } + + let to_read = segment.row_range.len(); + if to_read == 0 { + continue; + } + + let read = self.array_reader.read_records(to_read)?; + if read == 0 { + return Err(general_err!( + "reached end of column while expecting {} rows", + to_read + )); + } + if read != to_read { + return Err(general_err!( + "insufficient rows read from array reader - expected {}, got {}", + to_read, + read + )); 
+ } + self.array_reader_position += read; + + let mask = sparse_cursor.mask_values_for(&segment)?; + let selected_rows = mask.true_count(); + + let array = self.array_reader.consume_batch()?; + // The column reader exposes the projection as a struct array; convert this + // into a record batch before applying the boolean filter mask. + let struct_array = array.as_struct_opt().ok_or_else(|| { + ArrowError::ParquetError( + "Struct array reader should return struct array".to_string(), + ) + })?; + + let filtered_batch = + filter_record_batch(&RecordBatch::from(struct_array), &mask)?; + + if filtered_batch.num_rows() != selected_rows { + return Err(general_err!( + "filtered rows mismatch selection - expected {}, got {}", + selected_rows, + filtered_batch.num_rows() + )); + } + + if filtered_batch.num_rows() == 0 { + continue; + } + + filtered_batches.push(filtered_batch); + } + + match filtered_batches.len() { + 0 => continue, + 1 => return Ok(filtered_batches.pop()), + _ => { + let schema = filtered_batches[0].schema(); + return Ok(Some(concat_batches(&schema, &filtered_batches)?)); + } + } + } + + return Ok(None); + } + // Stream the record batch reader using contiguous segments of the selection // mask, avoiding the need to materialize intermediate `RowSelector` ranges. while !mask_cursor.is_empty() { @@ -1394,6 +1714,7 @@ impl ParquetRecordBatchReader { skipped )); } + self.array_reader_position += skipped; } if mask_chunk.chunk_rows == 0 { @@ -1419,6 +1740,7 @@ impl ParquetRecordBatchReader { read )); } + self.array_reader_position += read; let array = self.array_reader.consume_batch()?; // The column reader exposes the projection as a struct array; convert this @@ -1460,6 +1782,7 @@ impl ParquetRecordBatchReader { skipped )); } + self.array_reader_position += skipped; continue; } @@ -1482,12 +1805,16 @@ impl ParquetRecordBatchReader { }; match self.array_reader.read_records(to_read)? 
{ 0 => break, - rec => read_records += rec, + rec => { + read_records += rec; + self.array_reader_position += rec; + } }; } } RowSelectionCursor::All => { - self.array_reader.read_records(batch_size)?; + let read = self.array_reader.read_records(batch_size)?; + self.array_reader_position += read; } }; @@ -1502,6 +1829,21 @@ impl ParquetRecordBatchReader { None }) } + + pub(crate) fn materialize_post_filter(&mut self) -> Result<()> { + if self.post_filter.is_none() || self.buffered_batches.is_some() { + return Ok(()); + } + + let mut buffered_batches = VecDeque::new(); + while let Some(batch) = self.next_inner()? { + buffered_batches.push_back(batch); + } + self.post_filter = None; + self.buffered_batches = Some(buffered_batches); + + Ok(()) + } } impl RecordBatchReader for ParquetRecordBatchReader { @@ -1547,8 +1889,12 @@ impl ParquetRecordBatchReader { Ok(Self { array_reader, + array_reader_position: 0, schema: Arc::new(Schema::new(levels.fields.clone())), read_plan, + post_filter: None, + post_selection_filter: None, + buffered_batches: None, }) } @@ -1563,11 +1909,68 @@ impl ParquetRecordBatchReader { Self { array_reader, + array_reader_position: 0, + schema: Arc::new(schema), + read_plan, + post_filter: None, + post_selection_filter: None, + buffered_batches: None, + } + } + + pub(crate) fn new_post_selection_filter( + array_reader: Box, + read_plan: ReadPlan, + selection: RowSelection, + ) -> Self { + let schema = match array_reader.get_data_type() { + ArrowType::Struct(fields) => Schema::new(fields.clone()), + _ => unreachable!("Struct array reader's data type is not struct!"), + }; + + Self { + array_reader, + array_reader_position: 0, schema: Arc::new(schema), read_plan, + post_filter: None, + post_selection_filter: Some(PostSelectionFilterState::new(selection)), + buffered_batches: None, } } + pub(crate) fn new_post_filter( + array_reader: Box, + read_plan: ReadPlan, + filter: Arc>, + parquet_schema: &SchemaDescriptor, + read_projection: &ProjectionMask, + 
output_projection: &ProjectionMask, + ) -> Result { + let read_schema = match array_reader.get_data_type() { + ArrowType::Struct(fields) => Schema::new(fields.clone()), + _ => unreachable!("Struct array reader's data type is not struct!"), + }; + let post_filter = PostFilterState::try_new( + filter, + parquet_schema, + &read_schema, + read_projection, + output_projection, + )?; + let schema = Arc::clone(&post_filter.output_schema); + + Ok(Self { + array_reader, + array_reader_position: 0, + schema, + read_plan, + post_filter: Some(post_filter), + post_selection_filter: None, + buffered_batches: None, + }) + } + #[inline(always)] pub(crate) fn batch_size(&self) -> usize { self.read_plan.batch_size() @@ -1588,6 +1991,7 @@ pub(crate) mod tests { use rand::{Rng, RngCore, SeedableRng, random, rng}; use tempfile::tempfile; + use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; use crate::arrow::arrow_reader::{ ArrowPredicateFn, ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder, RowFilter, RowSelection, RowSelector, @@ -1638,6 +2042,43 @@ pub(crate) mod tests { assert_eq!(original_schema.fields(), reader.schema().fields()); } + #[test] + fn sync_reader_records_row_selection_metrics_after_limit_offset() { + let schema = Arc::new(Schema::new(vec![Field::new( + "c0", + ArrowDataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from_iter_values(0..10)) as ArrayRef], + ) + .unwrap(); + + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, schema, None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let metrics = ArrowReaderMetrics::enabled(); + let _reader = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buffer)) + .unwrap() + .with_metrics(metrics.clone()) + .with_offset(3) + .with_limit(4) + .build() + .unwrap(); + + assert_eq!(metrics.row_selection_selected_rows(), Some(4)); + 
assert_eq!(metrics.row_selection_skipped_rows(), Some(3)); + assert_eq!(metrics.row_selection_selector_count(), Some(2)); + assert_eq!(metrics.row_selection_mask_plan_count(), Some(1)); + assert_eq!( + metrics.row_selection_auto_mask_short_run_plan_count(), + Some(1) + ); + } + #[test] fn test_reuse_schema() { let file = get_test_file("parquet/alltypes-java.parquet"); diff --git a/parquet/src/arrow/arrow_reader/read_plan.rs b/parquet/src/arrow/arrow_reader/read_plan.rs index ac2e105ecf4f..dba336fe0eed 100644 --- a/parquet/src/arrow/arrow_reader/read_plan.rs +++ b/parquet/src/arrow/arrow_reader/read_plan.rs @@ -19,8 +19,11 @@ //! from a Parquet file use crate::arrow::array_reader::ArrayReader; -use crate::arrow::arrow_reader::selection::RowSelectionPolicy; -use crate::arrow::arrow_reader::selection::RowSelectionStrategy; +use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; +use crate::arrow::arrow_reader::selection::{ + LoadedRowRanges, RowSelectionPolicy, RowSelectionShape, RowSelectionStrategy, + RowSelectionStrategyDecision, RowSelectionStrategyReason, +}; use crate::arrow::arrow_reader::{ ArrowPredicate, ParquetRecordBatchReader, RowSelection, RowSelectionCursor, RowSelector, }; @@ -30,6 +33,12 @@ use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder}; use arrow_select::filter::prep_null_mask_filter; use std::collections::VecDeque; +const HIGH_SELECTED_RATIO_NUMERATOR: usize = 7; +const HIGH_SELECTED_RATIO_DENOMINATOR: usize = 8; +const FRAGMENTED_SELECTED_RUN_LIMIT: usize = 4; +const CLUSTERED_SELECTED_RUN_MULTIPLIER: usize = 4; +const CLUSTERED_SKIPPED_RUN_MULTIPLIER: usize = 4; + /// Options for [`ReadPlanBuilder::with_predicate_options`]. 
pub struct PredicateOptions<'a> { array_reader: Box, @@ -84,6 +93,8 @@ pub struct ReadPlanBuilder { selection: Option, /// Policy to use when materializing the row selection row_selection_policy: RowSelectionPolicy, + /// Row ranges already loaded by page pruning + loaded_row_ranges: Option, } impl ReadPlanBuilder { @@ -93,6 +104,7 @@ impl ReadPlanBuilder { batch_size, selection: None, row_selection_policy: RowSelectionPolicy::default(), + loaded_row_ranges: None, } } @@ -110,6 +122,11 @@ impl ReadPlanBuilder { self } + pub(crate) fn with_loaded_row_ranges(mut self, loaded: Option) -> Self { + self.loaded_row_ranges = loaded; + self + } + /// Returns the current row selection policy pub fn row_selection_policy(&self) -> &RowSelectionPolicy { &self.row_selection_policy @@ -147,36 +164,35 @@ impl ReadPlanBuilder { /// Returns the [`RowSelectionStrategy`] for this plan. /// /// Guarantees to return either `Selectors` or `Mask`, never `Auto`. + #[cfg(test)] pub(crate) fn resolve_selection_strategy(&self) -> RowSelectionStrategy { + self.resolve_selection_strategy_decision().strategy + } + + pub(crate) fn resolve_selection_strategy_decision(&self) -> RowSelectionStrategyDecision { + let shape = RowSelectionShape::from_selection(self.selection.as_ref()); + match self.row_selection_policy { - RowSelectionPolicy::Selectors => RowSelectionStrategy::Selectors, - RowSelectionPolicy::Mask => RowSelectionStrategy::Mask, + RowSelectionPolicy::Selectors => RowSelectionStrategyDecision::new( + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::ForcedSelectors, + shape, + ), + RowSelectionPolicy::Mask => RowSelectionStrategyDecision::new( + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::ForcedMask, + shape, + ), RowSelectionPolicy::Auto { threshold, .. 
} => { - let selection = match self.selection.as_ref() { - Some(selection) => selection, - None => return RowSelectionStrategy::Selectors, - }; - - // total_rows: total number of rows selected / skipped - // effective_count: number of non-empty selectors - let (total_rows, effective_count) = - selection.iter().fold((0usize, 0usize), |(rows, count), s| { - if s.row_count > 0 { - (rows + s.row_count, count + 1) - } else { - (rows, count) - } - }); - - if effective_count == 0 { - return RowSelectionStrategy::Mask; + if self.selection.is_none() { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::AutoSelectorLongRuns, + shape, + ); } - if total_rows < effective_count.saturating_mul(threshold) { - RowSelectionStrategy::Mask - } else { - RowSelectionStrategy::Selectors - } + resolve_auto_selection_strategy(threshold, shape) } } } @@ -293,25 +309,43 @@ impl ReadPlanBuilder { self.selection = Some(RowSelection::from(vec![])); } + self.build_with_metrics(&ArrowReaderMetrics::disabled()) + } + + /// Create a final `ReadPlan` and record row-selection planning metrics. 
+ pub(crate) fn build_with_metrics(mut self, metrics: &ArrowReaderMetrics) -> ReadPlan { + // If selection is empty, truncate + if !self.selects_any() { + self.selection = Some(RowSelection::from(vec![])); + } + // Preferred strategy must not be Auto - let selection_strategy = self.resolve_selection_strategy(); + let selection_strategy_decision = self.resolve_selection_strategy_decision(); + let selection_strategy = selection_strategy_decision.strategy; let Self { batch_size, selection, row_selection_policy: _, + loaded_row_ranges, } = self; let selection = selection.map(|s| s.trim()); + if matches!(metrics, ArrowReaderMetrics::Enabled(_)) && selection.is_some() { + let shape = RowSelectionShape::from_selection(selection.as_ref()); + metrics.record_row_selection(selection_strategy_decision.with_shape(shape)); + } let row_selection_cursor = selection .map(|s| { - let trimmed = s.trim(); - let selectors: Vec = trimmed.into(); + let selectors: Vec = s.into(); match selection_strategy { - RowSelectionStrategy::Mask => { - RowSelectionCursor::new_mask_from_selectors(selectors) - } + RowSelectionStrategy::Mask => match loaded_row_ranges { + Some(loaded) => { + RowSelectionCursor::new_sparse_mask_from_selectors(selectors, loaded) + } + None => RowSelectionCursor::new_mask_from_selectors(selectors), + }, RowSelectionStrategy::Selectors => RowSelectionCursor::new_selectors(selectors), } }) @@ -324,6 +358,111 @@ impl ReadPlanBuilder { } } +fn resolve_auto_selection_strategy( + threshold: usize, + shape: RowSelectionShape, +) -> RowSelectionStrategyDecision { + if shape.selector_count == 0 || shape.selected_rows == 0 { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskEmptySelection, + shape, + ); + } + + if clustered_selection_at_or_above_threshold(shape, threshold) { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::AutoSelectorClusteredSelection, + 
shape, + ); + } + + if shape.skipped_rows > 0 + && selected_ratio_at_least( + shape, + HIGH_SELECTED_RATIO_NUMERATOR, + HIGH_SELECTED_RATIO_DENOMINATOR, + ) + { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskHighSelectedRatio, + shape, + ); + } + + if shape.selected_run_count > 1 + && shape.average_selected_run_length() <= FRAGMENTED_SELECTED_RUN_LIMIT as f64 + && selection_density_at_or_above_threshold(shape, threshold) + { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskFragmentedSelection, + shape, + ); + } + + if shape.selected_run_count > 0 + && shape.average_selected_run_length() + >= threshold.saturating_mul(CLUSTERED_SELECTED_RUN_MULTIPLIER) as f64 + && shape.average_skipped_run_length() > 0.0 + && shape.selected_ratio() <= 0.5 + { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::AutoSelectorClusteredSelection, + shape, + ); + } + + if shape.total_rows() < shape.selector_count.saturating_mul(threshold) { + RowSelectionStrategyDecision::new( + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskShortRuns, + shape, + ) + } else { + RowSelectionStrategyDecision::new( + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::AutoSelectorLongRuns, + shape, + ) + } +} + +fn selected_ratio_at_least(shape: RowSelectionShape, numerator: usize, denominator: usize) -> bool { + (shape.selected_rows as u128) * (denominator as u128) + >= (shape.total_rows() as u128) * (numerator as u128) +} + +fn selection_density_at_or_above_threshold(shape: RowSelectionShape, threshold: usize) -> bool { + (shape.total_rows() as u128) <= (shape.selector_count as u128) * (threshold as u128) +} + +fn clustered_selection_at_or_above_threshold(shape: RowSelectionShape, threshold: usize) -> bool { + average_run_length_at_least( + shape.selected_rows, + shape.selected_run_count, + 
threshold, + CLUSTERED_SELECTED_RUN_MULTIPLIER, + ) && average_run_length_at_least( + shape.skipped_rows, + shape.skipped_run_count, + threshold, + CLUSTERED_SKIPPED_RUN_MULTIPLIER, + ) +} + +fn average_run_length_at_least( + rows: usize, + runs: usize, + threshold: usize, + multiplier: usize, +) -> bool { + runs > 0 && (rows as u128) >= (runs as u128) * (threshold as u128) * (multiplier as u128) +} + /// Builder for [`ReadPlan`] that applies a limit and offset to the read plan /// /// See [`ReadPlanBuilder::limited`] to create this builder. @@ -480,6 +619,346 @@ mod tests { ReadPlanBuilder::new(1024).with_selection(Some(selection)) } + fn assert_strategy_decision( + builder: ReadPlanBuilder, + strategy: RowSelectionStrategy, + reason: RowSelectionStrategyReason, + selected_rows: usize, + skipped_rows: usize, + selector_count: usize, + selected_run_count: usize, + skipped_run_count: usize, + ) { + let decision = builder.resolve_selection_strategy_decision(); + assert_eq!(decision.strategy, strategy); + assert_eq!(decision.reason, reason); + assert_eq!(decision.shape.selected_rows, selected_rows); + assert_eq!(decision.shape.skipped_rows, skipped_rows); + assert_eq!(decision.shape.selector_count, selector_count); + assert_eq!(decision.shape.selected_run_count, selected_run_count); + assert_eq!(decision.shape.skipped_run_count, skipped_run_count); + } + + #[test] + fn row_group_execution_modes_cover_pushdown_and_post_filter() { + use crate::arrow::arrow_reader::selection::{RowGroupExecutionMode, RowSelectionStrategy}; + + assert_eq!( + RowGroupExecutionMode::Pushdown(RowSelectionStrategy::Mask).to_string(), + "Pushdown(Mask)" + ); + assert_eq!( + RowGroupExecutionMode::Pushdown(RowSelectionStrategy::Selectors).to_string(), + "Pushdown(Selectors)" + ); + assert_eq!(RowGroupExecutionMode::PostFilter.to_string(), "PostFilter"); + } + + #[test] + fn fallback_classifier_triggers_for_fragmented_high_selectivity() { + use crate::arrow::arrow_reader::selection::{ + 
FallbackObservation, FallbackTriggerReason, RowSelectionShape, + }; + + let observation = FallbackObservation { + observed_row_groups: 2, + shape: RowSelectionShape { + selected_rows: 128, + skipped_rows: 64, + selector_count: 96, + selected_run_count: 64, + skipped_run_count: 32, + }, + predicate_evaluate_nanos: 10, + output_read_nanos: 20, + output_materialize_nanos: 50, + cache_miss_count: 0, + cache_insert_rejected_count: 0, + }; + + assert_eq!( + observation.trigger_reason(), + FallbackTriggerReason::FragmentedHighSelectivityMaterialization + ); + } + + #[test] + fn fallback_classifier_waits_for_observation_window() { + use crate::arrow::arrow_reader::selection::{ + FallbackObservation, FallbackTriggerReason, RowSelectionShape, + }; + + let observation = FallbackObservation { + observed_row_groups: 0, + shape: RowSelectionShape { + selected_rows: 64, + skipped_rows: 64, + selector_count: 64, + selected_run_count: 32, + skipped_run_count: 32, + }, + predicate_evaluate_nanos: 10, + output_read_nanos: 20, + output_materialize_nanos: 50, + cache_miss_count: 0, + cache_insert_rejected_count: 0, + }; + + assert_eq!( + observation.trigger_reason(), + FallbackTriggerReason::ObservationIncomplete + ); + } + + #[test] + fn fallback_classifier_triggers_for_high_selectivity_without_pruning() { + use crate::arrow::arrow_reader::selection::{ + FallbackObservation, FallbackTriggerReason, RowSelectionShape, + }; + + let observation = FallbackObservation { + observed_row_groups: 2, + shape: RowSelectionShape { + selected_rows: 200, + skipped_rows: 0, + selector_count: 2, + selected_run_count: 2, + skipped_run_count: 0, + }, + predicate_evaluate_nanos: 0, + output_read_nanos: 0, + output_materialize_nanos: 0, + cache_miss_count: 0, + cache_insert_rejected_count: 0, + }; + + assert_eq!( + observation.trigger_reason(), + FallbackTriggerReason::HighSelectivityNoPruning + ); + } + + #[test] + fn fallback_classifier_triggers_for_fragmented_moderate_selectivity() { + use 
crate::arrow::arrow_reader::selection::{ + FallbackObservation, FallbackTriggerReason, RowSelectionShape, + }; + + let observation = FallbackObservation { + observed_row_groups: 2, + shape: RowSelectionShape { + selected_rows: 30, + skipped_rows: 170, + selector_count: 60, + selected_run_count: 30, + skipped_run_count: 30, + }, + predicate_evaluate_nanos: 0, + output_read_nanos: 0, + output_materialize_nanos: 0, + cache_miss_count: 0, + cache_insert_rejected_count: 0, + }; + + assert_eq!( + observation.trigger_reason(), + FallbackTriggerReason::FragmentedModerateSelectivity + ); + } + + #[test] + fn fallback_classifier_triggers_for_fragmented_near_ten_percent_selectivity() { + use crate::arrow::arrow_reader::selection::{ + FallbackObservation, FallbackTriggerReason, RowSelectionShape, + }; + + let observation = FallbackObservation { + observed_row_groups: 1, + shape: RowSelectionShape { + selected_rows: 9, + skipped_rows: 91, + selector_count: 18, + selected_run_count: 9, + skipped_run_count: 9, + }, + predicate_evaluate_nanos: 0, + output_read_nanos: 0, + output_materialize_nanos: 0, + cache_miss_count: 0, + cache_insert_rejected_count: 0, + }; + + assert_eq!( + observation.trigger_reason(), + FallbackTriggerReason::FragmentedModerateSelectivity + ); + } + + #[test] + fn fallback_classifier_keeps_q38_like_low_selectivity_fragmented_pushdown() { + use crate::arrow::arrow_reader::selection::{ + FallbackObservation, FallbackTriggerReason, RowSelectionShape, + }; + + let observation = FallbackObservation { + observed_row_groups: 1, + shape: RowSelectionShape { + selected_rows: 4_870, + skipped_rows: 57_698, + selector_count: 6_168, + selected_run_count: 3_084, + skipped_run_count: 3_084, + }, + predicate_evaluate_nanos: 0, + output_read_nanos: 0, + output_materialize_nanos: 0, + cache_miss_count: 0, + cache_insert_rejected_count: 0, + }; + + assert_eq!( + observation.trigger_reason(), + FallbackTriggerReason::PushdownStillPreferred + ); + } + + #[test] + fn 
fallback_classifier_keeps_low_selectivity_fragmented_pushdown() { + use crate::arrow::arrow_reader::selection::{ + FallbackObservation, FallbackTriggerReason, RowSelectionShape, + }; + + let observation = FallbackObservation { + observed_row_groups: 1, + shape: RowSelectionShape { + selected_rows: 4, + skipped_rows: 196, + selector_count: 8, + selected_run_count: 4, + skipped_run_count: 4, + }, + predicate_evaluate_nanos: 10, + output_read_nanos: 20, + output_materialize_nanos: 50, + cache_miss_count: 0, + cache_insert_rejected_count: 0, + }; + + assert_eq!( + observation.trigger_reason(), + FallbackTriggerReason::PushdownStillPreferred + ); + } + + #[test] + fn selection_strategy_decision_records_forced_mask() { + let selection = RowSelection::from(vec![RowSelector::skip(2), RowSelector::select(8)]); + let builder = + builder_with_selection(selection).with_row_selection_policy(RowSelectionPolicy::Mask); + + assert_strategy_decision( + builder, + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::ForcedMask, + 8, + 2, + 2, + 1, + 1, + ); + } + + #[test] + fn selection_strategy_decision_records_forced_selectors() { + let selection = RowSelection::from(vec![RowSelector::skip(2), RowSelector::select(8)]); + let builder = builder_with_selection(selection) + .with_row_selection_policy(RowSelectionPolicy::Selectors); + + assert_strategy_decision( + builder, + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::ForcedSelectors, + 8, + 2, + 2, + 1, + 1, + ); + } + + #[test] + fn selection_strategy_decision_records_auto_empty_selection() { + let selection = RowSelection::from(vec![]); + let builder = builder_with_selection(selection); + + assert_strategy_decision( + builder, + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskEmptySelection, + 0, + 0, + 0, + 0, + 0, + ); + } + + #[test] + fn selection_strategy_decision_records_auto_short_runs() { + let selection = RowSelection::from(vec![RowSelector::select(8), RowSelector::skip(8)]); + let 
builder = builder_with_selection(selection); + + assert_strategy_decision( + builder, + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskShortRuns, + 8, + 8, + 2, + 1, + 1, + ); + } + + #[test] + fn selection_strategy_decision_records_auto_long_runs() { + let selection = RowSelection::from(vec![RowSelector::select(3), RowSelector::skip(3)]); + let builder = builder_with_selection(selection) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1 }); + + assert_strategy_decision( + builder, + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::AutoSelectorLongRuns, + 3, + 3, + 2, + 1, + 1, + ); + } + + #[test] + fn build_metrics_records_structured_strategy_decision_shape() { + let metrics = ArrowReaderMetrics::enabled(); + let selection = RowSelection::from(vec![RowSelector::select(8), RowSelector::skip(4)]); + let builder = builder_with_selection(selection); + + builder.build_with_metrics(&metrics); + + assert_eq!(metrics.row_selection_selected_rows(), Some(8)); + assert_eq!(metrics.row_selection_skipped_rows(), Some(0)); + assert_eq!(metrics.row_selection_selector_count(), Some(1)); + assert_eq!(metrics.row_selection_selected_run_count(), Some(1)); + assert_eq!(metrics.row_selection_skipped_run_count(), Some(0)); + assert_eq!(metrics.row_selection_mask_plan_count(), Some(1)); + assert_eq!(metrics.row_selection_selector_plan_count(), Some(0)); + assert_eq!( + metrics.row_selection_auto_mask_short_run_plan_count(), + Some(1) + ); + } + #[test] fn preferred_selection_strategy_prefers_mask_by_default() { let selection = RowSelection::from(vec![RowSelector::select(8)]); @@ -492,7 +971,7 @@ mod tests { #[test] fn preferred_selection_strategy_prefers_selectors_when_threshold_small() { - let selection = RowSelection::from(vec![RowSelector::select(8)]); + let selection = RowSelection::from(vec![RowSelector::select(3), RowSelector::skip(3)]); let builder = builder_with_selection(selection) 
.with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1 }); assert_eq!( @@ -501,6 +980,160 @@ mod tests { ); } + #[test] + fn auto_strategy_prefers_mask_for_fragmented_selected_rows_at_threshold_boundary() { + let selectors: Vec = (0..64) + .flat_map(|_| [RowSelector::select(1), RowSelector::skip(63)]) + .collect(); + let builder = builder_with_selection(RowSelection::from(selectors)); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Mask); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoMaskFragmentedSelection + ); + assert_eq!(decision.shape.selected_run_count, 64); + assert_eq!(decision.shape.average_selected_run_length(), 1.0); + } + + #[test] + fn auto_strategy_prefers_mask_for_high_selected_ratio() { + let selection = RowSelection::from(vec![ + RowSelector::select(900), + RowSelector::skip(25), + RowSelector::select(75), + ]); + let builder = builder_with_selection(selection); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Mask); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoMaskHighSelectedRatio + ); + } + + #[test] + fn auto_strategy_prefers_selectors_for_clustered_high_selected_ratio() { + let selectors: Vec = (0..10) + .flat_map(|_| [RowSelector::select(9000), RowSelector::skip(1000)]) + .collect(); + let selection = RowSelection::from(selectors); + let builder = builder_with_selection(selection); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Selectors); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoSelectorClusteredSelection + ); + } + + #[test] + fn auto_strategy_prefers_selectors_for_clustered_long_selected_runs() { + let selection = + RowSelection::from(vec![RowSelector::skip(9000), RowSelector::select(1000)]); + let builder = builder_with_selection(selection); 
+ + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Selectors); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoSelectorClusteredSelection + ); + } + + #[test] + fn auto_strategy_prefers_selectors_for_long_single_selected_run_with_no_skips() { + let selection = RowSelection::from(vec![RowSelector::select(1024)]); + let builder = builder_with_selection(selection); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Selectors); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoSelectorLongRuns + ); + } + + #[test] + fn auto_strategy_prefers_selectors_for_tiny_runs_separated_by_huge_skip() { + let selection = RowSelection::from(vec![ + RowSelector::select(4), + RowSelector::skip(100_000), + RowSelector::select(4), + ]); + let builder = builder_with_selection(selection); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Selectors); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoSelectorLongRuns + ); + } + + #[test] + fn auto_strategy_prefers_selectors_for_huge_half_selected_ratio_without_saturation() { + let selection = RowSelection::from(vec![ + RowSelector::select(usize::MAX / 2), + RowSelector::skip(usize::MAX / 2), + ]); + let builder = builder_with_selection(selection); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Selectors); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoSelectorClusteredSelection + ); + } + + #[test] + fn build_metrics_records_shape_aware_strategy_reasons() { + let metrics = ArrowReaderMetrics::enabled(); + let fragmented_selectors: Vec = (0..64) + .flat_map(|_| [RowSelector::select(1), RowSelector::skip(63)]) + .collect(); + + 
builder_with_selection(RowSelection::from(fragmented_selectors)) + .build_with_metrics(&metrics); + builder_with_selection(RowSelection::from(vec![ + RowSelector::select(900), + RowSelector::skip(25), + RowSelector::select(75), + ])) + .build_with_metrics(&metrics); + builder_with_selection(RowSelection::from(vec![ + RowSelector::skip(9000), + RowSelector::select(1000), + ])) + .build_with_metrics(&metrics); + + assert_eq!(metrics.row_selection_mask_plan_count(), Some(2)); + assert_eq!(metrics.row_selection_selector_plan_count(), Some(1)); + assert_eq!( + metrics.row_selection_auto_mask_fragmented_plan_count(), + Some(1) + ); + assert_eq!( + metrics.row_selection_auto_mask_high_ratio_plan_count(), + Some(1) + ); + assert_eq!( + metrics.row_selection_auto_selector_clustered_plan_count(), + Some(1) + ); + } + #[test] fn truncate_filter_after_n_trues_keeps_first_n_matches() { let f = BooleanArray::from(vec![true, false, true, true, false, true, true]); diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 2ddf812f9c39..db4dc55c25d9 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::arrow::ProjectionMask;
 use crate::errors::ParquetError;
-use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation};
+use crate::file::page_index::offset_index::PageLocation;
 use arrow_array::{Array, BooleanArray};
 use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder};
 use arrow_select::filter::SlicesIterator;
@@ -60,6 +59,285 @@ pub(crate) enum RowSelectionStrategy {
     Mask,
 }
 
+/// Execution mode of a row group: predicate pushdown using a concrete
+/// [`RowSelectionStrategy`], or post-filtering.
+// NOTE(review): `dead_code` is presumably allowed because some builds do not use this yet.
+#[allow(dead_code)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) enum RowGroupExecutionMode {
+    Pushdown(RowSelectionStrategy),
+    PostFilter,
+}
+
+impl std::fmt::Display for RowGroupExecutionMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Pushdown(RowSelectionStrategy::Mask) => f.write_str("Pushdown(Mask)"),
+            Self::Pushdown(RowSelectionStrategy::Selectors) => f.write_str("Pushdown(Selectors)"),
+            Self::PostFilter => f.write_str("PostFilter"),
+        }
+    }
+}
+
+/// Why a final row-selection read plan used masks or selectors.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) enum RowSelectionStrategyReason {
+    /// The caller explicitly requested masks.
+    ForcedMask,
+    /// The caller explicitly requested selectors.
+    ForcedSelectors,
+    /// Auto chose masks because the selection has no non-empty selectors.
+    AutoMaskEmptySelection,
+    /// Auto chose masks because average selector length is below the threshold.
+    AutoMaskShortRuns,
+    /// Auto chose masks because selected rows are fragmented into many short runs.
+    AutoMaskFragmentedSelection,
+    /// Auto chose masks because most rows are selected and selector skipping is unlikely to pay off.
+    AutoMaskHighSelectedRatio,
+    /// Auto chose selectors because selected rows are clustered into long runs.
+    AutoSelectorClusteredSelection,
+    /// Auto chose selectors because average selector length reaches the threshold.
+    AutoSelectorLongRuns,
+}
+
+/// Shape summary for a [`RowSelection`].
+///
+/// Zero-length selectors are ignored by [`Self::from_selection`].
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub(crate) struct RowSelectionShape {
+    /// Rows covered by non-skip selectors.
+    pub(crate) selected_rows: usize,
+    /// Rows covered by skip selectors.
+    pub(crate) skipped_rows: usize,
+    /// Number of non-empty selectors of either kind.
+    pub(crate) selector_count: usize,
+    /// Number of non-empty non-skip selectors.
+    pub(crate) selected_run_count: usize,
+    /// Number of non-empty skip selectors.
+    pub(crate) skipped_run_count: usize,
+}
+
+impl RowSelectionShape {
+    /// Summarises `selection`; `None` yields the all-zero default shape.
+    pub(crate) fn from_selection(selection: Option<&RowSelection>) -> Self {
+        let Some(selection) = selection else {
+            return Self::default();
+        };
+
+        selection
+            .iter()
+            .fold(Self::default(), |mut shape, selector| {
+                if selector.row_count == 0 {
+                    return shape;
+                }
+
+                shape.selector_count += 1;
+                if selector.skip {
+                    shape.skipped_rows += selector.row_count;
+                    shape.skipped_run_count += 1;
+                } else {
+                    shape.selected_rows += selector.row_count;
+                    shape.selected_run_count += 1;
+                }
+                shape
+            })
+    }
+
+    /// Selected plus skipped rows.
+    pub(crate) fn total_rows(self) -> usize {
+        self.selected_rows + self.skipped_rows
+    }
+
+    /// Fraction of rows that are selected, or `0.0` for an empty shape.
+    pub(crate) fn selected_ratio(self) -> f64 {
+        let total = self.total_rows();
+        if total == 0 {
+            0.0
+        } else {
+            self.selected_rows as f64 / total as f64
+        }
+    }
+
+    /// Selectors per row, or `0.0` for an empty shape.
+    #[allow(dead_code)]
+    pub(crate) fn run_density(self) -> f64 {
+        let total = self.total_rows();
+        if total == 0 {
+            0.0
+        } else {
+            self.selector_count as f64 / total as f64
+        }
+    }
+
+    /// Mean length of a selected run, or `0.0` when there is none.
+    pub(crate) fn average_selected_run_length(self) -> f64 {
+        average_run_length(self.selected_rows, self.selected_run_count)
+    }
+
+    /// Mean length of a skipped run, or `0.0` when there is none.
+    pub(crate) fn average_skipped_run_length(self) -> f64 {
+        average_run_length(self.skipped_rows, self.skipped_run_count)
+    }
+
+    /// Adds every counter of `other` to `self`.
+    pub(crate) fn add_assign(&mut self, other: Self) {
+        self.selected_rows += other.selected_rows;
+        self.skipped_rows += other.skipped_rows;
+        self.selector_count += other.selector_count;
+        self.selected_run_count += other.selected_run_count;
+        self.skipped_run_count += other.skipped_run_count;
+    }
+}
+
+/// Reason reported by the fallback classifier, see [`FallbackObservation::trigger_reason`].
+#[allow(dead_code)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) enum FallbackTriggerReason {
+    HighSelectivityNoPruning,
+    FragmentedModerateSelectivity,
+    FragmentedHighSelectivityMaterialization,
+    FragmentedHighSelectivityOutputDominates,
+    FragmentedHighSelectivityCacheMiss,
+    FragmentedHighSelectivityCacheRejected,
+    ObservationIncomplete,
+    PushdownStillPreferred,
+    ForcedPolicy,
+}
+
+/// Selection shape, timings and cache counters that [`Self::trigger_reason`] classifies.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub(crate) struct FallbackObservation {
+    pub(crate) observed_row_groups: usize,
+    pub(crate) shape: RowSelectionShape,
+    pub(crate) predicate_evaluate_nanos: usize,
+    pub(crate) output_read_nanos: usize,
+    pub(crate) output_materialize_nanos: usize,
+    pub(crate) cache_miss_count: usize,
+    pub(crate) cache_insert_rejected_count: usize,
+}
+
+impl FallbackObservation {
+    /// Observed row groups required before [`Self::trigger_reason`] returns anything but
+    /// [`FallbackTriggerReason::ObservationIncomplete`].
+    pub(crate) const OBSERVATION_ROW_GROUPS: usize = 1;
+    /// Minimum selected ratio (with no skipped rows) for `HighSelectivityNoPruning`.
+    const HIGH_SELECTIVITY_NO_PRUNING_MIN_RATIO: f64 = 0.95;
+    /// A selection is fragmented when selected runs average at most this many rows...
+    const FRAGMENTED_MAX_AVERAGE_SELECTED_RUN_LENGTH: f64 = 4.0;
+    /// ...and it has at least this many selectors per row.
+    const FRAGMENTED_MIN_RUN_DENSITY: f64 = 0.01;
+    /// Minimum selected ratio of a fragmented selection for `FragmentedModerateSelectivity`.
+    const FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO: f64 = 0.08;
+    /// Selected ratio from which a fragmented selection is judged on its measured costs.
+    const FRAGMENTED_HIGH_SELECTIVITY_MIN_RATIO: f64 = 0.50;
+
+    /// Classifies this observation; checks run in priority order and the first match wins.
+    pub(crate) fn trigger_reason(self) -> FallbackTriggerReason {
+        if self.observed_row_groups < Self::OBSERVATION_ROW_GROUPS {
+            return FallbackTriggerReason::ObservationIncomplete;
+        }
+
+        let shape = self.shape;
+        let selected_ratio = shape.selected_ratio();
+        if shape.total_rows() > 0
+            && shape.skipped_rows == 0
+            && selected_ratio >= Self::HIGH_SELECTIVITY_NO_PRUNING_MIN_RATIO
+        {
+            return FallbackTriggerReason::HighSelectivityNoPruning;
+        }
+
+        let fragmented = shape.average_selected_run_length()
+            <= Self::FRAGMENTED_MAX_AVERAGE_SELECTED_RUN_LENGTH
+            && shape.run_density() >= Self::FRAGMENTED_MIN_RUN_DENSITY;
+        if !fragmented {
+            return FallbackTriggerReason::PushdownStillPreferred;
+        }
+
+        if selected_ratio < Self::FRAGMENTED_HIGH_SELECTIVITY_MIN_RATIO {
+            return if selected_ratio >= Self::FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO {
+                FallbackTriggerReason::FragmentedModerateSelectivity
+            } else {
+                FallbackTriggerReason::PushdownStillPreferred
+            };
+        }
+
+        // Fragmented with at least half of the rows selected: the measured costs decide.
+        if self.output_materialize_nanos >= self.predicate_evaluate_nanos {
+            return FallbackTriggerReason::FragmentedHighSelectivityMaterialization;
+        }
+        if self.output_materialize_nanos > self.output_read_nanos {
+            return FallbackTriggerReason::FragmentedHighSelectivityOutputDominates;
+        }
+        if self.cache_miss_count > 0 {
+            return FallbackTriggerReason::FragmentedHighSelectivityCacheMiss;
+        }
+        if self.cache_insert_rejected_count > 0 {
+            return FallbackTriggerReason::FragmentedHighSelectivityCacheRejected;
+        }
+
+        FallbackTriggerReason::PushdownStillPreferred
+    }
+
+    /// Returns `true` when [`Self::trigger_reason`] yields one of the fallback reasons.
+    #[allow(dead_code)]
+    pub(crate) fn should_fallback(self) -> bool {
+        // Exhaustive on purpose: a new reason must be classified explicitly.
+        match self.trigger_reason() {
+            FallbackTriggerReason::HighSelectivityNoPruning
+            | FallbackTriggerReason::FragmentedModerateSelectivity
+            | FallbackTriggerReason::FragmentedHighSelectivityMaterialization
+            | FallbackTriggerReason::FragmentedHighSelectivityOutputDominates
+            | FallbackTriggerReason::FragmentedHighSelectivityCacheMiss
+            | FallbackTriggerReason::FragmentedHighSelectivityCacheRejected => true,
+            FallbackTriggerReason::ObservationIncomplete
+            | FallbackTriggerReason::PushdownStillPreferred
+            | FallbackTriggerReason::ForcedPolicy => false,
+        }
+    }
+}
+
+/// Fully resolved decision for materializing a [`RowSelection`].
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) struct RowSelectionStrategyDecision {
+    pub(crate) strategy: RowSelectionStrategy,
+    pub(crate) reason: RowSelectionStrategyReason,
+    pub(crate) shape: RowSelectionShape,
+}
+
+impl RowSelectionStrategyDecision {
+    /// Bundles a strategy with the reason for it and the shape it applies to.
+    pub(crate) fn new(
+        strategy: RowSelectionStrategy,
+        reason: RowSelectionStrategyReason,
+        shape: RowSelectionShape,
+    ) -> Self {
+        Self {
+            strategy,
+            reason,
+            shape,
+        }
+    }
+
+    /// Returns the same decision with `shape` replaced.
+    pub(crate) fn with_shape(self, shape: RowSelectionShape) -> Self {
+        Self { shape, ..self }
+    }
+
+    /// Returns `true` when the strategy is [`RowSelectionStrategy::Mask`].
+    pub(crate) fn uses_mask(self) -> bool {
+        matches!(self.strategy, RowSelectionStrategy::Mask)
+    }
+}
+
+/// Returns `rows / runs` as a float, or `0.0` when `runs` is zero.
+fn average_run_length(rows: usize, runs: usize) -> f64 {
+    if runs == 0 {
+        0.0
+    } else {
+        rows as f64 / runs as f64
+    }
+}
+
 /// [`RowSelection`] is a collection of [`RowSelector`] used to skip rows when
 /// scanning a parquet file
 #[derive(Debug, Clone, Copy, Eq, PartialEq)]
@@ -250,37 +528,36 @@ impl RowSelection {
         ranges
     }
 
-    /// Returns true if this selection would skip any data pages within the provided columns
-    fn selection_skips_any_page(
+    /// Returns the row range of every page in `page_locations` that this selection reads.
+    ///
+    /// A page counts as read when [`Self::scan_ranges`] reports its exact byte range. Its rows
+    /// run from its own `first_row_index` to the next page's, or to `total_rows` for the last.
+    pub(crate) fn selected_page_row_ranges(
         &self,
-        projection: &ProjectionMask,
-        columns: &[OffsetIndexMetaData],
-    ) -> bool {
-        columns.iter().enumerate().any(|(leaf_idx, column)| {
-            if !projection.leaf_included(leaf_idx) {
-                return false;
-            }
-
-            let locations = column.page_locations();
-            if locations.is_empty() {
-                return false;
-            }
-
-            let ranges = self.scan_ranges(locations);
-            !ranges.is_empty() && ranges.len() < locations.len()
-        })
-    }
+        page_locations: &[PageLocation],
+        total_rows: usize,
+    ) -> Vec<Range<usize>> {
+        // Hash the selected byte ranges once so that each page is a single lookup.
+        let selected_byte_ranges: std::collections::HashSet<_> =
+            self.scan_ranges(page_locations).into_iter().collect();
+        page_locations
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, page)| {
+                let start = page.offset as u64;
+                let end = start + page.compressed_page_size as u64;
+                if !selected_byte_ranges.contains(&(start..end)) {
+                    return None;
+                }
 
-    /// Returns true if selectors should be forced, preventing mask materialisation
-    pub(crate) fn should_force_selectors(
-        &self,
-        projection: &ProjectionMask,
-        offset_index: Option<&[OffsetIndexMetaData]>,
-    ) -> bool {
-        match offset_index {
-            Some(columns) => self.selection_skips_any_page(projection, columns),
-            None => false,
-        }
+                let row_start = page.first_row_index as usize;
+                let row_end = page_locations
+                    .get(idx + 1)
+                    .map(|next| next.first_row_index as usize)
+                    .unwrap_or(total_rows);
+                Some(row_start..row_end)
+            })
+            .collect()
     }
 
     /// Splits off the first `row_count` from this [`RowSelection`]
@@ -430,6 +707,11 @@ impl RowSelection {
         self.selectors.iter().any(|x| !x.skip)
     }
 
+    /// Builds a [`BooleanBuffer`] from this selection's selectors.
+    pub(crate) fn boolean_mask(&self) -> BooleanBuffer {
+        boolean_mask_from_selectors(&self.selectors)
+    }
+
     /// Trims this [`RowSelection`] removing any trailing skips
     pub(crate) fn trim(mut self) -> Self {
         while self.selectors.last().map(|x| x.skip).unwrap_or(false) {
@@ -767,27 +1049,58 @@ fn union_row_selections(left: &[RowSelector], right: &[RowSelector]) -> RowSelec
 /// or selections. For example, selecting every other row.
 #[derive(Debug)]
 pub struct MaskCursor {
-    mask: BooleanBuffer,
-    /// Current absolute offset into the selection
-    position: usize,
+    inner: MaskCursorInner,
+}
+
+/// Backing state of a [`MaskCursor`].
+#[derive(Debug)]
+enum MaskCursorInner {
+    /// A mask together with the cursor position within it.
+    Dense {
+        mask: BooleanBuffer,
+        /// Current absolute offset into the selection
+        position: usize,
+    },
+    /// State delegated to a [`SparseMaskCursor`].
+    Sparse(SparseMaskCursor),
 }
 
 impl MaskCursor {
     /// Returns `true` when no further rows remain
     pub fn is_empty(&self) -> bool {
-        self.position >= self.mask.len()
+        match &self.inner {
+            MaskCursorInner::Dense { mask, position } => *position >= mask.len(),
+            MaskCursorInner::Sparse(cursor) => cursor.is_empty(),
+        }
+    }
+
+    /// Returns `true` when this cursor is backed by a [`SparseMaskCursor`].
+    pub(crate) fn is_sparse(&self) -> bool {
+        matches!(self.inner, MaskCursorInner::Sparse(_))
+    }
+
+    /// Returns the backing [`SparseMaskCursor`], or `None` for a dense cursor.
+    pub(crate) fn sparse_mut(&mut self) -> Option<&mut SparseMaskCursor> {
+        match &mut self.inner {
+            MaskCursorInner::Sparse(cursor) => Some(cursor),
+            MaskCursorInner::Dense { .. } => None,
+        }
     }
 
     /// Advance through the mask representation, producing the next chunk summary
+    ///
+    /// Always returns `None` for a sparse cursor, even when rows remain.
     pub fn next_mask_chunk(&mut self, batch_size: usize) -> Option<MaskChunk> {
-        let (initial_skip, chunk_rows, selected_rows, mask_start, end_position) = {
-            let mask = &self.mask;
+        let MaskCursorInner::Dense { mask, position } = &mut self.inner else {
+            return None;
+        };
 
-            if self.position >= mask.len() {
+        let (initial_skip, chunk_rows, selected_rows, mask_start, end_position) = {
+            if *position >= mask.len() {
                 return None;
             }
 
-            let start_position = self.position;
+            let start_position = *position;
             let mut cursor = start_position;
             let mut initial_skip = 0;
 
@@ -814,7 +1127,7 @@ impl MaskCursor {
             (initial_skip, chunk_rows, selected_rows, mask_start, cursor)
         };
 
-        self.position = end_position;
+        *position = end_position;
 
         Some(MaskChunk {
             initial_skip,
@@ -826,13 +1139,21 @@ impl MaskCursor {
 
     /// Materialise the boolean values for a mask-backed chunk
+    ///
+    /// Returns an error for a sparse cursor, or when `chunk` exceeds the mask length.
     pub fn mask_values_for(&self, chunk: &MaskChunk) -> Result<BooleanArray, ParquetError> {
-        if chunk.mask_start.saturating_add(chunk.chunk_rows) > self.mask.len() {
+        let MaskCursorInner::Dense { mask, .. } = &self.inner else {
+            return Err(ParquetError::General(
+                "Internal Error: dense mask chunk requested from sparse mask cursor".to_string(),
+            ));
+        };
+
+        if chunk.mask_start.saturating_add(chunk.chunk_rows) > mask.len() {
             return Err(ParquetError::General(
                 "Internal Error: MaskChunk exceeds mask length".to_string(),
             ));
         }
         Ok(BooleanArray::from(
-            self.mask.slice(chunk.mask_start, chunk.chunk_rows),
+            mask.slice(chunk.mask_start, chunk.chunk_rows),
         ))
     }
 }
@@ -885,11 +1206,174 @@ pub struct MaskChunk {
     pub mask_start: usize,
 }
 
+/// Row ranges tracked as loaded, out of `total_rows` rows.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub(crate) struct LoadedRowRanges {
+    ranges: Vec<Range<usize>>,
+    total_rows: usize,
+}
+
+impl LoadedRowRanges {
+    /// Wraps `ranges`, which must be sorted, non-overlapping and end within `total_rows`.
+    ///
+    /// These invariants are only checked by `debug_assert!`.
+    pub(crate) fn new(ranges: Vec<Range<usize>>, total_rows: usize) -> Self {
+        debug_assert!(
+            ranges
+                .windows(2)
+                .all(|window| window[0].end <= window[1].start),
+            "loaded row ranges must be sorted and non-overlapping"
+        );
+        debug_assert!(
+            ranges
+                .iter()
+                .all(|range| range.start <= range.end && range.end <= total_rows),
+            "loaded row ranges must be valid within total_rows"
+        );
+        Self { ranges, total_rows }
+    }
+
+    /// Returns `false` only for a single `0..total_rows` range, or for no ranges and no rows.
+    pub(crate) fn is_sparse(&self) -> bool {
+        match self.ranges.as_slice() {
+            [] => self.total_rows != 0,
+            [range] => range.start != 0 || range.end != self.total_rows,
+            _ => true,
+        }
+    }
+
+    /// Returns the loaded range containing `row`, if any.
+    fn range_containing(&self, row: usize) -> Option<&Range<usize>> {
+        // The ranges are sorted and non-overlapping (see `new`), so their ends are ordered:
+        // binary-search the first range ending after `row`; it is the only candidate.
+        let idx = self.ranges.partition_point(|range| range.end <= row);
+        self.ranges.get(idx).filter(|range| range.start <= row)
+    }
+}
+
+/// A run of consecutive selected rows, addressed both by row and by mask position.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub(crate) struct MaskSegment {
+    pub row_range: Range<usize>,
+    pub mask_start: usize,
+    pub mask_len: usize,
+}
+
+/// Segments and selected-row total from [`SparseMaskCursor::next_sparse_mask_chunk`].
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub(crate) struct SparseMaskChunk {
+    pub segments: Vec<MaskSegment>,
+    pub selected_rows: usize,
+}
+
+/// Walks a selection mask while checking selected rows against [`LoadedRowRanges`].
+#[derive(Debug)]
+pub(crate) struct SparseMaskCursor {
+    mask: BooleanBuffer,
+    loaded: LoadedRowRanges,
+    position: usize,
+}
+
+impl SparseMaskCursor {
+    /// Builds the mask from `selectors` and starts at position 0.
+    pub(crate) fn new(selectors: Vec<RowSelector>, loaded: LoadedRowRanges) -> Self {
+        Self {
+            mask: boolean_mask_from_selectors(&selectors),
+            loaded,
+            position: 0,
+        }
+    }
+
+    /// Returns `true` once the position reaches the mask length or `total_rows`.
+    pub(crate) fn is_empty(&self) -> bool {
+        self.position >= self.mask.len() || self.position >= self.loaded.total_rows
+    }
+
+    /// Materialise the boolean values for `segment`, or error if it exceeds the mask.
+    pub(crate) fn mask_values_for(
+        &self,
+        segment: &MaskSegment,
+    ) -> Result<BooleanArray, ParquetError> {
+        if segment.mask_start.saturating_add(segment.mask_len) > self.mask.len() {
+            return Err(ParquetError::General(
+                "Internal Error: sparse mask segment exceeds mask length".to_string(),
+            ));
+        }
+        Ok(BooleanArray::from(
+            self.mask.slice(segment.mask_start, segment.mask_len),
+        ))
+    }
+
+    /// Returns the next runs of selected rows, holding at most `batch_size` selected rows.
+    ///
+    /// A run never crosses the end of a loaded range. Returns `Ok(None)` when no selected
+    /// row remains, and an error when a selected row is outside every loaded range.
+    ///
+    /// Note that a `batch_size` of zero also yields `Ok(None)` and exhausts the cursor.
+    pub(crate) fn next_sparse_mask_chunk(
+        &mut self,
+        batch_size: usize,
+    ) -> Result<Option<SparseMaskChunk>, ParquetError> {
+        if self.is_empty() {
+            return Ok(None);
+        }
+
+        let mut selected_rows = 0usize;
+        let mut segments = Vec::new();
+        let mut cursor = self.position;
+
+        while cursor < self.mask.len()
+            && cursor < self.loaded.total_rows
+            && selected_rows < batch_size
+        {
+            if !self.mask.value(cursor) {
+                cursor += 1;
+                continue;
+            }
+
+            let Some(loaded) = self.loaded.range_containing(cursor) else {
+                return Err(ParquetError::General(format!(
+                    "Internal Error: sparse mask selected row {cursor} outside loaded row ranges"
+                )));
+            };
+
+            let segment_start = cursor;
+            let mut segment_end = cursor;
+            while segment_end < loaded.end
+                && segment_end < self.mask.len()
+                && selected_rows < batch_size
+                && self.mask.value(segment_end)
+            {
+                selected_rows += 1;
+                segment_end += 1;
+            }
+
+            segments.push(MaskSegment {
+                row_range: segment_start..segment_end,
+                mask_start: segment_start,
+                mask_len: segment_end - segment_start,
+            });
+            cursor = segment_end;
+        }
+
+        self.position = cursor;
+        if segments.is_empty() {
+            self.position = self.mask.len().min(self.loaded.total_rows);
+            return Ok(None);
+        }
+
+        Ok(Some(SparseMaskChunk {
+            segments,
+            selected_rows,
+        }))
+    }
+}
+
 /// Cursor for iterating a [`RowSelection`] during execution within a
 /// [`ReadPlan`](crate::arrow::arrow_reader::ReadPlan).
 ///
-/// This keeps per-reader state such as the current position and delegates the
-/// actual storage strategy to the internal `RowSelectionBacking`.
+/// This keeps per-reader state such as the current position and delegates dense
+/// or sparse mask state to [`MaskCursor`].
 #[derive(Debug)]
 pub enum RowSelectionCursor {
     /// Reading all rows
@@ -904,8 +1388,20 @@ impl RowSelectionCursor {
     /// Create a [`MaskCursor`] cursor backed by a bitmask, from an existing set of selectors
     pub(crate) fn new_mask_from_selectors(selectors: Vec<RowSelector>) -> Self {
         Self::Mask(MaskCursor {
-            mask: boolean_mask_from_selectors(&selectors),
-            position: 0,
+            inner: MaskCursorInner::Dense {
+                mask: boolean_mask_from_selectors(&selectors),
+                position: 0,
+            },
+        })
+    }
+
+    /// Create a sparse [`MaskCursor`] from the provided selectors and loaded row ranges
+    pub(crate) fn new_sparse_mask_from_selectors(
+        selectors: Vec<RowSelector>,
+        loaded: LoadedRowRanges,
+    ) -> Self {
+        Self::Mask(MaskCursor {
+            inner: MaskCursorInner::Sparse(SparseMaskCursor::new(selectors, loaded)),
         })
     }
 
@@ -937,6 +1433,81 @@ mod tests {
     use super::*;
     use rand::{Rng, rng};
 
+    #[test]
+    fn test_loaded_row_ranges_detects_sparse_ranges() {
+        assert!(!LoadedRowRanges::new(vec![0..6], 6).is_sparse());
+        assert!(!LoadedRowRanges::new(vec![], 0).is_sparse());
+        assert!(LoadedRowRanges::new(vec![0..2, 4..6], 6).is_sparse());
+        assert!(LoadedRowRanges::new(vec![1..6], 6).is_sparse());
+    }
+
+    #[test]
+    fn test_sparse_mask_cursor_skips_unloaded_ranges() {
+        let selection = RowSelection::from(vec![
+            RowSelector::select(1),
+            RowSelector::skip(4),
+            RowSelector::select(1),
+        ]);
+
+        // Rows 2..4 are not loaded, but the selection skips all of them.
+        let loaded = LoadedRowRanges::new(vec![0..2, 4..6], 6);
+        let selectors: Vec<RowSelector> = selection.into();
+        let mut cursor = SparseMaskCursor::new(selectors, loaded);
+
+        let chunk = cursor.next_sparse_mask_chunk(1024).unwrap().unwrap();
+        assert_eq!(chunk.selected_rows, 2);
+        assert_eq!(
+            chunk.segments,
+            vec![
+                MaskSegment {
+                    row_range: 0..1,
+                    mask_start: 0,
+                    mask_len: 1,
+                },
+                MaskSegment {
+                    row_range: 5..6,
+                    mask_start: 5,
+                    mask_len: 1,
+                },
+            ]
+        );
+        assert!(cursor.is_empty());
+    }
+
+    #[test]
+    fn test_sparse_mask_cursor_errors_selected_rows_after_loaded_ranges() {
+        let selection = RowSelection::from(vec![RowSelector::skip(5), RowSelector::select(1)]);
+
+        // Row 5 is selected but only rows 0..2 are loaded.
+        let loaded = LoadedRowRanges::new(vec![0..2], 6);
+        let selectors: Vec<RowSelector> = selection.into();
+        let mut cursor = SparseMaskCursor::new(selectors, loaded);
+
+        let err = cursor.next_sparse_mask_chunk(1024).unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("sparse mask selected row 5 outside loaded row ranges"),
+            "{err}"
+        );
+    }
+
+    #[test]
+    fn test_sparse_mask_cursor_errors_on_empty_loaded_ranges() {
+        let selection = RowSelection::from(vec![RowSelector::select(6)]);
+
+        // Nothing is loaded, so the first selected row is already an error.
+        let loaded = LoadedRowRanges::new(vec![], 6);
+        let selectors: Vec<RowSelector> = selection.into();
+        let mut cursor = SparseMaskCursor::new(selectors, loaded);
+
+        let err = cursor.next_sparse_mask_chunk(1024).unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("sparse mask selected row 0 outside loaded row ranges"),
+            "{err}"
+        );
+    }
+
     #[test]
     fn test_from_filters() {
         let filters = vec![
@@ -1537,6 +2108,38 @@ mod tests {
         assert_eq!(ranges, vec![10..20, 20..30, 30..40]);
     }
 
+    #[test]
+    fn test_selected_page_row_ranges() {
+        let selection = RowSelection::from(vec![
+            RowSelector::select(1),
+            RowSelector::skip(4),
+            RowSelector::select(1),
+        ]);
+        // Pages start at rows 0, 2 and 4; only the first and last hold a selected row.
+        let pages = vec![
+            PageLocation {
+                offset: 0,
+                compressed_page_size: 10,
+                first_row_index: 0,
+            },
+            PageLocation {
+                offset: 10,
+                compressed_page_size: 10,
+                first_row_index: 2,
+            },
+            PageLocation {
+                offset: 20,
+                compressed_page_size: 10,
+                first_row_index: 4,
+            },
+        ];
+
+        assert_eq!(
+            selection.selected_page_row_ranges(&pages, 6),
+            vec![0..2, 4..6]
+        );
+    }
+
     #[test]
     fn test_from_ranges() {
         let ranges = [1..3, 4..6, 6..6, 8..8, 9..10];
diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs
index 
4c667e534366..8b358ad61ad8 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -415,9 +415,10 @@ impl ParquetDecoderState { Self::ReadingRowGroup { .. } => current_state = next_state, // have a reader ready, so return it and set ourself to ReadingRowGroup Self::DecodingRowGroup { - record_batch_reader, + mut record_batch_reader, remaining_row_groups, } => { + record_batch_reader.materialize_post_filter()?; let result = DecodeResult::Data(*record_batch_reader); let next_state = Self::ReadingRowGroup { remaining_row_groups, @@ -602,7 +603,11 @@ impl ParquetDecoderState { mod test { use super::*; use crate::DecodeResult; - use crate::arrow::arrow_reader::{ArrowPredicateFn, RowFilter, RowSelection, RowSelector}; + use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; + use crate::arrow::arrow_reader::{ + ArrowPredicateFn, ParquetRecordBatchReader, RowFilter, RowSelection, RowSelectionPolicy, + RowSelector, + }; use crate::arrow::push_decoder::{ParquetPushDecoder, ParquetPushDecoderBuilder}; use crate::arrow::{ArrowWriter, ProjectionMask}; use crate::errors::ParquetError; @@ -611,11 +616,13 @@ mod test { use arrow::compute::kernels::cmp::{gt, lt}; use arrow_array::cast::AsArray; use arrow_array::types::Int64Type; - use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringViewArray}; + use arrow_array::{ArrayRef, BooleanArray, Int64Array, RecordBatch, StringViewArray}; use arrow_select::concat::concat_batches; + use arrow_select::filter::filter_record_batch; use bytes::Bytes; use std::fmt::Debug; use std::ops::Range; + use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, LazyLock}; /// Test decoder struct size (as they are copied around on each transition, they @@ -1054,6 +1061,243 @@ mod test { expect_finished(decoder.try_decode()); } + #[test] + fn test_decoder_auto_fallback_uses_post_filter_after_observation() { + let data = &FALLBACK_TEST_FILE_DATA; + let builder = + 
ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + let scalar_neg_one = Int64Array::new_scalar(-1); + let column = batch.column(0).as_primitive::(); + gt(column, &scalar_neg_one) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 100); + assert_eq!( + reader.next().unwrap().unwrap(), + TEST_BATCH.slice(0, 100).project(&[2]).unwrap() + ); + assert!(reader.next().is_none()); + + let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 200); + assert_eq!( + reader.next().unwrap().unwrap(), + TEST_BATCH.slice(100, 100).project(&[2]).unwrap() + ); + assert!(reader.next().is_none()); + + let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + 300, + "fallback should evaluate predicates before returning the reader to preserve row-group order" + ); + assert_eq!( + reader.next().unwrap().unwrap(), + TEST_BATCH.slice(200, 100).project(&[2]).unwrap() + ); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 300); + assert!(reader.next().is_none()); + + let mut reader = 
next_reader_with_data(&mut decoder, data).unwrap(); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); + assert_eq!( + reader.next().unwrap().unwrap(), + TEST_BATCH.slice(300, 100).project(&[2]).unwrap() + ); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); + assert!(reader.next().is_none()); + + assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); + assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.fallback_high_selectivity_no_pruning_count(), + Some(1) + ); + assert!(next_reader_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_fallback_post_filter_applies_fragmented_filter() { + let data = &FALLBACK_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + Ok(not_multiple_of_three_filter(&batch)) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..2 { + let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + (row_group_idx + 1) * 100 + ); + assert_eq!( + reader.next().unwrap().unwrap(), + 
expected_c_not_multiple_of_three(row_group_idx * 100, 100) + ); + assert!(reader.next().is_none()); + } + + for row_group_idx in 2..4 { + let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + (row_group_idx + 1) * 100, + "fallback should evaluate predicates before returning the reader to preserve row-group order" + ); + assert_eq!( + reader.next().unwrap().unwrap(), + expected_c_not_multiple_of_three(row_group_idx * 100, 100) + ); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + (row_group_idx + 1) * 100 + ); + assert!(reader.next().is_none()); + } + + assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); + assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.fallback_fragmented_high_selectivity_materialization_count(), + Some(1) + ); + assert!(next_reader_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_fallback_records_fragmented_moderate_selectivity() { + let data = &FALLBACK_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + Ok(multiple_of_ten_filter(&batch)) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + 
.with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + (row_group_idx + 1) * 100 + ); + assert_eq!( + reader.next().unwrap().unwrap(), + expected_c_multiple_of_ten(row_group_idx * 100, 100) + ); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + (row_group_idx + 1) * 100 + ); + assert!(reader.next().is_none()); + } + + assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); + assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.fallback_fragmented_moderate_selectivity_count(), + Some(1) + ); + assert!(next_reader_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_fallback_current_row_uses_predicate_cache() { + let data = &FALLBACK_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| { + let scalar_neg_one = Int64Array::new_scalar(-1); + let column = batch.column(0).as_primitive::(); + gt(column, &scalar_neg_one) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + assert_eq!( + reader.next().unwrap().unwrap(), + TEST_BATCH.slice(0, 100).project(&[0, 2]).unwrap() + ); + assert!(reader.next().is_none()); 
+ + assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); + assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(1)); + assert_eq!(metrics.records_read_from_cache(), Some(100)); + } + #[test] fn test_decoder_empty_filters() { let builder = @@ -1159,6 +1403,50 @@ mod test { ); } + /// Auto post-filter fallback is disabled for `LIMIT` because the limit is + /// applied during row-group planning. Limit scans should therefore avoid + /// fallback observation bookkeeping entirely. + #[test] + fn test_decoder_filter_with_limit_skips_auto_fallback_observation() { + let builder = + ParquetPushDecoderBuilder::try_new_decoder(test_file_parquet_metadata()).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + |batch: RecordBatch| { + let scalar_neg_one = Int64Array::new_scalar(-1); + let column = batch.column(0).as_primitive::(); + gt(column, &scalar_neg_one) + }, + ); + + let mut decoder = builder + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .with_limit(10) + .build() + .unwrap(); + + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + let batch = expect_data(decoder.try_decode()); + let expected = TEST_BATCH.slice(0, 10).project(&[2]).unwrap(); + assert_eq!(batch, expected); + expect_finished(decoder.try_decode()); + + assert_eq!(metrics.fallback_observed_row_group_count(), Some(0)); + assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(0)); + } + /// 
Once the limit has been satisfied by a prior row group, subsequent /// row groups should be skipped entirely — no data request for their /// filter columns. @@ -1493,13 +1781,17 @@ mod test { /// c | "string_100".."string_199" | 2 | 0 /// c | "string_200".."string_299" | 1 | 1 /// c | "string_300".."string_399" | 2 | 1 - static TEST_FILE_DATA: LazyLock = LazyLock::new(|| { + static TEST_FILE_DATA: LazyLock = LazyLock::new(|| write_test_file(200, 100)); + + static FALLBACK_TEST_FILE_DATA: LazyLock = LazyLock::new(|| write_test_file(100, 50)); + + fn write_test_file(max_row_group_row_count: usize, data_page_row_count_limit: usize) -> Bytes { let input_batch = &TEST_BATCH; let mut output = Vec::new(); let writer_options = WriterProperties::builder() - .set_max_row_group_row_count(Some(200)) - .set_data_page_row_count_limit(100) + .set_max_row_group_row_count(Some(max_row_group_row_count)) + .set_data_page_row_count_limit(data_page_row_count_limit) .build(); let mut writer = ArrowWriter::try_new(&mut output, input_batch.schema(), Some(writer_options)).unwrap(); @@ -1515,7 +1807,7 @@ mod test { } writer.close().unwrap(); Bytes::from(output) - }); + } /// Return the length of [`TEST_FILE_DATA`], in bytes fn test_file_len() -> u64 { @@ -1527,17 +1819,18 @@ mod test { 0..test_file_len() } - /// Return a slice of the test file data from the given range - pub fn test_file_slice(range: Range) -> Bytes { - let start: usize = range.start.try_into().unwrap(); - let end: usize = range.end.try_into().unwrap(); - TEST_FILE_DATA.slice(start..end) - } - /// return the metadata for the test file pub fn test_file_parquet_metadata() -> Arc { - let mut metadata_decoder = ParquetMetaDataPushDecoder::try_new(test_file_len()).unwrap(); - push_ranges_to_metadata_decoder(&mut metadata_decoder, vec![test_file_range()]); + parquet_metadata_for_data(&TEST_FILE_DATA) + } + + fn parquet_metadata_for_data(data: &Bytes) -> Arc { + let mut metadata_decoder = 
ParquetMetaDataPushDecoder::try_new(data.len() as u64).unwrap(); + push_ranges_to_metadata_decoder_with_data( + &mut metadata_decoder, + vec![0..data.len() as u64], + data, + ); let metadata = metadata_decoder.try_decode().unwrap(); let DecodeResult::Data(metadata) = metadata else { panic!("Expected metadata to be decoded successfully"); @@ -1545,26 +1838,84 @@ mod test { Arc::new(metadata) } - /// Push the given ranges to the metadata decoder, simulating reading from a file - fn push_ranges_to_metadata_decoder( + fn push_ranges_to_metadata_decoder_with_data( metadata_decoder: &mut ParquetMetaDataPushDecoder, ranges: Vec>, + data: &Bytes, ) { let data = ranges .iter() - .map(|range| test_file_slice(range.clone())) + .map(|range| data.slice(range.start as usize..range.end as usize)) .collect::>(); metadata_decoder.push_ranges(ranges, data).unwrap(); } fn push_ranges_to_decoder(decoder: &mut ParquetPushDecoder, ranges: Vec>) { + push_ranges_to_decoder_with_data(decoder, ranges, &TEST_FILE_DATA); + } + + fn push_ranges_to_decoder_with_data( + decoder: &mut ParquetPushDecoder, + ranges: Vec>, + data: &Bytes, + ) { let data = ranges .iter() - .map(|range| test_file_slice(range.clone())) + .map(|range| data.slice(range.start as usize..range.end as usize)) .collect::>(); decoder.push_ranges(ranges, data).unwrap(); } + fn not_multiple_of_three_filter(batch: &RecordBatch) -> BooleanArray { + let column = batch.column(0).as_primitive::(); + BooleanArray::from( + (0..batch.num_rows()) + .map(|idx| column.value(idx) % 3 != 0) + .collect::>(), + ) + } + + fn expected_c_not_multiple_of_three(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = not_multiple_of_three_filter(&batch); + let projected = batch.project(&[2]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn multiple_of_ten_filter(batch: &RecordBatch) -> BooleanArray { + let column = batch.column(0).as_primitive::(); + BooleanArray::from( + 
(0..batch.num_rows()) + .map(|idx| column.value(idx) % 10 == 0) + .collect::>(), + ) + } + + fn expected_c_multiple_of_ten(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = multiple_of_ten_filter(&batch); + let projected = batch.project(&[2]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn next_reader_with_data( + decoder: &mut ParquetPushDecoder, + data: &Bytes, + ) -> Option { + loop { + match decoder + .try_next_reader() + .expect("decoder should produce a reader or request data") + { + DecodeResult::NeedsData(ranges) => { + push_ranges_to_decoder_with_data(decoder, ranges, data); + } + DecodeResult::Data(reader) => return Some(reader), + DecodeResult::Finished => return None, + } + } + } + /// Expect that the [`DecodeResult`] is a [`DecodeResult::Data`] and return the corresponding element fn expect_data(result: Result, ParquetError>) -> T { match result.expect("Expected Ok(DecodeResult::Data(T))") { diff --git a/parquet/src/arrow/push_decoder/reader_builder/data.rs b/parquet/src/arrow/push_decoder/reader_builder/data.rs index 6fbc2090b06e..af7cb306aa2c 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/data.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/data.rs @@ -54,6 +54,10 @@ impl DataRequest { .collect() } + pub(super) fn into_column_chunks(self) -> Vec>> { + self.column_chunks + } + /// Returns the chunks from the buffers that satisfy this request fn get_chunks(&self, buffers: &PushBuffers) -> Result, ParquetError> { self.ranges diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index 60e50d29524e..127c09d2d4f5 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -22,17 +22,21 @@ use crate::DecodeResult; use crate::arrow::ProjectionMask; use crate::arrow::array_reader::{ArrayReaderBuilder, CacheOptions, RowGroupCache}; 
use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; -use crate::arrow::arrow_reader::selection::RowSelectionStrategy; +use crate::arrow::arrow_reader::selection::{ + FallbackObservation, FallbackTriggerReason, LoadedRowRanges, RowGroupExecutionMode, + RowSelectionShape, RowSelectionStrategy, RowSelectionStrategyDecision, +}; use crate::arrow::arrow_reader::{ ParquetRecordBatchReader, PredicateOptions, ReadPlanBuilder, RowFilter, RowSelection, - RowSelectionPolicy, + RowSelectionPolicy, RowSelector, }; use crate::arrow::in_memory_row_group::ColumnChunkData; use crate::arrow::push_decoder::reader_builder::data::DataRequestBuilder; use crate::arrow::push_decoder::reader_builder::filter::CacheInfo; -use crate::arrow::schema::ParquetField; +use crate::arrow::schema::{ParquetField, ParquetFieldType}; +use crate::basic::Type as PhysicalType; use crate::errors::ParquetError; -use crate::file::metadata::ParquetMetaData; +use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::util::push_buffers::PushBuffers; use bytes::Bytes; @@ -40,7 +44,7 @@ use data::DataRequest; use filter::AdvanceResult; use filter::FilterInfo; use std::ops::Range; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex, RwLock}; /// The current row group being read and the read plan #[derive(Debug)] @@ -48,6 +52,7 @@ struct RowGroupInfo { row_group_idx: usize, row_count: usize, plan_builder: ReadPlanBuilder, + base_selection: Option, } /// This is the inner state machine for reading a single row group. @@ -77,6 +82,21 @@ enum RowGroupDecoderState { /// Any cached filter results cache_info: Option, }, + /// Needs data to read the row group once and apply the filter after decode. 
+ WaitingOnPostFilterData { + row_group_info: RowGroupInfo, + data_request: DataRequest, + read_projection: ProjectionMask, + filter: Arc>, + }, + /// Needs data to read the row group once and apply an already-computed + /// fallback selection after decode. + WaitingOnPostSelectionData { + row_group_info: RowGroupInfo, + data_request: DataRequest, + selection: RowSelection, + cache_info: Option, + }, /// Needs data to proceed with reading the output WaitingOnData { row_group_info: RowGroupInfo, @@ -88,6 +108,22 @@ enum RowGroupDecoderState { Finished, } +#[allow(dead_code)] +#[derive(Debug)] +enum RowGroupFallbackState { + Observing { observation: FallbackObservation }, + UsePushdown, + UsePostFilter { reason: FallbackTriggerReason }, +} + +impl Default for RowGroupFallbackState { + fn default() -> Self { + Self::Observing { + observation: FallbackObservation::default(), + } + } +} + /// Result of a state transition #[derive(Debug)] struct NextState { @@ -144,6 +180,9 @@ pub(crate) struct RowGroupReaderBuilder { /// Optional filter filter: Option, + /// Shared filter state used once Auto fallback switches to post-filter. + post_filter: Option>>, + /// Limit to apply to remaining row groups (decremented as rows are read) limit: Option, @@ -161,6 +200,10 @@ pub(crate) struct RowGroupReaderBuilder { /// Strategy for materialising row selections row_selection_policy: RowSelectionPolicy, + /// Row-group-local fallback state used by Auto policy. + #[allow(dead_code)] + fallback_state: RowGroupFallbackState, + /// Current state of the decoder. 
/// /// It is taken when processing, and must be put back before returning @@ -193,11 +236,13 @@ impl RowGroupReaderBuilder { metadata, fields, filter, + post_filter: None, limit, offset, metrics, max_predicate_cache_size, row_selection_policy, + fallback_state: RowGroupFallbackState::default(), state: Some(RowGroupDecoderState::Finished), buffers, } @@ -247,13 +292,14 @@ impl RowGroupReaderBuilder { ))); } let plan_builder = ReadPlanBuilder::new(self.batch_size) - .with_selection(selection) + .with_selection(selection.clone()) .with_row_selection_policy(self.row_selection_policy); let row_group_info = RowGroupInfo { row_group_idx, row_count, plan_builder, + base_selection: selection, }; self.state = Some(RowGroupDecoderState::Start { row_group_info }); @@ -325,6 +371,10 @@ impl RowGroupReaderBuilder { let column_chunks = None; // no prior column chunks + if let Some(filter) = self.post_filter.as_ref().cloned() { + return self.start_post_filter(row_group_info, filter); + } + let Some(filter) = self.filter.take() else { // no filter, start trying to read data immediately return Ok(NextState::again(RowGroupDecoderState::StartData { @@ -342,6 +392,16 @@ impl RowGroupReaderBuilder { })); }; + if self.should_use_post_filter_fallback() { + if self.post_filter_read_projection(&filter).is_some() { + let filter = Arc::new(Mutex::new(filter)); + self.post_filter = Some(Arc::clone(&filter)); + return self.start_post_filter(row_group_info, filter); + } + + self.fallback_state = RowGroupFallbackState::UsePushdown; + } + // we have predicates to evaluate let cache_projection = self.compute_cache_projection(row_group_info.row_group_idx, &filter); @@ -371,6 +431,7 @@ impl RowGroupReaderBuilder { row_group_idx, row_count, plan_builder, + base_selection, } = row_group_info; // If nothing is selected, we are done with this row group @@ -405,6 +466,7 @@ impl RowGroupReaderBuilder { row_group_idx, row_count, plan_builder, + base_selection, }; 
NextState::again(RowGroupDecoderState::WaitingOnFilterData { @@ -437,6 +499,7 @@ impl RowGroupReaderBuilder { row_group_idx, row_count, mut plan_builder, + base_selection, } = row_group_info; let predicate = filter_info.current(); @@ -457,20 +520,17 @@ impl RowGroupReaderBuilder { .with_parquet_metadata(&self.metadata) .build_array_reader(self.fields.as_deref(), predicate.projection())?; - // Reset to original policy before each predicate so the override - // can detect page skipping for THIS predicate's columns. - // Without this reset, a prior predicate's override (e.g. Mask) - // carries forward and the check returns early, missing unfetched - // pages for subsequent predicates. plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); - - // Prepare to evaluate the filter. - // Note: first update the selection strategy to properly handle any pages - // pruned during fetch - plan_builder = override_selector_strategy_if_needed( + plan_builder = resolve_selection_policy_for_expensive_output( plan_builder, predicate.projection(), self.row_group_offset_index(row_group_idx), + row_count, + ExpensiveOutputProfile::from_row_group( + self.metadata.row_group(row_group_idx), + predicate.projection(), + row_count, + ), ); // When this is the final predicate in the chain and an output @@ -495,6 +555,7 @@ impl RowGroupReaderBuilder { row_group_idx, row_count, plan_builder, + base_selection, }; // Take back the column chunks that were read @@ -531,6 +592,7 @@ impl RowGroupReaderBuilder { row_group_idx, row_count, plan_builder, + base_selection, } = row_group_info; // Compute the number of rows in the selection before applying limit and offset @@ -586,17 +648,23 @@ impl RowGroupReaderBuilder { .build(); plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); - - plan_builder = override_selector_strategy_if_needed( + plan_builder = resolve_selection_policy_for_expensive_output( plan_builder, &self.projection, 
self.row_group_offset_index(row_group_idx), + row_count, + ExpensiveOutputProfile::from_row_group( + self.metadata.row_group(row_group_idx), + &self.projection, + row_count, + ), ); let row_group_info = RowGroupInfo { row_group_idx, row_count, plan_builder, + base_selection, }; NextState::again(RowGroupDecoderState::WaitingOnData { @@ -605,12 +673,170 @@ impl RowGroupReaderBuilder { cache_info, }) } + RowGroupDecoderState::WaitingOnPostFilterData { + row_group_info, + data_request, + read_projection, + filter, + } => { + let needed_ranges = data_request.needed_ranges(&self.buffers); + if !needed_ranges.is_empty() { + return Ok(NextState::result( + RowGroupDecoderState::WaitingOnPostFilterData { + row_group_info, + data_request, + read_projection, + filter, + }, + DecodeResult::NeedsData(needed_ranges), + )); + } + + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: _, + } = row_group_info; + + let row_group = data_request.try_into_in_memory_row_group( + row_group_idx, + row_count, + &self.metadata, + &read_projection, + &mut self.buffers, + )?; + + let plan = plan_builder.build_with_metrics(&self.metrics); + let array_reader = ArrayReaderBuilder::new(&row_group, &self.metrics) + .with_batch_size(self.batch_size) + .with_parquet_metadata(&self.metadata) + .build_array_reader(self.fields.as_deref(), &read_projection)?; + + let reader = ParquetRecordBatchReader::new_post_filter( + array_reader, + plan, + filter, + self.metadata.file_metadata().schema_descr(), + &read_projection, + &self.projection, + )?; + + self.metrics + .record_fallback_row_group(RowGroupExecutionMode::PostFilter); + + NextState::result(RowGroupDecoderState::Finished, DecodeResult::Data(reader)) + } + RowGroupDecoderState::WaitingOnPostSelectionData { + row_group_info, + data_request, + selection, + cache_info, + } => { + let needed_ranges = data_request.needed_ranges(&self.buffers); + if !needed_ranges.is_empty() { + return Ok(NextState::result( + 
RowGroupDecoderState::WaitingOnPostSelectionData { + row_group_info, + data_request, + selection, + cache_info, + }, + DecodeResult::NeedsData(needed_ranges), + )); + } + + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: _, + } = row_group_info; + + let row_group = data_request.try_into_in_memory_row_group( + row_group_idx, + row_count, + &self.metadata, + &self.projection, + &mut self.buffers, + )?; + + let plan = plan_builder.build_with_metrics(&self.metrics); + let array_reader_builder = ArrayReaderBuilder::new(&row_group, &self.metrics) + .with_batch_size(self.batch_size) + .with_parquet_metadata(&self.metadata); + let array_reader = if let Some(cache_info) = cache_info.as_ref() { + let cache_options: CacheOptions = cache_info.builder().consumer(); + array_reader_builder + .with_cache_options(Some(&cache_options)) + .build_array_reader(self.fields.as_deref(), &self.projection) + } else { + array_reader_builder + .build_array_reader(self.fields.as_deref(), &self.projection) + }?; + + let reader = ParquetRecordBatchReader::new_post_selection_filter( + array_reader, + plan, + selection, + ); + + self.metrics + .record_fallback_row_group(RowGroupExecutionMode::PostFilter); + + NextState::result(RowGroupDecoderState::Finished, DecodeResult::Data(reader)) + } // Waiting on data to proceed with reading the output RowGroupDecoderState::WaitingOnData { row_group_info, data_request, cache_info, } => { + if cache_info.is_some() + && matches!(self.fallback_state, RowGroupFallbackState::Observing { .. }) + && self.post_filter_fallback_supported() + { + let decision = row_group_info + .plan_builder + .resolve_selection_strategy_decision(); + let fallback_selection = row_group_info.plan_builder.selection().cloned(); + self.observe_fallback_candidate(decision, row_group_info.row_count); + + if matches!( + self.fallback_state, + RowGroupFallbackState::UsePostFilter { .. 
} + ) { + if row_group_info.base_selection.is_none() { + let selection = fallback_selection.unwrap_or_else(|| { + RowSelection::from(vec![RowSelector::select( + row_group_info.row_count, + )]) + }); + let column_chunks = data_request.into_column_chunks(); + return self.start_post_selection_filter( + row_group_info, + selection, + cache_info, + Some(column_chunks), + ); + } + + let filter = self.filter.take().ok_or_else(|| { + ParquetError::General( + "post-filter fallback selected without a row filter".to_string(), + ) + })?; + let filter = Arc::new(Mutex::new(filter)); + self.post_filter = Some(Arc::clone(&filter)); + return self.start_post_filter(row_group_info, filter); + } + + self.metrics + .record_fallback_row_group(RowGroupExecutionMode::Pushdown( + decision.strategy, + )); + } + let needed_ranges = data_request.needed_ranges(&self.buffers); if !needed_ranges.is_empty() { // still need data @@ -629,6 +855,7 @@ impl RowGroupReaderBuilder { row_group_idx, row_count, plan_builder, + base_selection: _, } = row_group_info; let row_group = data_request.try_into_in_memory_row_group( @@ -639,7 +866,7 @@ impl RowGroupReaderBuilder { &mut self.buffers, )?; - let plan = plan_builder.build(); + let plan = plan_builder.build_with_metrics(&self.metrics); // if we have any cached results, connect them up let array_reader_builder = ArrayReaderBuilder::new(&row_group, &self.metrics) @@ -666,6 +893,223 @@ impl RowGroupReaderBuilder { Ok(result) } + fn start_post_filter( + &mut self, + row_group_info: RowGroupInfo, + filter: Arc>, + ) -> Result { + let RowGroupInfo { + row_group_idx, + row_count, + base_selection, + .. 
+ } = row_group_info; + + let mut plan_builder = ReadPlanBuilder::new(self.batch_size) + .with_selection(base_selection) + .with_row_selection_policy(self.row_selection_policy); + + if !plan_builder.selects_any() { + return Ok(NextState::result( + RowGroupDecoderState::Finished, + DecodeResult::Finished, + )); + } + + let read_projection = { + let filter = filter.lock().map_err(|_| { + ParquetError::General("post-filter predicate state was poisoned".to_string()) + })?; + self.post_filter_read_projection(&filter).ok_or_else(|| { + ParquetError::General( + "post-filter fallback selected an unsupported projection".to_string(), + ) + })? + }; + + let data_request = DataRequestBuilder::new( + row_group_idx, + row_count, + self.batch_size, + &self.metadata, + &read_projection, + ) + .with_selection(plan_builder.selection()) + .build(); + + plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); + plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &read_projection, + self.row_group_offset_index(row_group_idx), + row_count, + ExpensiveOutputProfile::from_row_group( + self.metadata.row_group(row_group_idx), + &read_projection, + row_count, + ), + ); + + let row_group_info = RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: None, + }; + + Ok(NextState::again( + RowGroupDecoderState::WaitingOnPostFilterData { + row_group_info, + data_request, + read_projection, + filter, + }, + )) + } + + fn start_post_selection_filter( + &mut self, + row_group_info: RowGroupInfo, + selection: RowSelection, + cache_info: Option, + column_chunks: Option>>>, + ) -> Result { + let RowGroupInfo { + row_group_idx, + row_count, + base_selection, + .. 
+ } = row_group_info; + + let plan_builder = ReadPlanBuilder::new(self.batch_size) + .with_selection(base_selection) + .with_row_selection_policy(self.row_selection_policy); + + let data_request = DataRequestBuilder::new( + row_group_idx, + row_count, + self.batch_size, + &self.metadata, + &self.projection, + ) + .with_selection(plan_builder.selection()) + .with_column_chunks(column_chunks) + .build(); + + let row_group_info = RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: None, + }; + + Ok(NextState::again( + RowGroupDecoderState::WaitingOnPostSelectionData { + row_group_info, + data_request, + selection, + cache_info, + }, + )) + } + + fn should_use_post_filter_fallback(&self) -> bool { + matches!( + self.fallback_state, + RowGroupFallbackState::UsePostFilter { .. } + ) && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) + && self.limit.is_none() + && self.offset.is_none() + && !self.has_virtual_columns() + } + + fn post_filter_read_projection(&self, filter: &RowFilter) -> Option { + if !self.should_use_post_filter_fallback() { + return None; + } + + self.build_post_filter_read_projection(filter) + } + + fn build_post_filter_read_projection(&self, filter: &RowFilter) -> Option { + let mut read_projection = self.projection.clone(); + read_projection.union(&filter.union_projection()?); + + if self.post_filter_supports_projection(&read_projection) { + Some(read_projection) + } else { + None + } + } + + fn post_filter_supports_projection(&self, projection: &ProjectionMask) -> bool { + let schema = self.metadata.file_metadata().schema_descr(); + (0..schema.num_columns()).all(|leaf_idx| { + !projection.leaf_included(leaf_idx) || schema.get_column_root(leaf_idx).is_primitive() + }) + } + + fn observe_fallback_candidate( + &mut self, + decision: RowSelectionStrategyDecision, + row_count: usize, + ) { + if !matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. 
}) { + return; + } + + let RowGroupFallbackState::Observing { observation } = &mut self.fallback_state else { + return; + }; + + let mut shape = decision.shape; + if shape.total_rows() == 0 { + shape = RowSelectionShape { + selected_rows: row_count, + skipped_rows: 0, + selector_count: 1, + selected_run_count: 1, + skipped_run_count: 0, + }; + } + + observation.observed_row_groups += 1; + observation.shape.add_assign(shape); + self.metrics.record_fallback_observed_row_group(); + + let reason = observation.trigger_reason(); + if matches!(reason, FallbackTriggerReason::ObservationIncomplete) { + self.metrics.record_fallback_trigger(reason); + return; + } + + let should_fallback = observation.should_fallback(); + self.metrics.record_fallback_trigger(reason); + + if should_fallback && self.post_filter_fallback_supported() { + self.fallback_state = RowGroupFallbackState::UsePostFilter { reason }; + } else { + self.fallback_state = RowGroupFallbackState::UsePushdown; + } + } + + fn post_filter_fallback_supported(&self) -> bool { + let Some(filter) = self.filter.as_ref() else { + return false; + }; + matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) + && self.limit.is_none() + && self.offset.is_none() + && !self.has_virtual_columns() + && self.build_post_filter_read_projection(filter).is_some() + } + + fn has_virtual_columns(&self) -> bool { + self.fields + .as_deref() + .is_some_and(parquet_field_has_virtual_columns) + } + /// Which columns should be cached? /// /// Returns the columns that are used by the filters *and* then used in the @@ -706,64 +1150,490 @@ impl RowGroupReaderBuilder { } } -/// Override the selection strategy if needed. -/// -/// Some pages can be skipped during row-group construction if they are not read -/// by the selections. This means that the data pages for those rows are never -/// loaded and definition/repetition levels are never read. 
When using -/// `RowSelections` selection works because `skip_records()` handles this -/// case and skips the page accordingly. -/// -/// However, with the current mask design, all values must be read and decoded -/// and then a mask filter is applied. Thus if any pages are skipped during -/// row-group construction, the data pages are missing and cannot be decoded. -/// -/// A simple example: -/// * the page size is 2, the mask is 100001, row selection should be read(1) skip(4) read(1) -/// * the `ColumnChunkData` would be page1(10), page2(skipped), page3(01) -/// -/// Using the row selection to skip(4), page2 won't be read at all, so in this -/// case we can't decode all the rows and apply a mask. To correctly apply the -/// bit mask, we need all 6 values be read, but page2 is not in memory. -fn override_selector_strategy_if_needed( +fn parquet_field_has_virtual_columns(field: &ParquetField) -> bool { + match &field.field_type { + ParquetFieldType::Primitive { .. } => false, + ParquetFieldType::Group { children } => { + children.iter().any(parquet_field_has_virtual_columns) + } + ParquetFieldType::Virtual(_) => true, + } +} + +#[cfg(test)] +fn resolve_selection_policy_for_projection( plan_builder: ReadPlanBuilder, projection_mask: &ProjectionMask, offset_index: Option<&[OffsetIndexMetaData]>, + total_rows: usize, ) -> ReadPlanBuilder { - // override only applies to Auto policy, If the policy is already Mask or Selectors, respect that - let RowSelectionPolicy::Auto { .. 
} = plan_builder.row_selection_policy() else { - return plan_builder; - }; + resolve_selection_policy_for_expensive_output( + plan_builder, + projection_mask, + offset_index, + total_rows, + ExpensiveOutputProfile::default(), + ) +} + +fn resolve_selection_policy_for_expensive_output( + plan_builder: ReadPlanBuilder, + projection_mask: &ProjectionMask, + offset_index: Option<&[OffsetIndexMetaData]>, + total_rows: usize, + output_profile: ExpensiveOutputProfile, +) -> ReadPlanBuilder { + let loaded = loaded_ranges_for_projection( + plan_builder.selection(), + projection_mask, + offset_index, + total_rows, + ); + let loaded_is_sparse = loaded.as_ref().is_some_and(LoadedRowRanges::is_sparse); + let sparse_loaded = loaded.filter(LoadedRowRanges::is_sparse); + + match plan_builder.row_selection_policy() { + RowSelectionPolicy::Auto { .. } => { + let decision = plan_builder.resolve_selection_strategy_decision(); + match decision.strategy { + RowSelectionStrategy::Mask + if loaded_is_sparse + || should_prefer_selectors_for_expensive_output( + decision.shape, + output_profile, + ) => + { + plan_builder.with_row_selection_policy(RowSelectionPolicy::Selectors) + } + RowSelectionStrategy::Mask => { + plan_builder.with_row_selection_policy(RowSelectionPolicy::Mask) + } + RowSelectionStrategy::Selectors => { + plan_builder.with_row_selection_policy(RowSelectionPolicy::Selectors) + } + } + } + RowSelectionPolicy::Mask => plan_builder.with_loaded_row_ranges(sparse_loaded), + RowSelectionPolicy::Selectors => plan_builder, + } +} + +#[derive(Clone, Copy, Debug, Default)] +struct ExpensiveOutputProfile { + variable_width_columns: usize, + uncompressed_bytes_per_row: f64, +} + +impl ExpensiveOutputProfile { + fn from_row_group( + row_group: &RowGroupMetaData, + projection_mask: &ProjectionMask, + total_rows: usize, + ) -> Self { + if total_rows == 0 { + return Self::default(); + } + + let mut variable_width_columns = 0; + let mut uncompressed_bytes = 0u64; + for leaf_idx in 
0..row_group.num_columns() { + if !projection_mask.leaf_included(leaf_idx) { + continue; + } + + let column = row_group.column(leaf_idx); + if column.column_type() == PhysicalType::BYTE_ARRAY { + variable_width_columns += 1; + } + uncompressed_bytes += column.uncompressed_size().max(0) as u64; + } + + Self { + variable_width_columns, + uncompressed_bytes_per_row: uncompressed_bytes as f64 / total_rows as f64, + } + } +} - let preferred_strategy = plan_builder.resolve_selection_strategy(); +fn should_prefer_selectors_for_expensive_output( + shape: RowSelectionShape, + output_profile: ExpensiveOutputProfile, +) -> bool { + let selected_ratio = shape.selected_ratio(); + output_profile.variable_width_columns > 0 + && output_profile.uncompressed_bytes_per_row >= 16.0 + && selected_ratio > 0.0 + && selected_ratio < 0.10 + && shape.average_selected_run_length() <= 4.0 +} - let force_selectors = matches!(preferred_strategy, RowSelectionStrategy::Mask) - && plan_builder.selection().is_some_and(|selection| { - selection.should_force_selectors(projection_mask, offset_index) +fn loaded_ranges_for_projection( + selection: Option<&RowSelection>, + projection_mask: &ProjectionMask, + offset_index: Option<&[OffsetIndexMetaData]>, + total_rows: usize, +) -> Option { + let selection = selection?; + let columns = offset_index?; + let mut ranges: Option>> = None; + + for (leaf_idx, column) in columns.iter().enumerate() { + if !projection_mask.leaf_included(leaf_idx) { + continue; + } + let column_ranges = selection.selected_page_row_ranges(column.page_locations(), total_rows); + ranges = Some(match ranges { + Some(existing) => intersect_ranges(existing, column_ranges), + None => column_ranges, }); + } - let resolved_strategy = if force_selectors { - RowSelectionStrategy::Selectors - } else { - preferred_strategy - }; + ranges.map(|ranges| LoadedRowRanges::new(coalesce_adjacent_ranges(ranges), total_rows)) +} - // override the plan builder strategy with the resolved one - let 
new_policy = match resolved_strategy { - RowSelectionStrategy::Mask => RowSelectionPolicy::Mask, - RowSelectionStrategy::Selectors => RowSelectionPolicy::Selectors, - }; +fn intersect_ranges(left: Vec>, right: Vec>) -> Vec> { + let mut out = Vec::new(); + for l in &left { + for r in &right { + let start = l.start.max(r.start); + let end = l.end.min(r.end); + if start < end { + out.push(start..end); + } + } + } + out +} - plan_builder.with_row_selection_policy(new_policy) +fn coalesce_adjacent_ranges(ranges: Vec>) -> Vec> { + let mut out: Vec> = Vec::with_capacity(ranges.len()); + for range in ranges { + if range.is_empty() { + continue; + } + if let Some(last) = out.last_mut() + && last.end == range.start + { + last.end = range.end; + continue; + } + out.push(range); + } + out } #[cfg(test)] mod tests { use super::*; + use crate::arrow::arrow_reader::{RowSelection, RowSelectionCursor, RowSelector}; + use crate::file::page_index::offset_index::PageLocation; + + #[test] + fn test_resolve_selection_policy_preserves_mask_choice() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(99), + RowSelector::select(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + + assert_eq!( + resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + None, + 101 + ) + .row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + #[test] + fn test_resolve_selection_policy_preserves_selector_choice() { + let selection = RowSelection::from(vec![RowSelector::select(128)]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1 }); + + assert_eq!( + resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + None, + 128 + ) + .row_selection_policy(), + &RowSelectionPolicy::Selectors + ); 
+ } + + #[test] + fn test_resolve_selection_policy_respects_explicit_policy() { + let selection = RowSelection::from(vec![RowSelector::select(1), RowSelector::skip(1)]); + let mask_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection.clone())) + .with_row_selection_policy(RowSelectionPolicy::Mask); + let selector_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Selectors); + + assert_eq!( + resolve_selection_policy_for_projection(mask_builder, &ProjectionMask::all(), None, 2) + .row_selection_policy(), + &RowSelectionPolicy::Mask + ); + assert_eq!( + resolve_selection_policy_for_projection( + selector_builder, + &ProjectionMask::all(), + None, + 2 + ) + .row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_sparse_loaded_ranges_force_selectors() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(4), + RowSelector::select(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let offset_index = sparse_test_offset_index(); + + let plan_builder = resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + Some(&offset_index), + 6, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_dense_loaded_ranges_preserve_mask() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let offset_index = sparse_test_offset_index(); + + let plan_builder = resolve_selection_policy_for_projection( + 
plan_builder, + &ProjectionMask::all(), + Some(&offset_index), + 6, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + #[test] + fn test_explicit_mask_keeps_sparse_loaded_ranges() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(4), + RowSelector::select(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Mask); + let offset_index = sparse_test_offset_index(); + + let plan_builder = resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + Some(&offset_index), + 6, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + + let mut plan = plan_builder.build(); + let RowSelectionCursor::Mask(cursor) = plan.row_selection_cursor_mut() else { + panic!("expected mask cursor"); + }; + assert!(cursor.is_sparse()); + } + + #[test] + fn test_auto_expensive_fragmented_output_prefers_selectors() { + let selection = q38_like_fragmented_selection(); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 64.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 7_800, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_expensive_fragmented_output_prefers_selectors_without_selector_count_gate() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(12), + RowSelector::select(1), + RowSelector::skip(12), + RowSelector::select(1), + RowSelector::skip(12), + RowSelector::select(1), + RowSelector::skip(12), + ]); + let plan_builder = 
ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 64.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 52, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_cheap_fragmented_output_keeps_mask() { + let selection = q38_like_fragmented_selection(); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 8.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 7_800, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + #[test] + fn test_auto_moderate_selectivity_expensive_output_keeps_mask() { + let selection = q26_like_fragmented_selection(); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 64.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 7_200, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + fn q38_like_fragmented_selection() -> RowSelection { + let mut selectors = Vec::new(); + for _ in 0..600 { + selectors.push(RowSelector::select(1)); + selectors.push(RowSelector::skip(12)); + } + RowSelection::from(selectors) + } + + fn 
q26_like_fragmented_selection() -> RowSelection { + let mut selectors = Vec::new(); + for _ in 0..600 { + selectors.push(RowSelector::select(2)); + selectors.push(RowSelector::skip(10)); + } + RowSelection::from(selectors) + } + + fn sparse_test_offset_index() -> Vec { + vec![OffsetIndexMetaData { + page_locations: vec![ + PageLocation { + offset: 0, + compressed_page_size: 10, + first_row_index: 0, + }, + PageLocation { + offset: 10, + compressed_page_size: 10, + first_row_index: 2, + }, + PageLocation { + offset: 20, + compressed_page_size: 10, + first_row_index: 4, + }, + ], + unencoded_byte_array_data_bytes: None, + }] + } #[test] // Verify that the size of RowGroupDecoderState does not grow too large fn test_structure_size() { - assert_eq!(std::mem::size_of::(), 200); + assert_eq!(std::mem::size_of::(), 256); } } diff --git a/parquet/tests/arrow_reader/row_filter/async.rs b/parquet/tests/arrow_reader/row_filter/async.rs index 66840bb8147b..2f50bcc7ef45 100644 --- a/parquet/tests/arrow_reader/row_filter/async.rs +++ b/parquet/tests/arrow_reader/row_filter/async.rs @@ -24,8 +24,8 @@ use arrow::{ datatypes::{Int32Type, TimestampNanosecondType}, }; use arrow_array::{ - ArrayRef, BooleanArray, Int8Array, Int32Array, Int64Array, RecordBatch, Scalar, StringArray, - StructArray, + ArrayRef, BooleanArray, Int8Array, Int32Array, Int64Array, ListArray, RecordBatch, Scalar, + StringArray, StructArray, }; use arrow_schema::{DataType, Field, Schema}; use bytes::Bytes; @@ -526,19 +526,133 @@ async fn test_predicate_pushdown_with_skipped_pages() { } } -/// Regression test: when multiple predicates are used, the first predicate's -/// override of the selection strategy (to Mask) must NOT carry forward to -/// subsequent predicates. Each predicate must get a fresh Auto policy so the -/// override can detect page skipping for that predicate's specific columns. 
-/// -/// Scenario: -/// - Dense initial RowSelection (alternating select/skip) covers all pages → Auto resolves to Mask -/// - Predicate 1 evaluates on column A, narrows selection to skip middle pages -/// - Predicate 2's column B is fetched sparsely with the narrowed selection (missing middle pages) -/// - Without the fix, the override for predicate 2 returns early (policy=Mask, not Auto), -/// so Mask is used and tries to read missing pages → "Invalid offset" error +/// Regression test for explicit mask predicate pushdown attempting to read skipped pages. +/// Related issue: https://github.com/apache/arrow-rs/issues/9239 #[tokio::test] -async fn test_multi_predicate_mask_policy_carryover() { +async fn test_explicit_mask_predicate_pushdown_with_skipped_pages() { + use arrow_array::TimestampNanosecondArray; + use arrow_schema::TimeUnit; + + const TIME_IN_RANGE_START: i64 = 1_704_092_400_000_000_000; + const TIME_IN_RANGE_END: i64 = 1_704_110_400_000_000_000; + const TIME_BEFORE_RANGE: i64 = 1_704_078_000_000_000_000; + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new("tag", DataType::Utf8, false), + ])); + + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(300)) + .set_data_page_row_count_limit(33) + .build(); + + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, schema.clone(), Some(props)).unwrap(); + + for _ in 0..2 { + for (tag_idx, tag) in ["a", "b", "c"].iter().enumerate() { + let times: Vec = (0..100) + .map(|j| { + let row_idx = tag_idx * 100 + j; + if row_idx % 2 == 0 { + TIME_IN_RANGE_START + (j as i64 * 1_000_000) + } else { + TIME_BEFORE_RANGE + (j as i64 * 1_000_000) + } + }) + .collect(); + let tags: Vec<&str> = (0..100).map(|_| *tag).collect(); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(TimestampNanosecondArray::from(times)) as ArrayRef, + 
Arc::new(StringArray::from(tags)) as ArrayRef, + ], + ) + .unwrap(); + writer.write(&batch).unwrap(); + } + writer.flush().unwrap(); + } + writer.close().unwrap(); + let buffer = Bytes::from(buffer); + + for policy in [ + PageIndexPolicy::Skip, + PageIndexPolicy::Optional, + PageIndexPolicy::Required, + ] { + let reader = TestReader::new(buffer.clone()); + let options = ArrowReaderOptions::default().with_page_index_policy(policy); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(reader, options) + .await + .unwrap(); + + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let num_row_groups = builder.metadata().num_row_groups(); + + let mut selectors = Vec::new(); + for _ in 0..num_row_groups { + selectors.push(RowSelector::select(100)); + selectors.push(RowSelector::skip(100)); + selectors.push(RowSelector::select(100)); + } + let selection = RowSelection::from(selectors); + + let time_gte_predicate = + ArrowPredicateFn::new(ProjectionMask::roots(&schema_descr, [0]), |batch| { + let col = batch.column(0).as_primitive::(); + Ok(BooleanArray::from_iter( + col.iter().map(|t| t.map(|v| v >= TIME_IN_RANGE_START)), + )) + }); + + let time_lt_predicate = + ArrowPredicateFn::new(ProjectionMask::roots(&schema_descr, [0]), |batch| { + let col = batch.column(0).as_primitive::(); + Ok(BooleanArray::from_iter( + col.iter().map(|t| t.map(|v| v < TIME_IN_RANGE_END)), + )) + }); + + let row_filter = RowFilter::new(vec![ + Box::new(time_gte_predicate), + Box::new(time_lt_predicate), + ]); + let projection = ProjectionMask::roots(&schema_descr, [1]); + + let stream = builder + .with_row_filter(row_filter) + .with_row_selection(selection) + .with_projection(projection) + .with_row_selection_policy(RowSelectionPolicy::Mask) + .build() + .unwrap(); + + let batches: Vec = stream.try_collect().await.unwrap(); + let batch = concat_batches(&batches[0].schema(), &batches).unwrap(); + assert_eq!(batch.num_columns(), 1); + let expected = 
StringArray::from_iter_values( + std::iter::repeat_n("a", 50) + .chain(std::iter::repeat_n("c", 50)) + .chain(std::iter::repeat_n("a", 50)) + .chain(std::iter::repeat_n("c", 50)), + ); + assert_eq!(batch.column(0).as_string(), &expected); + } +} + +/// Regression test: Auto falls back to selectors when an earlier predicate +/// prunes away whole pages. Explicit Mask still exercises sparse loaded ranges +/// in the tests below. +#[tokio::test] +async fn test_auto_sparse_pages_fall_back_to_selectors_across_predicates() { // 300 rows, 1 row group, 100 rows per page (3 pages) let num_rows = 300usize; let rows_per_page = 100; @@ -620,12 +734,13 @@ async fn test_multi_predicate_mask_policy_carryover() { .with_row_filter(row_filter) .with_row_selection(selection) .with_projection(projection) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }) .with_max_predicate_cache_size(0) .build() .unwrap(); - // Without the fix, this panics with: - // "Invalid offset in sparse column chunk data: ..., no matching page found." + // This exercises Auto after page pruning. Without the Auto sparse-page gate, + // the second predicate would use a sparse mask and can regress heavily. 
let batches: Vec = stream.try_collect().await.unwrap(); let batch = concat_batches(&batches[0].schema(), &batches).unwrap(); @@ -633,4 +748,203 @@ async fn test_multi_predicate_mask_policy_carryover() { // That's even-indexed rows in [0,100) with value<250 → rows 0,2,4,...,98 (50 rows) // Plus even-indexed rows in [200,250) with value<250 → rows 200,202,...,248 (25 rows) assert_eq!(batch.num_rows(), 75); + assert_eq!(batch.num_columns(), 2); + + let expected_filter_col = Int32Array::from(vec![0; 75]); + assert_eq!( + batch.column(0).as_primitive::(), + &expected_filter_col + ); + + let expected_values = + Int32Array::from_iter_values((0..100).step_by(2).chain((200..250).step_by(2))); + assert_eq!( + batch.column(1).as_primitive::(), + &expected_values + ); +} + +#[tokio::test] +async fn test_explicit_mask_final_projection_with_sparse_pages() { + let num_rows = 300usize; + let rows_per_page = 100; + + let schema = Arc::new(Schema::new(vec![ + Field::new("filter_col", DataType::Int32, false), + Field::new("value_col", DataType::Int32, false), + ])); + + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(num_rows)) + .set_data_page_row_count_limit(rows_per_page) + .set_write_batch_size(rows_per_page) + .set_dictionary_enabled(false) + .build(); + + let filter_values: Vec = (0..num_rows as i32) + .map(|i| if (100..200).contains(&i) { 1 } else { 0 }) + .collect(); + let value_values: Vec = (0..num_rows as i32).collect(); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(filter_values)) as ArrayRef, + Arc::new(Int32Array::from(value_values)) as ArrayRef, + ], + ) + .unwrap(); + + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, schema.clone(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + let buffer = Bytes::from(buffer); + + let reader = TestReader::new(buffer); + let options = 
ArrowReaderOptions::default().with_page_index_policy(PageIndexPolicy::Required); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(reader, options) + .await + .unwrap(); + + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + + let selectors: Vec = (0..num_rows / 2) + .flat_map(|_| vec![RowSelector::select(1), RowSelector::skip(1)]) + .collect(); + let selection = RowSelection::from(selectors); + + let pred1 = ArrowPredicateFn::new(ProjectionMask::roots(&schema_descr, [0]), |batch| { + let col = batch.column(0).as_primitive::(); + Ok(BooleanArray::from_iter( + col.iter().map(|v| v.map(|val| val == 0)), + )) + }); + + let pred2 = ArrowPredicateFn::new(ProjectionMask::roots(&schema_descr, [1]), |batch| { + let col = batch.column(0).as_primitive::(); + Ok(BooleanArray::from_iter( + col.iter().map(|v| v.map(|val| val < 250)), + )) + }); + + let row_filter = RowFilter::new(vec![Box::new(pred1), Box::new(pred2)]); + let projection = ProjectionMask::roots(&schema_descr, [0, 1]); + + let stream = builder + .with_row_filter(row_filter) + .with_row_selection(selection) + .with_projection(projection) + .with_row_selection_policy(RowSelectionPolicy::Mask) + .with_max_predicate_cache_size(0) + .build() + .unwrap(); + + let batches: Vec = stream.try_collect().await.unwrap(); + let batch = concat_batches(&batches[0].schema(), &batches).unwrap(); + + assert_eq!(batch.num_rows(), 75); + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.schema().field(0).name(), "filter_col"); + assert_eq!(batch.schema().field(1).name(), "value_col"); + + let expected_filter_col = Int32Array::from(vec![0; 75]); + assert_eq!( + batch.column(0).as_primitive::(), + &expected_filter_col + ); + + let expected_values = + Int32Array::from_iter_values((0..100).step_by(2).chain((200..250).step_by(2))); + assert_eq!( + batch.column(1).as_primitive::(), + &expected_values + ); +} + +#[tokio::test] +async fn test_explicit_mask_list_projection_with_sparse_pages() { + 
let num_rows = 300usize; + let rows_per_page = 100; + + let schema = Arc::new(Schema::new(vec![ + Field::new("filter_col", DataType::Int32, false), + Field::new( + "list_col", + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + true, + ), + ])); + + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(num_rows)) + .set_data_page_row_count_limit(rows_per_page) + .set_write_batch_size(rows_per_page) + .set_dictionary_enabled(false) + .build(); + + let filter_values: Vec = (0..num_rows as i32) + .map(|i| if (100..200).contains(&i) { 1 } else { 0 }) + .collect(); + let list_values = ListArray::from_iter_primitive::( + (0..num_rows as i32).map(|i| Some(vec![Some(i), Some(i + 1000)])), + ); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(filter_values)) as ArrayRef, + Arc::new(list_values) as ArrayRef, + ], + ) + .unwrap(); + + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, schema.clone(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + let buffer = Bytes::from(buffer); + + let reader = TestReader::new(buffer); + let options = ArrowReaderOptions::default().with_page_index_policy(PageIndexPolicy::Required); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(reader, options) + .await + .unwrap(); + + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + + let pred = ArrowPredicateFn::new(ProjectionMask::roots(&schema_descr, [0]), |batch| { + let col = batch.column(0).as_primitive::(); + Ok(BooleanArray::from_iter( + col.iter().map(|v| v.map(|val| val == 0)), + )) + }); + let row_filter = RowFilter::new(vec![Box::new(pred)]); + let projection = ProjectionMask::roots(&schema_descr, [1]); + + let stream = builder + .with_row_filter(row_filter) + .with_projection(projection) + .with_row_selection_policy(RowSelectionPolicy::Mask) + .build() + .unwrap(); + + let batches: Vec = 
stream.try_collect().await.unwrap(); + let batch = concat_batches(&batches[0].schema(), &batches).unwrap(); + + assert_eq!(batch.num_rows(), 200); + assert_eq!(batch.num_columns(), 1); + + let expected_indices = (0..100).chain(200..300); + let expected = ListArray::from_iter_primitive::( + expected_indices.map(|i| Some(vec![Some(i), Some(i + 1000)])), + ); + assert_eq!( + batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(), + &expected + ); } From 5592f857806ec716176515171975d67340b33e89 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sun, 10 May 2026 17:39:23 +0800 Subject: [PATCH 02/32] fix(parquet): address auto fallback review issues --- parquet/src/arrow/arrow_reader/selection.rs | 59 +++++--- parquet/src/arrow/push_decoder/mod.rs | 138 ++++++++++++------ .../arrow/push_decoder/reader_builder/data.rs | 4 - .../arrow/push_decoder/reader_builder/mod.rs | 24 ++- parquet/src/arrow/push_decoder/remaining.rs | 6 + 5 files changed, 157 insertions(+), 74 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index db4dc55c25d9..8f8f723da0c2 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -490,28 +490,47 @@ impl RowSelection { page_locations: &[PageLocation], total_rows: usize, ) -> Vec> { - let selected_byte_ranges = self.scan_ranges(page_locations); - page_locations - .iter() - .enumerate() - .filter_map(|(idx, page)| { - let start = page.offset as u64; - let end = start + page.compressed_page_size as u64; - let page_is_selected = selected_byte_ranges - .iter() - .any(|range| range.start == start && range.end == end); - if !page_is_selected { - return None; + let mut ranges = Vec::new(); + let mut selector_idx = 0; + let mut selector_start = 0usize; + + for (page_idx, page) in page_locations.iter().enumerate() { + let page_start = page.first_row_index as usize; + let page_end = page_locations + .get(page_idx + 1) + .map(|next| 
next.first_row_index as usize) + .unwrap_or(total_rows); + + while selector_idx < self.selectors.len() { + let selector_end = selector_start + self.selectors[selector_idx].row_count; + if selector_end > page_start { + break; } + selector_start = selector_end; + selector_idx += 1; + } - let row_start = page.first_row_index as usize; - let row_end = page_locations - .get(idx + 1) - .map(|next| next.first_row_index as usize) - .unwrap_or(total_rows); - Some(row_start..row_end) - }) - .collect() + let mut scan_idx = selector_idx; + let mut scan_start = selector_start; + let mut page_is_selected = false; + + while scan_idx < self.selectors.len() && scan_start < page_end { + let selector = self.selectors[scan_idx]; + let selector_end = scan_start + selector.row_count; + if !selector.skip && selector_end > page_start { + page_is_selected = true; + break; + } + scan_start = selector_end; + scan_idx += 1; + } + + if page_is_selected { + ranges.push(page_start..page_end); + } + } + + ranges } /// Splits off the first `row_count` from this [`RowSelection`] diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 8b358ad61ad8..e4634c49667d 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -401,6 +401,7 @@ impl ParquetDecoderState { ) -> Result<(Self, DecodeResult), ParquetError> { let mut current_state = self; loop { + current_state.disable_post_filter_fallback(); let (next_state, decode_result) = current_state.transition()?; // if more data is needed to transition, can't proceed further without it match decode_result { @@ -418,6 +419,11 @@ impl ParquetDecoderState { mut record_batch_reader, remaining_row_groups, } => { + // The reader API can advance to future row groups before + // the returned reader is consumed. 
Disable post-filter + // fallback before building row groups for this API; this + // materialization remains only as a guard for mixed API use + // where a post-filter reader was already active. record_batch_reader.materialize_post_filter()?; let result = DecodeResult::Data(*record_batch_reader); let next_state = Self::ReadingRowGroup { @@ -432,6 +438,15 @@ impl ParquetDecoderState { } } + fn disable_post_filter_fallback(&mut self) { + if let Self::ReadingRowGroup { + remaining_row_groups, + } = self + { + remaining_row_groups.disable_post_filter_fallback(); + } + } + /// Current state --> next state + output /// /// This function is called to get the next RecordBatch @@ -1090,43 +1105,27 @@ mod test { .build() .unwrap(); - let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + let batch = next_batch_with_data(&mut decoder, data).unwrap(); assert_eq!(predicate_rows.load(Ordering::Relaxed), 100); - assert_eq!( - reader.next().unwrap().unwrap(), - TEST_BATCH.slice(0, 100).project(&[2]).unwrap() - ); - assert!(reader.next().is_none()); + assert_eq!(batch, TEST_BATCH.slice(0, 100).project(&[2]).unwrap()); - let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + let batch = next_batch_with_data(&mut decoder, data).unwrap(); assert_eq!(predicate_rows.load(Ordering::Relaxed), 200); - assert_eq!( - reader.next().unwrap().unwrap(), - TEST_BATCH.slice(100, 100).project(&[2]).unwrap() - ); - assert!(reader.next().is_none()); + assert_eq!(batch, TEST_BATCH.slice(100, 100).project(&[2]).unwrap()); - let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + let batch = next_batch_with_data(&mut decoder, data).unwrap(); assert_eq!( predicate_rows.load(Ordering::Relaxed), 300, - "fallback should evaluate predicates before returning the reader to preserve row-group order" - ); - assert_eq!( - reader.next().unwrap().unwrap(), - TEST_BATCH.slice(200, 100).project(&[2]).unwrap() + "fallback should evaluate predicates while producing the 
current row group" ); + assert_eq!(batch, TEST_BATCH.slice(200, 100).project(&[2]).unwrap()); assert_eq!(predicate_rows.load(Ordering::Relaxed), 300); - assert!(reader.next().is_none()); - let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + let batch = next_batch_with_data(&mut decoder, data).unwrap(); assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); - assert_eq!( - reader.next().unwrap().unwrap(), - TEST_BATCH.slice(300, 100).project(&[2]).unwrap() - ); + assert_eq!(batch, TEST_BATCH.slice(300, 100).project(&[2]).unwrap()); assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); - assert!(reader.next().is_none()); assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(0)); @@ -1135,6 +1134,48 @@ mod test { metrics.fallback_high_selectivity_no_pruning_count(), Some(1) ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_try_next_reader_skips_post_filter_fallback() { + let data = &FALLBACK_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| { + let scalar_neg_one = Int64Array::new_scalar(-1); + let column = batch.column(0).as_primitive::(); + gt(column, &scalar_neg_one) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + assert_eq!( + 
reader.next().unwrap().unwrap(), + TEST_BATCH + .slice(row_group_idx * 100, 100) + .project(&[2]) + .unwrap() + ); + assert!(reader.next().is_none()); + } + + assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(0)); assert!(next_reader_with_data(&mut decoder, data).is_none()); } @@ -1166,34 +1207,32 @@ mod test { .unwrap(); for row_group_idx in 0..2 { - let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + let batch = next_batch_with_data(&mut decoder, data).unwrap(); assert_eq!( predicate_rows.load(Ordering::Relaxed), (row_group_idx + 1) * 100 ); assert_eq!( - reader.next().unwrap().unwrap(), + batch, expected_c_not_multiple_of_three(row_group_idx * 100, 100) ); - assert!(reader.next().is_none()); } for row_group_idx in 2..4 { - let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + let batch = next_batch_with_data(&mut decoder, data).unwrap(); assert_eq!( predicate_rows.load(Ordering::Relaxed), (row_group_idx + 1) * 100, - "fallback should evaluate predicates before returning the reader to preserve row-group order" + "fallback should evaluate predicates while producing the current row group" ); assert_eq!( - reader.next().unwrap().unwrap(), + batch, expected_c_not_multiple_of_three(row_group_idx * 100, 100) ); assert_eq!( predicate_rows.load(Ordering::Relaxed), (row_group_idx + 1) * 100 ); - assert!(reader.next().is_none()); } assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); @@ -1203,7 +1242,7 @@ mod test { metrics.fallback_fragmented_high_selectivity_materialization_count(), Some(1) ); - assert!(next_reader_with_data(&mut decoder, data).is_none()); + assert!(next_batch_with_data(&mut decoder, data).is_none()); } #[test] @@ -1234,20 +1273,16 @@ mod test { .unwrap(); for row_group_idx in 0..4 { - let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + let batch = next_batch_with_data(&mut decoder, data).unwrap(); assert_eq!( predicate_rows.load(Ordering::Relaxed), (row_group_idx + 1) * 
100 ); - assert_eq!( - reader.next().unwrap().unwrap(), - expected_c_multiple_of_ten(row_group_idx * 100, 100) - ); + assert_eq!(batch, expected_c_multiple_of_ten(row_group_idx * 100, 100)); assert_eq!( predicate_rows.load(Ordering::Relaxed), (row_group_idx + 1) * 100 ); - assert!(reader.next().is_none()); } assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); @@ -1257,7 +1292,7 @@ mod test { metrics.fallback_fragmented_moderate_selectivity_count(), Some(1) ); - assert!(next_reader_with_data(&mut decoder, data).is_none()); + assert!(next_batch_with_data(&mut decoder, data).is_none()); } #[test] @@ -1286,12 +1321,8 @@ mod test { .build() .unwrap(); - let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); - assert_eq!( - reader.next().unwrap().unwrap(), - TEST_BATCH.slice(0, 100).project(&[0, 2]).unwrap() - ); - assert!(reader.next().is_none()); + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!(batch, TEST_BATCH.slice(0, 100).project(&[0, 2]).unwrap()); assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(1)); @@ -1916,6 +1947,21 @@ mod test { } } + fn next_batch_with_data(decoder: &mut ParquetPushDecoder, data: &Bytes) -> Option { + loop { + match decoder + .try_decode() + .expect("decoder should produce a batch or request data") + { + DecodeResult::NeedsData(ranges) => { + push_ranges_to_decoder_with_data(decoder, ranges, data); + } + DecodeResult::Data(batch) => return Some(batch), + DecodeResult::Finished => return None, + } + } + } + /// Expect that the [`DecodeResult`] is a [`DecodeResult::Data`] and return the corresponding element fn expect_data(result: Result, ParquetError>) -> T { match result.expect("Expected Ok(DecodeResult::Data(T))") { diff --git a/parquet/src/arrow/push_decoder/reader_builder/data.rs b/parquet/src/arrow/push_decoder/reader_builder/data.rs index af7cb306aa2c..6fbc2090b06e 100644 --- 
a/parquet/src/arrow/push_decoder/reader_builder/data.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/data.rs @@ -54,10 +54,6 @@ impl DataRequest { .collect() } - pub(super) fn into_column_chunks(self) -> Vec>> { - self.column_chunks - } - /// Returns the chunks from the buffers that satisfy this request fn get_chunks(&self, buffers: &PushBuffers) -> Result, ParquetError> { self.ranges diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index 127c09d2d4f5..a83a48af79ba 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -204,6 +204,9 @@ pub(crate) struct RowGroupReaderBuilder { #[allow(dead_code)] fallback_state: RowGroupFallbackState, + /// Whether this builder may switch Auto policy to post-filter fallback. + post_filter_fallback_enabled: bool, + /// Current state of the decoder. /// /// It is taken when processing, and must be put back before returning @@ -243,6 +246,7 @@ impl RowGroupReaderBuilder { max_predicate_cache_size, row_selection_policy, fallback_state: RowGroupFallbackState::default(), + post_filter_fallback_enabled: true, state: Some(RowGroupDecoderState::Finished), buffers, } @@ -263,6 +267,12 @@ impl RowGroupReaderBuilder { self.buffers.clear_all_ranges(); } + /// Disable post-filter fallback for APIs that hand row-group readers back to + /// callers before they are consumed. + pub(crate) fn disable_post_filter_fallback(&mut self) { + self.post_filter_fallback_enabled = false; + } + /// take the current state, leaving None in its place. /// /// Returns an error if there the state wasn't put back after the previous @@ -812,12 +822,16 @@ impl RowGroupReaderBuilder { row_group_info.row_count, )]) }); - let column_chunks = data_request.into_column_chunks(); + // The in-flight request may contain chunks loaded + // for a sparse predicate selection. 
The fallback + // below rebuilds a base/full-selection read plan, + // so do not reuse chunks whose page coverage no + // longer matches the requested rows. return self.start_post_selection_filter( row_group_info, selection, cache_info, - Some(column_chunks), + None, ); } @@ -1017,7 +1031,8 @@ impl RowGroupReaderBuilder { matches!( self.fallback_state, RowGroupFallbackState::UsePostFilter { .. } - ) && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) + ) && self.post_filter_fallback_enabled + && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) && self.limit.is_none() && self.offset.is_none() && !self.has_virtual_columns() @@ -1097,7 +1112,8 @@ impl RowGroupReaderBuilder { let Some(filter) = self.filter.as_ref() else { return false; }; - matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) + self.post_filter_fallback_enabled + && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) && self.limit.is_none() && self.offset.is_none() && !self.has_virtual_columns() diff --git a/parquet/src/arrow/push_decoder/remaining.rs b/parquet/src/arrow/push_decoder/remaining.rs index 2986ca0da8d8..09a87e763121 100644 --- a/parquet/src/arrow/push_decoder/remaining.rs +++ b/parquet/src/arrow/push_decoder/remaining.rs @@ -75,6 +75,12 @@ impl RemainingRowGroups { self.row_group_reader_builder.clear_all_ranges(); } + /// Prevent Auto selection from switching to post-filter fallback for reader + /// handoff APIs. 
+ pub(crate) fn disable_post_filter_fallback(&mut self) { + self.row_group_reader_builder.disable_post_filter_fallback(); + } + /// returns [`ParquetRecordBatchReader`] suitable for reading the next /// group of rows from the Parquet data, or the list of data ranges still /// needed to proceed From 22b291188c3b4e1b367c19b03665bd9f091e5af1 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sun, 10 May 2026 17:58:57 +0800 Subject: [PATCH 03/32] fix(parquet): harden auto fallback review fixes --- parquet/src/arrow/arrow_reader/metrics.rs | 49 ++--------- parquet/src/arrow/arrow_reader/read_plan.rs | 37 +------- parquet/src/arrow/arrow_reader/selection.rs | 30 +------ parquet/src/arrow/push_decoder/mod.rs | 49 ++++++++++- .../arrow/push_decoder/reader_builder/mod.rs | 87 +++++++++++++++---- 5 files changed, 132 insertions(+), 120 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/metrics.rs b/parquet/src/arrow/arrow_reader/metrics.rs index 506c9b738970..72ac792beb3a 100644 --- a/parquet/src/arrow/arrow_reader/metrics.rs +++ b/parquet/src/arrow/arrow_reader/metrics.rs @@ -205,24 +205,9 @@ impl ArrowReaderMetrics { self.load(|inner| &inner.fallback_fragmented_moderate_selectivity_count) } - /// Fallback: number of fragmented high-selectivity materialization triggers - pub fn fallback_fragmented_high_selectivity_materialization_count(&self) -> Option { - self.load(|inner| &inner.fallback_fragmented_high_selectivity_materialization_count) - } - - /// Fallback: number of fragmented high-selectivity output-dominates triggers - pub fn fallback_fragmented_high_selectivity_output_dominates_count(&self) -> Option { - self.load(|inner| &inner.fallback_fragmented_high_selectivity_output_dominates_count) - } - - /// Fallback: number of fragmented high-selectivity cache-miss triggers - pub fn fallback_fragmented_high_selectivity_cache_miss_count(&self) -> Option { - self.load(|inner| &inner.fallback_fragmented_high_selectivity_cache_miss_count) - } - - /// Fallback: number of 
fragmented high-selectivity cache-rejected triggers - pub fn fallback_fragmented_high_selectivity_cache_rejected_count(&self) -> Option { - self.load(|inner| &inner.fallback_fragmented_high_selectivity_cache_rejected_count) + /// Fallback: number of fragmented high-selectivity triggers + pub fn fallback_fragmented_high_selectivity_count(&self) -> Option { + self.load(|inner| &inner.fallback_fragmented_high_selectivity_count) } /// Increments the count of records read from the inner reader @@ -335,17 +320,8 @@ impl ArrowReaderMetrics { FallbackTriggerReason::FragmentedModerateSelectivity => { &inner.fallback_fragmented_moderate_selectivity_count } - FallbackTriggerReason::FragmentedHighSelectivityMaterialization => { - &inner.fallback_fragmented_high_selectivity_materialization_count - } - FallbackTriggerReason::FragmentedHighSelectivityOutputDominates => { - &inner.fallback_fragmented_high_selectivity_output_dominates_count - } - FallbackTriggerReason::FragmentedHighSelectivityCacheMiss => { - &inner.fallback_fragmented_high_selectivity_cache_miss_count - } - FallbackTriggerReason::FragmentedHighSelectivityCacheRejected => { - &inner.fallback_fragmented_high_selectivity_cache_rejected_count + FallbackTriggerReason::FragmentedHighSelectivity => { + &inner.fallback_fragmented_high_selectivity_count } FallbackTriggerReason::ObservationIncomplete => { &inner.fallback_observation_incomplete_count @@ -422,14 +398,8 @@ pub struct ArrowReaderMetricsInner { fallback_high_selectivity_no_pruning_count: AtomicUsize, /// Number of fragmented moderate-selectivity fallback triggers fallback_fragmented_moderate_selectivity_count: AtomicUsize, - /// Number of fragmented high-selectivity materialization fallback triggers - fallback_fragmented_high_selectivity_materialization_count: AtomicUsize, - /// Number of fragmented high-selectivity output-dominates fallback triggers - fallback_fragmented_high_selectivity_output_dominates_count: AtomicUsize, - /// Number of fragmented 
high-selectivity cache-miss fallback triggers - fallback_fragmented_high_selectivity_cache_miss_count: AtomicUsize, - /// Number of fragmented high-selectivity cache-rejected fallback triggers - fallback_fragmented_high_selectivity_cache_rejected_count: AtomicUsize, + /// Number of fragmented high-selectivity fallback triggers + fallback_fragmented_high_selectivity_count: AtomicUsize, } impl ArrowReaderMetricsInner { @@ -461,10 +431,7 @@ impl ArrowReaderMetricsInner { fallback_pushdown_still_preferred_count: AtomicUsize::new(0), fallback_high_selectivity_no_pruning_count: AtomicUsize::new(0), fallback_fragmented_moderate_selectivity_count: AtomicUsize::new(0), - fallback_fragmented_high_selectivity_materialization_count: AtomicUsize::new(0), - fallback_fragmented_high_selectivity_output_dominates_count: AtomicUsize::new(0), - fallback_fragmented_high_selectivity_cache_miss_count: AtomicUsize::new(0), - fallback_fragmented_high_selectivity_cache_rejected_count: AtomicUsize::new(0), + fallback_fragmented_high_selectivity_count: AtomicUsize::new(0), } } } diff --git a/parquet/src/arrow/arrow_reader/read_plan.rs b/parquet/src/arrow/arrow_reader/read_plan.rs index dba336fe0eed..7226a1457124 100644 --- a/parquet/src/arrow/arrow_reader/read_plan.rs +++ b/parquet/src/arrow/arrow_reader/read_plan.rs @@ -669,16 +669,11 @@ mod tests { selected_run_count: 64, skipped_run_count: 32, }, - predicate_evaluate_nanos: 10, - output_read_nanos: 20, - output_materialize_nanos: 50, - cache_miss_count: 0, - cache_insert_rejected_count: 0, }; assert_eq!( observation.trigger_reason(), - FallbackTriggerReason::FragmentedHighSelectivityMaterialization + FallbackTriggerReason::FragmentedHighSelectivity ); } @@ -697,11 +692,6 @@ mod tests { selected_run_count: 32, skipped_run_count: 32, }, - predicate_evaluate_nanos: 10, - output_read_nanos: 20, - output_materialize_nanos: 50, - cache_miss_count: 0, - cache_insert_rejected_count: 0, }; assert_eq!( @@ -725,11 +715,6 @@ mod tests { 
selected_run_count: 2, skipped_run_count: 0, }, - predicate_evaluate_nanos: 0, - output_read_nanos: 0, - output_materialize_nanos: 0, - cache_miss_count: 0, - cache_insert_rejected_count: 0, }; assert_eq!( @@ -753,11 +738,6 @@ mod tests { selected_run_count: 30, skipped_run_count: 30, }, - predicate_evaluate_nanos: 0, - output_read_nanos: 0, - output_materialize_nanos: 0, - cache_miss_count: 0, - cache_insert_rejected_count: 0, }; assert_eq!( @@ -781,11 +761,6 @@ mod tests { selected_run_count: 9, skipped_run_count: 9, }, - predicate_evaluate_nanos: 0, - output_read_nanos: 0, - output_materialize_nanos: 0, - cache_miss_count: 0, - cache_insert_rejected_count: 0, }; assert_eq!( @@ -809,11 +784,6 @@ mod tests { selected_run_count: 3_084, skipped_run_count: 3_084, }, - predicate_evaluate_nanos: 0, - output_read_nanos: 0, - output_materialize_nanos: 0, - cache_miss_count: 0, - cache_insert_rejected_count: 0, }; assert_eq!( @@ -837,11 +807,6 @@ mod tests { selected_run_count: 4, skipped_run_count: 4, }, - predicate_evaluate_nanos: 10, - output_read_nanos: 20, - output_materialize_nanos: 50, - cache_miss_count: 0, - cache_insert_rejected_count: 0, }; assert_eq!( diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 8f8f723da0c2..c09633a3e81c 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -177,10 +177,7 @@ impl RowSelectionShape { pub(crate) enum FallbackTriggerReason { HighSelectivityNoPruning, FragmentedModerateSelectivity, - FragmentedHighSelectivityMaterialization, - FragmentedHighSelectivityOutputDominates, - FragmentedHighSelectivityCacheMiss, - FragmentedHighSelectivityCacheRejected, + FragmentedHighSelectivity, ObservationIncomplete, PushdownStillPreferred, ForcedPolicy, @@ -190,11 +187,6 @@ pub(crate) enum FallbackTriggerReason { pub(crate) struct FallbackObservation { pub(crate) observed_row_groups: usize, pub(crate) shape: RowSelectionShape, - 
pub(crate) predicate_evaluate_nanos: usize, - pub(crate) output_read_nanos: usize, - pub(crate) output_materialize_nanos: usize, - pub(crate) cache_miss_count: usize, - pub(crate) cache_insert_rejected_count: usize, } impl FallbackObservation { @@ -227,20 +219,7 @@ impl FallbackObservation { return FallbackTriggerReason::PushdownStillPreferred; } - if self.output_materialize_nanos >= self.predicate_evaluate_nanos { - return FallbackTriggerReason::FragmentedHighSelectivityMaterialization; - } - if self.output_materialize_nanos > self.output_read_nanos { - return FallbackTriggerReason::FragmentedHighSelectivityOutputDominates; - } - if self.cache_miss_count > 0 { - return FallbackTriggerReason::FragmentedHighSelectivityCacheMiss; - } - if self.cache_insert_rejected_count > 0 { - return FallbackTriggerReason::FragmentedHighSelectivityCacheRejected; - } - - FallbackTriggerReason::PushdownStillPreferred + FallbackTriggerReason::FragmentedHighSelectivity } #[allow(dead_code)] @@ -249,10 +228,7 @@ impl FallbackObservation { self.trigger_reason(), FallbackTriggerReason::HighSelectivityNoPruning | FallbackTriggerReason::FragmentedModerateSelectivity - | FallbackTriggerReason::FragmentedHighSelectivityMaterialization - | FallbackTriggerReason::FragmentedHighSelectivityOutputDominates - | FallbackTriggerReason::FragmentedHighSelectivityCacheMiss - | FallbackTriggerReason::FragmentedHighSelectivityCacheRejected + | FallbackTriggerReason::FragmentedHighSelectivity ) } } diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index e4634c49667d..8ed21f83e8bb 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1239,7 +1239,7 @@ mod test { assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(0)); assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(4)); assert_eq!( - metrics.fallback_fragmented_high_selectivity_materialization_count(), + 
metrics.fallback_fragmented_high_selectivity_count(), Some(1) ); assert!(next_batch_with_data(&mut decoder, data).is_none()); @@ -1329,6 +1329,53 @@ mod test { assert_eq!(metrics.records_read_from_cache(), Some(100)); } + #[test] + fn test_decoder_auto_fallback_with_row_selection_does_not_evaluate_current_row_group_twice() { + let data = &FALLBACK_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + let scalar_neg_one = Int64Array::new_scalar(-1); + let column = batch.column(0).as_primitive::(); + gt(column, &scalar_neg_one) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection(RowSelection::from(vec![RowSelector::select(400)])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + 100, + "fallback observation must not re-run the predicate for the same row group" + ); + assert_eq!(batch, TEST_BATCH.slice(0, 100).project(&[2]).unwrap()); + + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 200); + assert_eq!(batch, TEST_BATCH.slice(100, 100).project(&[2]).unwrap()); + + assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); + 
assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(1)); + } + #[test] fn test_decoder_empty_filters() { let builder = diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index a83a48af79ba..df8caab3e002 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -835,14 +835,15 @@ impl RowGroupReaderBuilder { ); } - let filter = self.filter.take().ok_or_else(|| { - ParquetError::General( - "post-filter fallback selected without a row filter".to_string(), - ) - })?; - let filter = Arc::new(Mutex::new(filter)); - self.post_filter = Some(Arc::clone(&filter)); - return self.start_post_filter(row_group_info, filter); + if self.post_filter.is_none() { + let filter = self.filter.take().ok_or_else(|| { + ParquetError::General( + "post-filter fallback selected without a row filter" + .to_string(), + ) + })?; + self.post_filter = Some(Arc::new(Mutex::new(filter))); + } } self.metrics @@ -1309,15 +1310,26 @@ fn loaded_ranges_for_projection( fn intersect_ranges(left: Vec>, right: Vec>) -> Vec> { let mut out = Vec::new(); - for l in &left { - for r in &right { - let start = l.start.max(r.start); - let end = l.end.min(r.end); - if start < end { - out.push(start..end); - } + let mut left_idx = 0; + let mut right_idx = 0; + + while left_idx < left.len() && right_idx < right.len() { + let l = &left[left_idx]; + let r = &right[right_idx]; + let start = l.start.max(r.start); + let end = l.end.min(r.end); + + if start < end { + out.push(start..end); + } + + if l.end <= r.end { + left_idx += 1; + } else { + right_idx += 1; } } + out } @@ -1497,6 +1509,36 @@ mod tests { assert!(cursor.is_sparse()); } + #[test] + fn test_loaded_ranges_intersects_many_ranges_across_projected_columns() { + let selection = RowSelection::from(vec![ + RowSelector::skip(10), + 
RowSelector::select(1), + RowSelector::skip(39), + RowSelector::select(1), + RowSelector::skip(39), + RowSelector::select(1), + RowSelector::skip(9), + ]); + let offset_index = vec![ + offset_index_column(&[0, 20, 40, 60, 80]), + offset_index_column(&[0, 15, 35, 55, 75]), + offset_index_column(&[0, 10, 30, 50, 70, 90]), + ]; + + let loaded = loaded_ranges_for_projection( + Some(&selection), + &ProjectionMask::all(), + Some(&offset_index), + 100, + ); + + assert_eq!( + loaded, + Some(LoadedRowRanges::new(vec![10..15, 50..55, 90..100], 100)) + ); + } + #[test] fn test_auto_expensive_fragmented_output_prefers_selectors() { let selection = q38_like_fragmented_selection(); @@ -1647,6 +1689,21 @@ mod tests { }] } + fn offset_index_column(first_rows: &[i64]) -> OffsetIndexMetaData { + OffsetIndexMetaData { + page_locations: first_rows + .iter() + .enumerate() + .map(|(idx, first_row_index)| PageLocation { + offset: (idx * 10) as i64, + compressed_page_size: 10, + first_row_index: *first_row_index, + }) + .collect(), + unencoded_byte_array_data_bytes: None, + } + } + #[test] // Verify that the size of RowGroupDecoderState does not grow too large fn test_structure_size() { From 713979e3be45c42c4dd327c08ce039c06ff4cc27 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sun, 10 May 2026 19:11:50 +0800 Subject: [PATCH 04/32] fix(parquet): address CI failures --- parquet/benches/arrow_reader_row_filter.rs | 6 +- parquet/src/arrow/arrow_reader/read_plan.rs | 56 ++++++++----------- parquet/src/arrow/arrow_reader/selection.rs | 12 ++-- parquet/src/arrow/push_decoder/mod.rs | 2 +- .../arrow/push_decoder/reader_builder/data.rs | 12 ++++ .../arrow/push_decoder/reader_builder/mod.rs | 21 ++++--- 6 files changed, 51 insertions(+), 58 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 49cb1d92349e..b20781d82f2b 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ 
-782,11 +782,7 @@ fn projection_names(projection: &[usize]) -> Vec<&'static str> { } fn row_filter_for(filter_type: FilterType, pred_mask: ProjectionMask) -> RowFilter { - let filter = - ArrowPredicateFn::new( - pred_mask, - move |batch| Ok(filter_type.filter_batch(&batch)?), - ); + let filter = ArrowPredicateFn::new(pred_mask, move |batch| filter_type.filter_batch(&batch)); RowFilter::new(vec![Box::new(filter)]) } diff --git a/parquet/src/arrow/arrow_reader/read_plan.rs b/parquet/src/arrow/arrow_reader/read_plan.rs index 7226a1457124..a54daa58f285 100644 --- a/parquet/src/arrow/arrow_reader/read_plan.rs +++ b/parquet/src/arrow/arrow_reader/read_plan.rs @@ -623,20 +623,28 @@ mod tests { builder: ReadPlanBuilder, strategy: RowSelectionStrategy, reason: RowSelectionStrategyReason, + expected_shape: RowSelectionShape, + ) { + let decision = builder.resolve_selection_strategy_decision(); + assert_eq!(decision.strategy, strategy); + assert_eq!(decision.reason, reason); + assert_eq!(decision.shape, expected_shape); + } + + fn shape( selected_rows: usize, skipped_rows: usize, selector_count: usize, selected_run_count: usize, skipped_run_count: usize, - ) { - let decision = builder.resolve_selection_strategy_decision(); - assert_eq!(decision.strategy, strategy); - assert_eq!(decision.reason, reason); - assert_eq!(decision.shape.selected_rows, selected_rows); - assert_eq!(decision.shape.skipped_rows, skipped_rows); - assert_eq!(decision.shape.selector_count, selector_count); - assert_eq!(decision.shape.selected_run_count, selected_run_count); - assert_eq!(decision.shape.skipped_run_count, skipped_run_count); + ) -> RowSelectionShape { + RowSelectionShape { + selected_rows, + skipped_rows, + selector_count, + selected_run_count, + skipped_run_count, + } } #[test] @@ -825,11 +833,7 @@ mod tests { builder, RowSelectionStrategy::Mask, RowSelectionStrategyReason::ForcedMask, - 8, - 2, - 2, - 1, - 1, + shape(8, 2, 2, 1, 1), ); } @@ -843,11 +847,7 @@ mod tests { builder, 
RowSelectionStrategy::Selectors, RowSelectionStrategyReason::ForcedSelectors, - 8, - 2, - 2, - 1, - 1, + shape(8, 2, 2, 1, 1), ); } @@ -860,11 +860,7 @@ mod tests { builder, RowSelectionStrategy::Mask, RowSelectionStrategyReason::AutoMaskEmptySelection, - 0, - 0, - 0, - 0, - 0, + shape(0, 0, 0, 0, 0), ); } @@ -877,11 +873,7 @@ mod tests { builder, RowSelectionStrategy::Mask, RowSelectionStrategyReason::AutoMaskShortRuns, - 8, - 8, - 2, - 1, - 1, + shape(8, 8, 2, 1, 1), ); } @@ -895,11 +887,7 @@ mod tests { builder, RowSelectionStrategy::Selectors, RowSelectionStrategyReason::AutoSelectorLongRuns, - 3, - 3, - 2, - 1, - 1, + shape(3, 3, 2, 1, 1), ); } diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index c09633a3e81c..de3c453d2d3d 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -210,9 +210,7 @@ impl FallbackObservation { } let selected_ratio = shape.selected_ratio(); - if selected_ratio >= Self::FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO - && selected_ratio < 0.50 - { + if (Self::FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO..0.50).contains(&selected_ratio) { return FallbackTriggerReason::FragmentedModerateSelectivity; } if selected_ratio < 0.50 { @@ -1293,7 +1291,7 @@ impl SparseMaskCursor { /// [`ReadPlan`](crate::arrow::arrow_reader::ReadPlan). /// /// This keeps per-reader state such as the current position and delegates dense -/// or sparse mask state to [`MaskCursor`]. +/// or sparse mask state to the mask cursor. 
#[derive(Debug)] pub enum RowSelectionCursor { /// Reading all rows @@ -1355,10 +1353,10 @@ mod tests { #[test] fn test_loaded_row_ranges_detects_sparse_ranges() { - assert!(!LoadedRowRanges::new(vec![0..6], 6).is_sparse()); + assert!(!LoadedRowRanges::new(std::iter::once(0..6).collect(), 6).is_sparse()); assert!(!LoadedRowRanges::new(vec![], 0).is_sparse()); assert!(LoadedRowRanges::new(vec![0..2, 4..6], 6).is_sparse()); - assert!(LoadedRowRanges::new(vec![1..6], 6).is_sparse()); + assert!(LoadedRowRanges::new(std::iter::once(1..6).collect(), 6).is_sparse()); } #[test] @@ -1397,7 +1395,7 @@ mod tests { fn test_sparse_mask_cursor_errors_selected_rows_after_loaded_ranges() { let selection = RowSelection::from(vec![RowSelector::skip(5), RowSelector::select(1)]); - let loaded = LoadedRowRanges::new(vec![0..2], 6); + let loaded = LoadedRowRanges::new(std::iter::once(0..2).collect(), 6); let selectors: Vec = selection.into(); let mut cursor = SparseMaskCursor::new(selectors, loaded); diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 8ed21f83e8bb..072f4652743e 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1906,7 +1906,7 @@ mod test { let mut metadata_decoder = ParquetMetaDataPushDecoder::try_new(data.len() as u64).unwrap(); push_ranges_to_metadata_decoder_with_data( &mut metadata_decoder, - vec![0..data.len() as u64], + std::iter::once(0..data.len() as u64).collect(), data, ); let metadata = metadata_decoder.try_decode().unwrap(); diff --git a/parquet/src/arrow/push_decoder/reader_builder/data.rs b/parquet/src/arrow/push_decoder/reader_builder/data.rs index 6fbc2090b06e..498f47981864 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/data.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/data.rs @@ -110,6 +110,18 @@ impl DataRequest { Ok(in_memory_row_group) } + + /// Return previously loaded column chunks if they are all dense. 
+ /// + /// Sparse chunks may only contain pages for the predicate selection and are + /// unsafe to reuse for a fallback read over the base selection. + pub fn into_dense_column_chunks(self) -> Option>>> { + self.column_chunks + .iter() + .flatten() + .all(|chunk| matches!(chunk.as_ref(), ColumnChunkData::Dense { .. })) + .then_some(self.column_chunks) + } } /// Builder for [`DataRequest`] diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index df8caab3e002..5acce23d0ac9 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -822,16 +822,15 @@ impl RowGroupReaderBuilder { row_group_info.row_count, )]) }); - // The in-flight request may contain chunks loaded - // for a sparse predicate selection. The fallback - // below rebuilds a base/full-selection read plan, - // so do not reuse chunks whose page coverage no - // longer matches the requested rows. + let column_chunks = data_request.into_dense_column_chunks(); + // Sparse predicate chunks may not cover the base + // selection. Dense chunks are safe to reuse and + // preserve predicate-cache IO behavior. 
return self.start_post_selection_filter( row_group_info, selection, cache_info, - None, + column_chunks, ); } @@ -1339,11 +1338,11 @@ fn coalesce_adjacent_ranges(ranges: Vec>) -> Vec> { if range.is_empty() { continue; } - if let Some(last) = out.last_mut() - && last.end == range.start - { - last.end = range.end; - continue; + if let Some(last) = out.last_mut() { + if last.end == range.start { + last.end = range.end; + continue; + } } out.push(range); } From a5f3a176b1133c8e14b95bb88f28ad2599fddcf8 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sun, 10 May 2026 22:18:03 +0800 Subject: [PATCH 05/32] refactor(parquet): split row filter fallback helpers --- parquet/src/arrow/arrow_reader/mod.rs | 189 +---------- parquet/src/arrow/arrow_reader/post_filter.rs | 212 ++++++++++++ .../push_decoder/reader_builder/fallback.rs | 154 +++++++++ .../arrow/push_decoder/reader_builder/mod.rs | 319 +----------------- .../reader_builder/selection_policy.rs | 202 +++++++++++ 5 files changed, 585 insertions(+), 491 deletions(-) create mode 100644 parquet/src/arrow/arrow_reader/post_filter.rs create mode 100644 parquet/src/arrow/push_decoder/reader_builder/fallback.rs create mode 100644 parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 4b1a042a828a..9877417f0884 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -18,12 +18,12 @@ //! 
Contains reader which reads parquet data into arrow [`RecordBatch`] use arrow_array::cast::AsArray; -use arrow_array::{Array, BooleanArray, RecordBatch, RecordBatchReader}; -use arrow_buffer::BooleanBuffer; +use arrow_array::{Array, RecordBatch, RecordBatchReader}; use arrow_schema::{ArrowError, DataType as ArrowType, FieldRef, Schema, SchemaRef}; use arrow_select::concat::concat_batches; use arrow_select::filter::filter_record_batch; pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter}; +use post_filter::{PostFilterState, PostSelectionFilterState}; pub use selection::{RowSelection, RowSelectionCursor, RowSelectionPolicy, RowSelector}; use std::collections::VecDeque; use std::fmt::{Debug, Formatter}; @@ -56,6 +56,7 @@ pub use read_plan::{PredicateOptions, ReadPlan, ReadPlanBuilder}; mod filter; pub mod metrics; +mod post_filter; mod read_plan; pub(crate) mod selection; pub mod statistics; @@ -1351,190 +1352,6 @@ pub struct ParquetRecordBatchReader { buffered_batches: Option>, } -#[derive(Debug)] -struct PostFilterState { - filter: Arc>, - predicate_projection_indices: Vec>, - predicate_projection_schemas: Vec, - output_projection_indices: Vec, - output_schema: SchemaRef, -} - -impl PostFilterState { - fn try_new( - filter: Arc>, - parquet_schema: &SchemaDescriptor, - read_schema: &Schema, - read_projection: &ProjectionMask, - output_projection: &ProjectionMask, - ) -> Result { - let filter_guard = filter.lock().map_err(|_| { - ParquetError::General("post-filter predicate state was poisoned".to_string()) - })?; - - let predicate_projection_indices = filter_guard - .predicates - .iter() - .map(|predicate| { - projection_indices(parquet_schema, read_projection, predicate.projection()) - }) - .collect::>>()?; - drop(filter_guard); - - let predicate_projection_schemas = predicate_projection_indices - .iter() - .map(|indices| read_schema.project(indices).map(SchemaRef::new)) - .collect::, _>>()?; - - let output_projection_indices = - 
projection_indices(parquet_schema, read_projection, output_projection)?; - let output_schema = SchemaRef::new(read_schema.project(&output_projection_indices)?); - - Ok(Self { - filter, - predicate_projection_indices, - predicate_projection_schemas, - output_projection_indices, - output_schema, - }) - } - - fn apply(&mut self, mut batch: RecordBatch) -> Result { - let mut filter = self.filter.lock().map_err(|_| { - ParquetError::General("post-filter predicate state was poisoned".to_string()) - })?; - - for (predicate_idx, (predicate, projection_indices)) in filter - .predicates - .iter_mut() - .zip(self.predicate_projection_indices.iter()) - .enumerate() - { - let input_rows = batch.num_rows(); - let predicate_batch = project_record_batch( - &batch, - projection_indices, - Arc::clone(&self.predicate_projection_schemas[predicate_idx]), - )?; - let predicate_filter = predicate.evaluate(predicate_batch)?; - - if predicate_filter.len() != input_rows { - return Err(general_err!( - "ArrowPredicate predicate returned {} rows, expected {input_rows}", - predicate_filter.len() - )); - } - - batch = filter_record_batch(&batch, &predicate_filter)?; - if batch.num_rows() == 0 { - break; - } - } - - Ok(project_record_batch( - &batch, - &self.output_projection_indices, - Arc::clone(&self.output_schema), - )?) 
- } -} - -#[inline(always)] -fn project_record_batch( - batch: &RecordBatch, - indices: &[usize], - schema: SchemaRef, -) -> std::result::Result { - if indices.len() == batch.num_columns() && indices.iter().copied().eq(0..batch.num_columns()) { - debug_assert_eq!(batch.schema_ref().as_ref(), schema.as_ref()); - return Ok(batch.clone()); - } - - let columns = indices - .iter() - .map(|idx| { - batch.columns().get(*idx).cloned().ok_or_else(|| { - ArrowError::SchemaError(format!( - "project index {} out of bounds, max field {}", - idx, - batch.num_columns() - )) - }) - }) - .collect::, ArrowError>>()?; - - unsafe { - // The indices and schema are produced from the same valid read schema - // at construction time, and filtering preserves column lengths. - Ok(RecordBatch::new_unchecked( - schema, - columns, - batch.num_rows(), - )) - } -} - -#[derive(Debug)] -struct PostSelectionFilterState { - mask: BooleanBuffer, - position: usize, -} - -impl PostSelectionFilterState { - fn new(selection: RowSelection) -> Self { - Self { - mask: selection.boolean_mask(), - position: 0, - } - } - - fn apply(&mut self, batch: RecordBatch) -> Result { - let input_rows = batch.num_rows(); - let end = self.position.saturating_add(input_rows); - if end > self.mask.len() { - return Err(general_err!( - "post-selection filter exceeded selection length: end {end}, selection length {}", - self.mask.len() - )); - } - - let filter = BooleanArray::from(self.mask.slice(self.position, input_rows)); - self.position = end; - Ok(filter_record_batch(&batch, &filter)?) 
- } -} - -fn projection_indices( - parquet_schema: &SchemaDescriptor, - read_projection: &ProjectionMask, - target_projection: &ProjectionMask, -) -> Result> { - let mut indices = Vec::new(); - let mut read_idx = 0; - - for leaf_idx in 0..parquet_schema.num_columns() { - if read_projection.leaf_included(leaf_idx) { - let root = parquet_schema.get_column_root(leaf_idx); - if !root.is_primitive() { - return Err(general_err!( - "post-filter fallback does not support nested read column {}", - root.name() - )); - } - if target_projection.leaf_included(leaf_idx) { - indices.push(read_idx); - } - read_idx += 1; - } else if target_projection.leaf_included(leaf_idx) { - return Err(general_err!( - "post-filter target projection includes leaf column {leaf_idx} not present in read projection" - )); - } - } - - Ok(indices) -} - impl Debug for ParquetRecordBatchReader { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.debug_struct("ParquetRecordBatchReader") diff --git a/parquet/src/arrow/arrow_reader/post_filter.rs b/parquet/src/arrow/arrow_reader/post_filter.rs new file mode 100644 index 000000000000..913d73275884 --- /dev/null +++ b/parquet/src/arrow/arrow_reader/post_filter.rs @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Post-decode filtering support for parquet row-filter fallback. + +use crate::arrow::ProjectionMask; +use crate::arrow::arrow_reader::{RowFilter, RowSelection}; +use crate::errors::{ParquetError, Result}; +use crate::schema::types::SchemaDescriptor; +use arrow_array::{BooleanArray, RecordBatch}; +use arrow_buffer::BooleanBuffer; +use arrow_schema::{ArrowError, Schema, SchemaRef}; +use arrow_select::filter::filter_record_batch; +use std::sync::{Arc, Mutex}; + +#[derive(Debug)] +pub(super) struct PostFilterState { + filter: Arc>, + predicate_projection_indices: Vec>, + predicate_projection_schemas: Vec, + output_projection_indices: Vec, + pub(super) output_schema: SchemaRef, +} + +impl PostFilterState { + pub(super) fn try_new( + filter: Arc>, + parquet_schema: &SchemaDescriptor, + read_schema: &Schema, + read_projection: &ProjectionMask, + output_projection: &ProjectionMask, + ) -> Result { + let filter_guard = filter.lock().map_err(|_| { + ParquetError::General("post-filter predicate state was poisoned".to_string()) + })?; + + let predicate_projection_indices = filter_guard + .predicates + .iter() + .map(|predicate| { + projection_indices(parquet_schema, read_projection, predicate.projection()) + }) + .collect::>>()?; + drop(filter_guard); + + let predicate_projection_schemas = predicate_projection_indices + .iter() + .map(|indices| read_schema.project(indices).map(SchemaRef::new)) + .collect::, _>>()?; + + let output_projection_indices = + projection_indices(parquet_schema, read_projection, output_projection)?; + let output_schema = SchemaRef::new(read_schema.project(&output_projection_indices)?); + + Ok(Self { + filter, + predicate_projection_indices, + predicate_projection_schemas, + output_projection_indices, + output_schema, + }) + } + + pub(super) fn apply(&mut self, mut batch: RecordBatch) -> Result { + let mut filter = 
self.filter.lock().map_err(|_| { + ParquetError::General("post-filter predicate state was poisoned".to_string()) + })?; + + for (predicate_idx, (predicate, projection_indices)) in filter + .predicates + .iter_mut() + .zip(self.predicate_projection_indices.iter()) + .enumerate() + { + let input_rows = batch.num_rows(); + let predicate_batch = project_record_batch( + &batch, + projection_indices, + Arc::clone(&self.predicate_projection_schemas[predicate_idx]), + )?; + let predicate_filter = predicate.evaluate(predicate_batch)?; + + if predicate_filter.len() != input_rows { + return Err(general_err!( + "ArrowPredicate predicate returned {} rows, expected {input_rows}", + predicate_filter.len() + )); + } + + batch = filter_record_batch(&batch, &predicate_filter)?; + if batch.num_rows() == 0 { + break; + } + } + + Ok(project_record_batch( + &batch, + &self.output_projection_indices, + Arc::clone(&self.output_schema), + )?) + } +} + +#[derive(Debug)] +pub(super) struct PostSelectionFilterState { + mask: BooleanBuffer, + position: usize, +} + +impl PostSelectionFilterState { + pub(super) fn new(selection: RowSelection) -> Self { + Self { + mask: selection.boolean_mask(), + position: 0, + } + } + + pub(super) fn apply(&mut self, batch: RecordBatch) -> Result { + let input_rows = batch.num_rows(); + let end = self.position.saturating_add(input_rows); + if end > self.mask.len() { + return Err(general_err!( + "post-selection filter exceeded selection length: end {end}, selection length {}", + self.mask.len() + )); + } + + let filter = BooleanArray::from(self.mask.slice(self.position, input_rows)); + self.position = end; + Ok(filter_record_batch(&batch, &filter)?) 
+ } +} + +#[inline(always)] +fn project_record_batch( + batch: &RecordBatch, + indices: &[usize], + schema: SchemaRef, +) -> std::result::Result { + if indices.len() == batch.num_columns() && indices.iter().copied().eq(0..batch.num_columns()) { + debug_assert_eq!(batch.schema_ref().as_ref(), schema.as_ref()); + return Ok(batch.clone()); + } + + let columns = indices + .iter() + .map(|idx| { + batch.columns().get(*idx).cloned().ok_or_else(|| { + ArrowError::SchemaError(format!( + "project index {} out of bounds, max field {}", + idx, + batch.num_columns() + )) + }) + }) + .collect::, ArrowError>>()?; + + unsafe { + // The indices and schema are produced from the same valid read schema + // at construction time, and filtering preserves column lengths. + Ok(RecordBatch::new_unchecked( + schema, + columns, + batch.num_rows(), + )) + } +} + +fn projection_indices( + parquet_schema: &SchemaDescriptor, + read_projection: &ProjectionMask, + target_projection: &ProjectionMask, +) -> Result> { + let mut indices = Vec::new(); + let mut read_idx = 0; + + for leaf_idx in 0..parquet_schema.num_columns() { + if read_projection.leaf_included(leaf_idx) { + let root = parquet_schema.get_column_root(leaf_idx); + if !root.is_primitive() { + return Err(general_err!( + "post-filter fallback does not support nested read column {}", + root.name() + )); + } + if target_projection.leaf_included(leaf_idx) { + indices.push(read_idx); + } + read_idx += 1; + } else if target_projection.leaf_included(leaf_idx) { + return Err(general_err!( + "post-filter target projection includes leaf column {leaf_idx} not present in read projection" + )); + } + } + + Ok(indices) +} diff --git a/parquet/src/arrow/push_decoder/reader_builder/fallback.rs b/parquet/src/arrow/push_decoder/reader_builder/fallback.rs new file mode 100644 index 000000000000..9f2e13311779 --- /dev/null +++ b/parquet/src/arrow/push_decoder/reader_builder/fallback.rs @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) 
under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Runtime post-filter fallback decisions for push decoder row groups. + +use super::RowGroupReaderBuilder; +use crate::arrow::ProjectionMask; +use crate::arrow::arrow_reader::RowFilter; +use crate::arrow::arrow_reader::RowSelectionPolicy; +use crate::arrow::arrow_reader::selection::{ + FallbackObservation, FallbackTriggerReason, RowSelectionShape, RowSelectionStrategyDecision, +}; +use crate::arrow::schema::{ParquetField, ParquetFieldType}; + +#[allow(dead_code)] +#[derive(Debug)] +pub(super) enum RowGroupFallbackState { + Observing { observation: FallbackObservation }, + UsePushdown, + UsePostFilter { reason: FallbackTriggerReason }, +} + +impl Default for RowGroupFallbackState { + fn default() -> Self { + Self::Observing { + observation: FallbackObservation::default(), + } + } +} + +impl RowGroupReaderBuilder { + pub(super) fn should_use_post_filter_fallback(&self) -> bool { + matches!( + self.fallback_state, + RowGroupFallbackState::UsePostFilter { .. } + ) && self.post_filter_fallback_enabled + && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. 
}) + && self.limit.is_none() + && self.offset.is_none() + && !self.has_virtual_columns() + } + + pub(super) fn post_filter_read_projection(&self, filter: &RowFilter) -> Option { + if !self.should_use_post_filter_fallback() { + return None; + } + + self.build_post_filter_read_projection(filter) + } + + fn build_post_filter_read_projection(&self, filter: &RowFilter) -> Option { + let mut read_projection = self.projection.clone(); + read_projection.union(&filter.union_projection()?); + + if self.post_filter_supports_projection(&read_projection) { + Some(read_projection) + } else { + None + } + } + + fn post_filter_supports_projection(&self, projection: &ProjectionMask) -> bool { + let schema = self.metadata.file_metadata().schema_descr(); + (0..schema.num_columns()).all(|leaf_idx| { + !projection.leaf_included(leaf_idx) || schema.get_column_root(leaf_idx).is_primitive() + }) + } + + pub(super) fn observe_fallback_candidate( + &mut self, + decision: RowSelectionStrategyDecision, + row_count: usize, + ) { + if !matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. 
}) { + return; + } + + let RowGroupFallbackState::Observing { observation } = &mut self.fallback_state else { + return; + }; + + let mut shape = decision.shape; + if shape.total_rows() == 0 { + shape = RowSelectionShape { + selected_rows: row_count, + skipped_rows: 0, + selector_count: 1, + selected_run_count: 1, + skipped_run_count: 0, + }; + } + + observation.observed_row_groups += 1; + observation.shape.add_assign(shape); + self.metrics.record_fallback_observed_row_group(); + + let reason = observation.trigger_reason(); + if matches!(reason, FallbackTriggerReason::ObservationIncomplete) { + self.metrics.record_fallback_trigger(reason); + return; + } + + let should_fallback = observation.should_fallback(); + self.metrics.record_fallback_trigger(reason); + + if should_fallback && self.post_filter_fallback_supported() { + self.fallback_state = RowGroupFallbackState::UsePostFilter { reason }; + } else { + self.fallback_state = RowGroupFallbackState::UsePushdown; + } + } + + pub(super) fn post_filter_fallback_supported(&self) -> bool { + let Some(filter) = self.filter.as_ref() else { + return false; + }; + self.post_filter_fallback_enabled + && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) + && self.limit.is_none() + && self.offset.is_none() + && !self.has_virtual_columns() + && self.build_post_filter_read_projection(filter).is_some() + } + + fn has_virtual_columns(&self) -> bool { + self.fields + .as_deref() + .is_some_and(parquet_field_has_virtual_columns) + } +} + +fn parquet_field_has_virtual_columns(field: &ParquetField) -> bool { + match &field.field_type { + ParquetFieldType::Primitive { .. 
} => false, + ParquetFieldType::Group { children } => { + children.iter().any(parquet_field_has_virtual_columns) + } + ParquetFieldType::Virtual(_) => true, + } +} diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index 5acce23d0ac9..f446274f1b57 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -16,27 +16,33 @@ // under the License. mod data; +mod fallback; mod filter; +mod selection_policy; use crate::DecodeResult; use crate::arrow::ProjectionMask; use crate::arrow::array_reader::{ArrayReaderBuilder, CacheOptions, RowGroupCache}; use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; -use crate::arrow::arrow_reader::selection::{ - FallbackObservation, FallbackTriggerReason, LoadedRowRanges, RowGroupExecutionMode, - RowSelectionShape, RowSelectionStrategy, RowSelectionStrategyDecision, -}; +use crate::arrow::arrow_reader::selection::RowGroupExecutionMode; use crate::arrow::arrow_reader::{ ParquetRecordBatchReader, PredicateOptions, ReadPlanBuilder, RowFilter, RowSelection, RowSelectionPolicy, RowSelector, }; use crate::arrow::in_memory_row_group::ColumnChunkData; use crate::arrow::push_decoder::reader_builder::data::DataRequestBuilder; +use crate::arrow::push_decoder::reader_builder::fallback::RowGroupFallbackState; use crate::arrow::push_decoder::reader_builder::filter::CacheInfo; -use crate::arrow::schema::{ParquetField, ParquetFieldType}; -use crate::basic::Type as PhysicalType; +use crate::arrow::push_decoder::reader_builder::selection_policy::{ + ExpensiveOutputProfile, resolve_selection_policy_for_expensive_output, +}; +#[cfg(test)] +use crate::arrow::push_decoder::reader_builder::selection_policy::{ + loaded_ranges_for_projection, resolve_selection_policy_for_projection, +}; +use crate::arrow::schema::ParquetField; use crate::errors::ParquetError; -use crate::file::metadata::{ParquetMetaData, 
RowGroupMetaData}; +use crate::file::metadata::ParquetMetaData; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::util::push_buffers::PushBuffers; use bytes::Bytes; @@ -108,22 +114,6 @@ enum RowGroupDecoderState { Finished, } -#[allow(dead_code)] -#[derive(Debug)] -enum RowGroupFallbackState { - Observing { observation: FallbackObservation }, - UsePushdown, - UsePostFilter { reason: FallbackTriggerReason }, -} - -impl Default for RowGroupFallbackState { - fn default() -> Self { - Self::Observing { - observation: FallbackObservation::default(), - } - } -} - /// Result of a state transition #[derive(Debug)] struct NextState { @@ -1027,105 +1017,6 @@ impl RowGroupReaderBuilder { )) } - fn should_use_post_filter_fallback(&self) -> bool { - matches!( - self.fallback_state, - RowGroupFallbackState::UsePostFilter { .. } - ) && self.post_filter_fallback_enabled - && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) - && self.limit.is_none() - && self.offset.is_none() - && !self.has_virtual_columns() - } - - fn post_filter_read_projection(&self, filter: &RowFilter) -> Option { - if !self.should_use_post_filter_fallback() { - return None; - } - - self.build_post_filter_read_projection(filter) - } - - fn build_post_filter_read_projection(&self, filter: &RowFilter) -> Option { - let mut read_projection = self.projection.clone(); - read_projection.union(&filter.union_projection()?); - - if self.post_filter_supports_projection(&read_projection) { - Some(read_projection) - } else { - None - } - } - - fn post_filter_supports_projection(&self, projection: &ProjectionMask) -> bool { - let schema = self.metadata.file_metadata().schema_descr(); - (0..schema.num_columns()).all(|leaf_idx| { - !projection.leaf_included(leaf_idx) || schema.get_column_root(leaf_idx).is_primitive() - }) - } - - fn observe_fallback_candidate( - &mut self, - decision: RowSelectionStrategyDecision, - row_count: usize, - ) { - if !matches!(self.row_selection_policy, 
RowSelectionPolicy::Auto { .. }) { - return; - } - - let RowGroupFallbackState::Observing { observation } = &mut self.fallback_state else { - return; - }; - - let mut shape = decision.shape; - if shape.total_rows() == 0 { - shape = RowSelectionShape { - selected_rows: row_count, - skipped_rows: 0, - selector_count: 1, - selected_run_count: 1, - skipped_run_count: 0, - }; - } - - observation.observed_row_groups += 1; - observation.shape.add_assign(shape); - self.metrics.record_fallback_observed_row_group(); - - let reason = observation.trigger_reason(); - if matches!(reason, FallbackTriggerReason::ObservationIncomplete) { - self.metrics.record_fallback_trigger(reason); - return; - } - - let should_fallback = observation.should_fallback(); - self.metrics.record_fallback_trigger(reason); - - if should_fallback && self.post_filter_fallback_supported() { - self.fallback_state = RowGroupFallbackState::UsePostFilter { reason }; - } else { - self.fallback_state = RowGroupFallbackState::UsePushdown; - } - } - - fn post_filter_fallback_supported(&self) -> bool { - let Some(filter) = self.filter.as_ref() else { - return false; - }; - self.post_filter_fallback_enabled - && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) - && self.limit.is_none() - && self.offset.is_none() - && !self.has_virtual_columns() - && self.build_post_filter_read_projection(filter).is_some() - } - - fn has_virtual_columns(&self) -> bool { - self.fields - .as_deref() - .is_some_and(parquet_field_has_virtual_columns) - } - /// Which columns should be cached? /// /// Returns the columns that are used by the filters *and* then used in the @@ -1166,192 +1057,10 @@ impl RowGroupReaderBuilder { } } -fn parquet_field_has_virtual_columns(field: &ParquetField) -> bool { - match &field.field_type { - ParquetFieldType::Primitive { .. 
} => false, - ParquetFieldType::Group { children } => { - children.iter().any(parquet_field_has_virtual_columns) - } - ParquetFieldType::Virtual(_) => true, - } -} - -#[cfg(test)] -fn resolve_selection_policy_for_projection( - plan_builder: ReadPlanBuilder, - projection_mask: &ProjectionMask, - offset_index: Option<&[OffsetIndexMetaData]>, - total_rows: usize, -) -> ReadPlanBuilder { - resolve_selection_policy_for_expensive_output( - plan_builder, - projection_mask, - offset_index, - total_rows, - ExpensiveOutputProfile::default(), - ) -} - -fn resolve_selection_policy_for_expensive_output( - plan_builder: ReadPlanBuilder, - projection_mask: &ProjectionMask, - offset_index: Option<&[OffsetIndexMetaData]>, - total_rows: usize, - output_profile: ExpensiveOutputProfile, -) -> ReadPlanBuilder { - let loaded = loaded_ranges_for_projection( - plan_builder.selection(), - projection_mask, - offset_index, - total_rows, - ); - let loaded_is_sparse = loaded.as_ref().is_some_and(LoadedRowRanges::is_sparse); - let sparse_loaded = loaded.filter(LoadedRowRanges::is_sparse); - - match plan_builder.row_selection_policy() { - RowSelectionPolicy::Auto { .. 
} => { - let decision = plan_builder.resolve_selection_strategy_decision(); - match decision.strategy { - RowSelectionStrategy::Mask - if loaded_is_sparse - || should_prefer_selectors_for_expensive_output( - decision.shape, - output_profile, - ) => - { - plan_builder.with_row_selection_policy(RowSelectionPolicy::Selectors) - } - RowSelectionStrategy::Mask => { - plan_builder.with_row_selection_policy(RowSelectionPolicy::Mask) - } - RowSelectionStrategy::Selectors => { - plan_builder.with_row_selection_policy(RowSelectionPolicy::Selectors) - } - } - } - RowSelectionPolicy::Mask => plan_builder.with_loaded_row_ranges(sparse_loaded), - RowSelectionPolicy::Selectors => plan_builder, - } -} - -#[derive(Clone, Copy, Debug, Default)] -struct ExpensiveOutputProfile { - variable_width_columns: usize, - uncompressed_bytes_per_row: f64, -} - -impl ExpensiveOutputProfile { - fn from_row_group( - row_group: &RowGroupMetaData, - projection_mask: &ProjectionMask, - total_rows: usize, - ) -> Self { - if total_rows == 0 { - return Self::default(); - } - - let mut variable_width_columns = 0; - let mut uncompressed_bytes = 0u64; - for leaf_idx in 0..row_group.num_columns() { - if !projection_mask.leaf_included(leaf_idx) { - continue; - } - - let column = row_group.column(leaf_idx); - if column.column_type() == PhysicalType::BYTE_ARRAY { - variable_width_columns += 1; - } - uncompressed_bytes += column.uncompressed_size().max(0) as u64; - } - - Self { - variable_width_columns, - uncompressed_bytes_per_row: uncompressed_bytes as f64 / total_rows as f64, - } - } -} - -fn should_prefer_selectors_for_expensive_output( - shape: RowSelectionShape, - output_profile: ExpensiveOutputProfile, -) -> bool { - let selected_ratio = shape.selected_ratio(); - output_profile.variable_width_columns > 0 - && output_profile.uncompressed_bytes_per_row >= 16.0 - && selected_ratio > 0.0 - && selected_ratio < 0.10 - && shape.average_selected_run_length() <= 4.0 -} - -fn loaded_ranges_for_projection( - 
selection: Option<&RowSelection>, - projection_mask: &ProjectionMask, - offset_index: Option<&[OffsetIndexMetaData]>, - total_rows: usize, -) -> Option { - let selection = selection?; - let columns = offset_index?; - let mut ranges: Option>> = None; - - for (leaf_idx, column) in columns.iter().enumerate() { - if !projection_mask.leaf_included(leaf_idx) { - continue; - } - let column_ranges = selection.selected_page_row_ranges(column.page_locations(), total_rows); - ranges = Some(match ranges { - Some(existing) => intersect_ranges(existing, column_ranges), - None => column_ranges, - }); - } - - ranges.map(|ranges| LoadedRowRanges::new(coalesce_adjacent_ranges(ranges), total_rows)) -} - -fn intersect_ranges(left: Vec>, right: Vec>) -> Vec> { - let mut out = Vec::new(); - let mut left_idx = 0; - let mut right_idx = 0; - - while left_idx < left.len() && right_idx < right.len() { - let l = &left[left_idx]; - let r = &right[right_idx]; - let start = l.start.max(r.start); - let end = l.end.min(r.end); - - if start < end { - out.push(start..end); - } - - if l.end <= r.end { - left_idx += 1; - } else { - right_idx += 1; - } - } - - out -} - -fn coalesce_adjacent_ranges(ranges: Vec>) -> Vec> { - let mut out: Vec> = Vec::with_capacity(ranges.len()); - for range in ranges { - if range.is_empty() { - continue; - } - if let Some(last) = out.last_mut() { - if last.end == range.start { - last.end = range.end; - continue; - } - } - out.push(range); - } - out -} - #[cfg(test)] mod tests { use super::*; + use crate::arrow::arrow_reader::selection::LoadedRowRanges; use crate::arrow::arrow_reader::{RowSelection, RowSelectionCursor, RowSelector}; use crate::file::page_index::offset_index::PageLocation; diff --git a/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs b/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs new file mode 100644 index 000000000000..90510ed8f240 --- /dev/null +++ b/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs @@ 
-0,0 +1,202 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Row-selection policy resolution for push decoder read plans. + +use crate::arrow::ProjectionMask; +use crate::arrow::arrow_reader::selection::{ + LoadedRowRanges, RowSelectionShape, RowSelectionStrategy, +}; +use crate::arrow::arrow_reader::{ReadPlanBuilder, RowSelection, RowSelectionPolicy}; +use crate::basic::Type as PhysicalType; +use crate::file::metadata::RowGroupMetaData; +use crate::file::page_index::offset_index::OffsetIndexMetaData; +use std::ops::Range; + +#[cfg(test)] +pub(super) fn resolve_selection_policy_for_projection( + plan_builder: ReadPlanBuilder, + projection_mask: &ProjectionMask, + offset_index: Option<&[OffsetIndexMetaData]>, + total_rows: usize, +) -> ReadPlanBuilder { + resolve_selection_policy_for_expensive_output( + plan_builder, + projection_mask, + offset_index, + total_rows, + ExpensiveOutputProfile::default(), + ) +} + +pub(super) fn resolve_selection_policy_for_expensive_output( + plan_builder: ReadPlanBuilder, + projection_mask: &ProjectionMask, + offset_index: Option<&[OffsetIndexMetaData]>, + total_rows: usize, + output_profile: ExpensiveOutputProfile, +) -> ReadPlanBuilder { + let loaded = 
loaded_ranges_for_projection( + plan_builder.selection(), + projection_mask, + offset_index, + total_rows, + ); + let loaded_is_sparse = loaded.as_ref().is_some_and(LoadedRowRanges::is_sparse); + let sparse_loaded = loaded.filter(LoadedRowRanges::is_sparse); + + match plan_builder.row_selection_policy() { + RowSelectionPolicy::Auto { .. } => { + let decision = plan_builder.resolve_selection_strategy_decision(); + match decision.strategy { + RowSelectionStrategy::Mask + if loaded_is_sparse + || should_prefer_selectors_for_expensive_output( + decision.shape, + output_profile, + ) => + { + plan_builder.with_row_selection_policy(RowSelectionPolicy::Selectors) + } + RowSelectionStrategy::Mask => { + plan_builder.with_row_selection_policy(RowSelectionPolicy::Mask) + } + RowSelectionStrategy::Selectors => { + plan_builder.with_row_selection_policy(RowSelectionPolicy::Selectors) + } + } + } + RowSelectionPolicy::Mask => plan_builder.with_loaded_row_ranges(sparse_loaded), + RowSelectionPolicy::Selectors => plan_builder, + } +} + +#[derive(Clone, Copy, Debug, Default)] +pub(super) struct ExpensiveOutputProfile { + pub(super) variable_width_columns: usize, + pub(super) uncompressed_bytes_per_row: f64, +} + +impl ExpensiveOutputProfile { + pub(super) fn from_row_group( + row_group: &RowGroupMetaData, + projection_mask: &ProjectionMask, + total_rows: usize, + ) -> Self { + if total_rows == 0 { + return Self::default(); + } + + let mut variable_width_columns = 0; + let mut uncompressed_bytes = 0u64; + for leaf_idx in 0..row_group.num_columns() { + if !projection_mask.leaf_included(leaf_idx) { + continue; + } + + let column = row_group.column(leaf_idx); + if column.column_type() == PhysicalType::BYTE_ARRAY { + variable_width_columns += 1; + } + uncompressed_bytes += column.uncompressed_size().max(0) as u64; + } + + Self { + variable_width_columns, + uncompressed_bytes_per_row: uncompressed_bytes as f64 / total_rows as f64, + } + } +} + +fn 
should_prefer_selectors_for_expensive_output( + shape: RowSelectionShape, + output_profile: ExpensiveOutputProfile, +) -> bool { + let selected_ratio = shape.selected_ratio(); + output_profile.variable_width_columns > 0 + && output_profile.uncompressed_bytes_per_row >= 16.0 + && selected_ratio > 0.0 + && selected_ratio < 0.10 + && shape.average_selected_run_length() <= 4.0 +} + +#[cfg_attr(test, allow(dead_code))] +pub(super) fn loaded_ranges_for_projection( + selection: Option<&RowSelection>, + projection_mask: &ProjectionMask, + offset_index: Option<&[OffsetIndexMetaData]>, + total_rows: usize, +) -> Option { + let selection = selection?; + let columns = offset_index?; + let mut ranges: Option>> = None; + + for (leaf_idx, column) in columns.iter().enumerate() { + if !projection_mask.leaf_included(leaf_idx) { + continue; + } + let column_ranges = selection.selected_page_row_ranges(column.page_locations(), total_rows); + ranges = Some(match ranges { + Some(existing) => intersect_ranges(existing, column_ranges), + None => column_ranges, + }); + } + + ranges.map(|ranges| LoadedRowRanges::new(coalesce_adjacent_ranges(ranges), total_rows)) +} + +fn intersect_ranges(left: Vec>, right: Vec>) -> Vec> { + let mut out = Vec::new(); + let mut left_idx = 0; + let mut right_idx = 0; + + while left_idx < left.len() && right_idx < right.len() { + let l = &left[left_idx]; + let r = &right[right_idx]; + let start = l.start.max(r.start); + let end = l.end.min(r.end); + + if start < end { + out.push(start..end); + } + + if l.end <= r.end { + left_idx += 1; + } else { + right_idx += 1; + } + } + + out +} + +fn coalesce_adjacent_ranges(ranges: Vec>) -> Vec> { + let mut out: Vec> = Vec::with_capacity(ranges.len()); + for range in ranges { + if range.is_empty() { + continue; + } + if let Some(last) = out.last_mut() { + if last.end == range.start { + last.end = range.end; + continue; + } + } + out.push(range); + } + out +} From 5db1ea7467457b23669836d57c9e37bf2c3cb2e3 Mon Sep 17 00:00:00 
2001 From: Qiwei Huang Date: Sun, 10 May 2026 22:56:14 +0800 Subject: [PATCH 06/32] docs(parquet): explain row filter fallback design --- parquet/src/arrow/arrow_reader/post_filter.rs | 39 ++++++++++++++++ parquet/src/arrow/arrow_reader/selection.rs | 38 ++++++++++++++++ .../push_decoder/reader_builder/fallback.rs | 44 +++++++++++++++++++ .../reader_builder/selection_policy.rs | 38 ++++++++++++++++ 4 files changed, 159 insertions(+) diff --git a/parquet/src/arrow/arrow_reader/post_filter.rs b/parquet/src/arrow/arrow_reader/post_filter.rs index 913d73275884..7e533703c4e7 100644 --- a/parquet/src/arrow/arrow_reader/post_filter.rs +++ b/parquet/src/arrow/arrow_reader/post_filter.rs @@ -16,6 +16,27 @@ // under the License. //! Post-decode filtering support for parquet row-filter fallback. +//! +//! Normal predicate pushdown decodes predicate columns first, builds a +//! `RowSelection`, and then decodes output columns for selected rows. The +//! fallback path in this module instead decodes the union of predicate and +//! output columns once and applies predicates after decode. +//! +//! ```text +//! read projection = output columns UNION predicate columns +//! | +//! v +//! decode RecordBatch +//! | +//! +-- predicate 1 --> filter batch +//! +-- predicate 2 --> filter batch +//! | +//! v +//! project original output columns +//! ``` +//! +//! This is profitable for shapes where row-level pushdown has high overhead +//! and little pruning, especially fragmented high-selectivity selections. use crate::arrow::ProjectionMask; use crate::arrow::arrow_reader::{RowFilter, RowSelection}; @@ -44,6 +65,9 @@ impl PostFilterState { read_projection: &ProjectionMask, output_projection: &ProjectionMask, ) -> Result { + // Projection indices are computed once when constructing the reader. + // Each predicate sees only the columns it requested, while the caller + // receives only the original output projection after all predicates run. 
let filter_guard = filter.lock().map_err(|_| { ParquetError::General("post-filter predicate state was poisoned".to_string()) })?; @@ -80,6 +104,9 @@ impl PostFilterState { ParquetError::General("post-filter predicate state was poisoned".to_string()) })?; + // Apply predicates in the same order as RowFilter pushdown. Each + // predicate is evaluated against the currently surviving rows, so later + // predicates do not do work for rows already rejected by earlier ones. for (predicate_idx, (predicate, projection_indices)) in filter .predicates .iter_mut() @@ -130,6 +157,10 @@ impl PostSelectionFilterState { } pub(super) fn apply(&mut self, batch: RecordBatch) -> Result { + // This path is not predicate post-filtering. It is used after pushdown + // has already computed a final RowSelection for the current row group, + // but fallback chooses to decode the base selection and apply that + // already-computed selection after decode. let input_rows = batch.num_rows(); let end = self.position.saturating_add(input_rows); if end > self.mask.len() { @@ -185,6 +216,14 @@ fn projection_indices( read_projection: &ProjectionMask, target_projection: &ProjectionMask, ) -> Result> { + // Convert parquet leaf positions to RecordBatch column positions after the + // larger read projection has been decoded. For example: + // + // ```text + // parquet leaves: a b c d + // read projection: a c d => batch columns [a, c, d] + // target: c => target index [1] + // ``` let mut indices = Vec::new(); let mut read_idx = 0; diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index de3c453d2d3d..72c44e169b4b 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -175,14 +175,31 @@ impl RowSelectionShape { #[allow(dead_code)] #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub(crate) enum FallbackTriggerReason { + /// Predicate pushdown kept almost everything and did not produce useful pruning. 
HighSelectivityNoPruning, + /// Fragmented runs with moderate selectivity often pay many small skip/read costs. FragmentedModerateSelectivity, + /// Fragmented runs with high selectivity usually decode most rows plus pay pushdown overhead. FragmentedHighSelectivity, + /// Not enough row groups have been observed to classify the scan. ObservationIncomplete, + /// The observed shape still looks suitable for predicate pushdown. PushdownStillPreferred, + /// The caller forced a concrete row-selection policy. ForcedPolicy, } +/// Aggregate row-selection shape observed while deciding whether Auto should +/// continue predicate pushdown or fall back to post-filter execution. +/// +/// The classifier looks for shapes where row-level pushdown is unlikely to +/// recover its own overhead: +/// +/// ```text +/// no skipped rows -> predicate did not prune +/// tiny selected runs + many runs -> fragmented skip/read pattern +/// high selected ratio -> most output rows are decoded anyway +/// ``` #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub(crate) struct FallbackObservation { pub(crate) observed_row_groups: usize, @@ -1145,12 +1162,18 @@ pub struct MaskChunk { #[derive(Debug, Clone, Eq, PartialEq)] pub(crate) struct LoadedRowRanges { + /// Absolute row-group ranges for which all projected columns have backing + /// page data loaded in memory. ranges: Vec>, + /// Total row count of the row group the ranges are relative to. total_rows: usize, } impl LoadedRowRanges { pub(crate) fn new(ranges: Vec>, total_rows: usize) -> Self { + // Sparse-mask execution indexes masks by absolute row-group position. + // Keep loaded ranges sorted and non-overlapping so range containment is + // unambiguous and the reader can move forward without rewinding. debug_assert!( ranges .windows(2) @@ -1183,8 +1206,11 @@ impl LoadedRowRanges { #[derive(Debug, Clone, Eq, PartialEq)] pub(crate) struct MaskSegment { + /// Absolute row-group range to decode from the array reader. 
pub row_range: Range, + /// Starting bit in the absolute row-group mask for this segment. pub mask_start: usize, + /// Number of mask bits to apply to `row_range`. pub mask_len: usize, } @@ -1196,8 +1222,11 @@ pub(crate) struct SparseMaskChunk { #[derive(Debug)] pub(crate) struct SparseMaskCursor { + /// Boolean mask indexed by absolute row-group position. mask: BooleanBuffer, + /// Absolute row ranges whose data pages are present for the projection. loaded: LoadedRowRanges, + /// Current absolute row-group position in `mask`. position: usize, } @@ -1250,11 +1279,20 @@ impl SparseMaskCursor { } let Some(loaded) = self.loaded.range_containing(cursor) else { + // A selected row outside loaded ranges means the read plan asks + // Mask to materialize a row whose page data was pruned away. + // Returning an internal error is safer than silently producing + // incorrect rows. return Err(ParquetError::General(format!( "Internal Error: sparse mask selected row {cursor} outside loaded row ranges" ))); }; + // Build the largest contiguous selected segment that stays within + // the current loaded range and does not exceed the output batch + // size. The record batch reader will skip to `row_range.start`, + // read exactly `row_range.len()` rows, and then apply this mask + // slice to the decoded batch. let segment_start = cursor; let mut segment_end = cursor; while segment_end < loaded.end diff --git a/parquet/src/arrow/push_decoder/reader_builder/fallback.rs b/parquet/src/arrow/push_decoder/reader_builder/fallback.rs index 9f2e13311779..ed4b3cd1d540 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/fallback.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/fallback.rs @@ -16,6 +16,26 @@ // under the License. //! Runtime post-filter fallback decisions for push decoder row groups. +//! +//! The fallback is intentionally adaptive rather than purely static. The first +//! eligible row group is evaluated with predicate pushdown so the reader can +//! 
observe the actual `RowSelection` shape produced by the predicate chain. +//! Later row groups may then switch to post-filter execution if the observed +//! shape suggests pushdown is doing extra work without pruning enough rows. +//! +//! ```text +//! Start +//! | +//! v +//! Observing -- incomplete observation --> Observing +//! | +//! +-- pushdown still preferred ------> UsePushdown +//! | +//! +-- fallback trigger + supported --> UsePostFilter +//! ``` +//! +//! Fallback only applies to `Auto`. Explicit `Mask` and `Selectors` are treated +//! as user intent and are not overridden here. use super::RowGroupReaderBuilder; use crate::arrow::ProjectionMask; @@ -29,8 +49,11 @@ use crate::arrow::schema::{ParquetField, ParquetFieldType}; #[allow(dead_code)] #[derive(Debug)] pub(super) enum RowGroupFallbackState { + /// Collect row-selection shape from early row groups before choosing a mode. Observing { observation: FallbackObservation }, + /// Predicate pushdown remains the execution mode for this reader. UsePushdown, + /// Later row groups should decode once and evaluate predicates after decode. UsePostFilter { reason: FallbackTriggerReason }, } @@ -44,6 +67,13 @@ impl Default for RowGroupFallbackState { impl RowGroupReaderBuilder { pub(super) fn should_use_post_filter_fallback(&self) -> bool { + // Keep the runtime switch narrow: + // + // * `Auto` means the caller allowed the reader to choose. + // * `limit` and `offset` are applied during row-group planning; moving + // predicates after decode changes where short-circuiting can happen. + // * virtual columns are not read from Parquet pages and need their + // existing projection path. matches!( self.fallback_state, RowGroupFallbackState::UsePostFilter { .. 
} @@ -63,6 +93,13 @@ impl RowGroupReaderBuilder { } fn build_post_filter_read_projection(&self, filter: &RowFilter) -> Option { + // Post-filter execution decodes each row once, so it needs both: + // + // * output columns, which will be returned to the caller + // * predicate columns, which are needed to evaluate the RowFilter + // + // The final reader projects back to the original output projection + // after predicate evaluation. let mut read_projection = self.projection.clone(); read_projection.union(&filter.union_projection()?); @@ -74,6 +111,10 @@ impl RowGroupReaderBuilder { } fn post_filter_supports_projection(&self, projection: &ProjectionMask) -> bool { + // The post-filter reader currently projects record batches by parquet + // leaf column position. Nested roots can span multiple leaves and need + // the existing array-reader projection machinery, so keep fallback to + // primitive roots only. let schema = self.metadata.file_metadata().schema_descr(); (0..schema.num_columns()).all(|leaf_idx| { !projection.leaf_included(leaf_idx) || schema.get_column_root(leaf_idx).is_primitive() @@ -95,6 +136,9 @@ impl RowGroupReaderBuilder { let mut shape = decision.shape; if shape.total_rows() == 0 { + // `None` selection means the predicate kept the whole row group. + // Represent it as one selected run so the fallback classifier can + // treat "no pruning" as an observed high-selectivity case. shape = RowSelectionShape { selected_rows: row_count, skipped_rows: 0, diff --git a/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs b/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs index 90510ed8f240..a17b6d320741 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs @@ -16,6 +16,26 @@ // under the License. //! Row-selection policy resolution for push decoder read plans. +//! +//! This module is the final safety gate between the high-level +//! 
`RowSelectionPolicy` requested by the caller and the concrete cursor used by +//! the record batch reader. It handles two independent concerns: +//! +//! ```text +//! Caller policy Selection/page shape Resolved plan +//! ------------------------------------------------------------------------------- +//! Auto dense, short/fragmented runs Mask +//! Auto sparse page-loaded ranges Selectors +//! Auto expensive variable-width sparse output Selectors +//! Mask dense page-loaded ranges dense Mask +//! Mask sparse page-loaded ranges SparseMaskCursor +//! Selectors any shape Selectors +//! ``` +//! +//! The distinction between `Auto` and explicit `Mask` matters. `Auto` may +//! choose selectors to avoid a bad strategy. Explicit `Mask` must be honored, +//! so sparse page-loaded data is represented explicitly instead of being +//! silently converted to selectors. use crate::arrow::ProjectionMask; use crate::arrow::arrow_reader::selection::{ @@ -50,6 +70,10 @@ pub(super) fn resolve_selection_policy_for_expensive_output( total_rows: usize, output_profile: ExpensiveOutputProfile, ) -> ReadPlanBuilder { + // Page pruning can load only the pages that intersect selected rows. If the + // projected columns have sparse loaded ranges, a dense mask would try to + // decode rows for pages that are not present. Auto avoids that by choosing + // selectors; explicit Mask carries the sparse ranges to the reader. let loaded = loaded_ranges_for_projection( plan_builder.selection(), projection_mask, @@ -126,6 +150,10 @@ fn should_prefer_selectors_for_expensive_output( shape: RowSelectionShape, output_profile: ExpensiveOutputProfile, ) -> bool { + // Sparse, low-selectivity output over variable-width columns can be worse + // with masks because masks decode and then filter many values that selectors + // can skip. This is intentionally narrow; most fragmented selections remain + // good candidates for masks. 
let selected_ratio = shape.selected_ratio(); output_profile.variable_width_columns > 0 && output_profile.uncompressed_bytes_per_row >= 16.0 @@ -141,6 +169,16 @@ pub(super) fn loaded_ranges_for_projection( offset_index: Option<&[OffsetIndexMetaData]>, total_rows: usize, ) -> Option { + // Loaded ranges are row ranges backed by page data for all projected + // columns. When projections include multiple columns, a row is safe for + // sparse-mask decoding only if every projected column loaded the page that + // contains that row. Therefore projected-column ranges are intersected. + // + // ```text + // column A pages loaded: [0..50) [80..100) + // column B pages loaded: [20..70) [80..100) + // usable loaded ranges: [20..50) [80..100) + // ``` let selection = selection?; let columns = offset_index?; let mut ranges: Option>> = None; From a80921575c36dab342796ea290f25f534746a754 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sat, 16 May 2026 18:45:01 +0800 Subject: [PATCH 07/32] docs: design row filter fallback readability refactor --- ...-row-filter-fallback-readability-design.md | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-16-row-filter-fallback-readability-design.md diff --git a/docs/superpowers/specs/2026-05-16-row-filter-fallback-readability-design.md b/docs/superpowers/specs/2026-05-16-row-filter-fallback-readability-design.md new file mode 100644 index 000000000000..0c55fe3a67c9 --- /dev/null +++ b/docs/superpowers/specs/2026-05-16-row-filter-fallback-readability-design.md @@ -0,0 +1,107 @@ +# Row Filter Fallback Readability Refactor Design + +## Context + +The current branch adds adaptive post-filter fallback for the async Parquet row-filter path. The core design is already split into focused modules: + +- `arrow_reader/post_filter.rs` owns post-decode filtering. +- `push_decoder/reader_builder/fallback.rs` owns fallback eligibility and observation. 
+- `push_decoder/reader_builder/selection_policy.rs` owns row-selection policy resolution. + +The remaining readability issue is in `push_decoder/reader_builder/mod.rs`, especially the `WaitingOnData` branch. That branch currently mixes state-machine progression with fallback observation, metrics recording, post-selection handoff, and post-filter state initialization. + +## Goal + +Reduce the cognitive load in `push_decoder/reader_builder/mod.rs` without changing runtime behavior. + +This refactor should make the row-group state machine read in a high-level order: + +1. Resolve any fallback transition. +2. Return early if the current row group should switch to post-selection filtering. +3. Wait for missing data if necessary. +4. Build the normal pushdown reader. + +## Non-Goals + +- Do not change fallback heuristics or thresholds. +- Do not change metrics semantics or counter names. +- Do not change benchmark behavior. +- Do not move the full fallback implementation into a new module in this step. +- Do not broaden fallback eligibility. + +## Proposed Design + +Add a small private transition enum near `RowGroupReaderBuilder`: + +```rust +enum FallbackTransition { + ContinuePushdown, + StartPostSelection { + selection: RowSelection, + }, + EnablePostFilter, +} +``` + +Add two private helper methods on `RowGroupReaderBuilder`: + +```rust +fn resolve_fallback_transition( + &mut self, + row_group_info: &RowGroupInfo, + cache_info: Option<&CacheInfo>, +) -> Result +``` + +```rust +fn ensure_post_filter_state(&mut self) -> Result<(), ParquetError> +``` + +`resolve_fallback_transition` owns the current fallback block from the `WaitingOnData` branch: + +- Check whether fallback observation is active and supported. +- Resolve the `RowSelectionStrategyDecision`. +- Capture the observed pushdown selection. +- Call `observe_fallback_candidate`. +- If fallback switches on and there is no base selection, return `StartPostSelection`. 
+- If fallback switches on and there is a base selection, initialize post-filter state and return `EnablePostFilter`. +- Otherwise record the pushdown fallback metric and return `ContinuePushdown`. + +The helper should not consume `data_request` or `cache_info`. If it returns `StartPostSelection`, the caller remains responsible for moving the existing values into: + +```rust +self.start_post_selection_filter( + row_group_info, + selection, + cache_info, + data_request.into_dense_column_chunks(), +) +``` + +`ensure_post_filter_state` owns only the state transition from `self.filter` into `self.post_filter`. This keeps the main state-machine branch from handling predicate-state ownership directly. + +## Naming and Comments + +While extracting the helper, improve local names without broad churn: + +- Rename `fallback_selection` to `observed_selection` or `pushdown_selection`. +- Clarify that the `post_filter` field stores predicate state reused by later row groups after fallback selects post-filter execution. +- Add a short comment before `StartPostSelection` explaining that the current row group already computed a selection, so it applies that selection after decode instead of re-running predicates. + +## Expected Result + +The `WaitingOnData` branch should become shorter and easier to scan. Fallback details should be readable in one helper, while the main branch remains focused on the row-group state machine. + +The refactor should preserve the existing module structure and keep the diff reviewable. + +## Verification + +Run these commands after implementation: + +```bash +cargo test -p parquet --lib arrow::push_decoder::reader_builder::tests +cargo test -p parquet --test arrow_reader --features arrow,async row_filter::async +cargo fmt --all --check +``` + +If the second command is too broad or the local test target names differ, run the closest existing async row-filter integration test command and report the exact command used. 
From f747e49f797ca80954df5457cfee2cf118ab25b7 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sat, 16 May 2026 20:25:52 +0800 Subject: [PATCH 08/32] refactor(parquet): clarify row filter fallback transition --- ...6-05-16-row-filter-fallback-readability.md | 308 ++++++++++++++++++ .../arrow/push_decoder/reader_builder/mod.rs | 128 +++++--- 2 files changed, 384 insertions(+), 52 deletions(-) create mode 100644 docs/superpowers/plans/2026-05-16-row-filter-fallback-readability.md diff --git a/docs/superpowers/plans/2026-05-16-row-filter-fallback-readability.md b/docs/superpowers/plans/2026-05-16-row-filter-fallback-readability.md new file mode 100644 index 000000000000..8afeb0a60711 --- /dev/null +++ b/docs/superpowers/plans/2026-05-16-row-filter-fallback-readability.md @@ -0,0 +1,308 @@ +# Row Filter Fallback Readability Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Reduce the cognitive load in the async Parquet row-filter fallback state machine without changing behavior. + +**Architecture:** Keep the existing module layout. Add a small private transition enum and two private helper methods inside `push_decoder/reader_builder/mod.rs`, then replace the inline fallback block in the `WaitingOnData` state with a short high-level match. + +**Tech Stack:** Rust, Apache Arrow Rust Parquet reader internals, existing `cargo test` and `cargo fmt`. + +--- + +## File Structure + +- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs` + - Add `FallbackTransition`. + - Clarify the `post_filter` field comment. + - Add `ensure_post_filter_state`. + - Add `resolve_fallback_transition`. + - Replace the inline fallback block in `RowGroupDecoderState::WaitingOnData`. +- No new production modules. 
+- No new tests required because this is behavior-preserving refactoring; existing unit and async row-filter tests cover the current behavior. + +## Task 1: Add Transition Type and Post-Filter State Helper + +**Files:** +- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:54-62` +- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:272-276` +- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:352-356` + +- [ ] **Step 1: Add `FallbackTransition` after `RowGroupInfo`** + +Insert this enum immediately after `RowGroupInfo`: + +```rust +enum FallbackTransition { + ContinuePushdown, + StartPostSelection { selection: RowSelection }, + EnablePostFilter, +} +``` + +- [ ] **Step 2: Clarify the `post_filter` field comment** + +Replace the existing field comment: + +```rust +/// Shared filter state used once Auto fallback switches to post-filter. +post_filter: Option>>, +``` + +with: + +```rust +/// Predicate state reused by later row groups once Auto fallback switches to post-filter. +post_filter: Option>>, +``` + +- [ ] **Step 3: Add `ensure_post_filter_state`** + +Add this private method in `impl RowGroupReaderBuilder`, near `disable_post_filter_fallback`: + +```rust +fn ensure_post_filter_state(&mut self) -> Result<(), ParquetError> { + if self.post_filter.is_some() { + return Ok(()); + } + + let filter = self.filter.take().ok_or_else(|| { + ParquetError::General("post-filter fallback selected without a row filter".to_string()) + })?; + self.post_filter = Some(Arc::new(Mutex::new(filter))); + Ok(()) +} +``` + +- [ ] **Step 4: Run focused formatting check** + +Run: + +```bash +cargo fmt --all --check +``` + +Expected: this may fail until all refactor steps are complete. If it fails only because the new code needs formatting, continue and run `cargo fmt --all` after Task 3. 
+ +## Task 2: Extract Fallback Transition Resolution + +**Files:** +- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:358-378` +- Reads existing logic at: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:892-943` + +- [ ] **Step 1: Add `resolve_fallback_transition`** + +Add this private method in `impl RowGroupReaderBuilder`, after `ensure_post_filter_state`: + +```rust +fn resolve_fallback_transition( + &mut self, + row_group_info: &RowGroupInfo, + cache_info: Option<&CacheInfo>, +) -> Result { + if cache_info.is_none() + || !matches!(self.fallback_state, RowGroupFallbackState::Observing { .. }) + || !self.post_filter_fallback_supported(row_group_info.budget) + { + return Ok(FallbackTransition::ContinuePushdown); + } + + let decision = row_group_info + .plan_builder + .resolve_selection_strategy_decision(); + let observed_selection = row_group_info.plan_builder.selection().cloned(); + + self.observe_fallback_candidate(decision, row_group_info.row_count, row_group_info.budget); + + if matches!( + self.fallback_state, + RowGroupFallbackState::UsePostFilter { .. } + ) { + if row_group_info.base_selection.is_none() { + let selection = observed_selection.unwrap_or_else(|| { + RowSelection::from(vec![RowSelector::select(row_group_info.row_count)]) + }); + return Ok(FallbackTransition::StartPostSelection { selection }); + } + + self.ensure_post_filter_state()?; + self.metrics + .record_fallback_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); + return Ok(FallbackTransition::EnablePostFilter); + } + + self.metrics + .record_fallback_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); + Ok(FallbackTransition::ContinuePushdown) +} +``` + +- [ ] **Step 2: Verify behavior preservation in the helper** + +Check these details manually before moving on: + +- `StartPostSelection` returns before recording `fallback_pushdown_row_group_count`, matching the old early return. 
+- `EnablePostFilter` records `Pushdown(decision.strategy)`, matching the old fall-through behavior. +- `ContinuePushdown` records the same metric only after an observation was made. +- The helper does not consume `data_request` or `cache_info`. + +## Task 3: Replace the Inline `WaitingOnData` Fallback Block + +**Files:** +- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:887-945` + +- [ ] **Step 1: Replace the inline fallback block** + +Replace the entire initial fallback-observation block at the top of the `WaitingOnData` arm. It starts with: + +```rust +if cache_info.is_some() + && matches!(self.fallback_state, RowGroupFallbackState::Observing { .. }) + && self.post_filter_fallback_supported(row_group_info.budget) +{ +} +``` + +and ends immediately before: + +```rust +let needed_ranges = data_request.needed_ranges(&self.buffers); +``` + +with: + +```rust +match self.resolve_fallback_transition(&row_group_info, cache_info.as_ref())? { + FallbackTransition::ContinuePushdown | FallbackTransition::EnablePostFilter => {} + FallbackTransition::StartPostSelection { selection } => { + let column_chunks = data_request.into_dense_column_chunks(); + // The current row group already computed a pushdown selection. Apply that + // selection after decode instead of evaluating the predicates again. + // + // Sparse predicate chunks may not cover the base selection. Dense chunks + // are safe to reuse and preserve predicate-cache IO behavior. + return self.start_post_selection_filter( + row_group_info, + selection, + cache_info, + column_chunks, + ); + } +} +``` + +- [ ] **Step 2: Run formatter** + +Run: + +```bash +cargo fmt --all +``` + +Expected: command exits 0. + +- [ ] **Step 3: Inspect the resulting `WaitingOnData` branch** + +Run: + +```bash +nl -ba parquet/src/arrow/push_decoder/reader_builder/mod.rs | sed -n '887,945p' +``` + +Expected: the branch reads in this order: + +1. `resolve_fallback_transition` match. +2. `needed_ranges` check. +3. 
row group destructuring. +4. normal row group reader build. + +## Task 4: Verification + +**Files:** +- Verify only; no source edits expected. + +- [ ] **Step 1: Run push decoder reader builder tests** + +Run: + +```bash +cargo test -p parquet --lib arrow::push_decoder::reader_builder::tests +``` + +Expected: all tests pass. + +- [ ] **Step 2: Run async row-filter integration tests** + +Run: + +```bash +cargo test -p parquet --test arrow_reader --features arrow,async row_filter::async +``` + +Expected: all matching async row-filter tests pass. If this command does not match tests in this workspace, run: + +```bash +cargo test -p parquet --test arrow_reader --features arrow,async row_filter +``` + +and report the exact command and result. + +- [ ] **Step 3: Run formatting check** + +Run: + +```bash +cargo fmt --all --check +``` + +Expected: command exits 0. + +- [ ] **Step 4: Review diff** + +Run: + +```bash +git diff -- parquet/src/arrow/push_decoder/reader_builder/mod.rs +``` + +Expected: diff only introduces the transition enum, helper methods, comment improvements, and the shorter `WaitingOnData` fallback match. + +## Task 5: Commit + +**Files:** +- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs` +- Include this plan file only if the team wants planning artifacts committed. + +- [ ] **Step 1: Check status** + +Run: + +```bash +git status --short +``` + +Expected: only intentional files are changed. + +- [ ] **Step 2: Stage implementation** + +Run: + +```bash +git add parquet/src/arrow/push_decoder/reader_builder/mod.rs +``` + +If committing the plan file, also run: + +```bash +git add docs/superpowers/plans/2026-05-16-row-filter-fallback-readability.md +``` + +- [ ] **Step 3: Commit** + +Run: + +```bash +git commit -m "refactor(parquet): clarify row filter fallback transition" +``` + +Expected: commit succeeds. 
diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index e7c661f90312..3cdf43084040 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -61,6 +61,12 @@ struct RowGroupInfo { budget: RowBudget, } +enum FallbackTransition { + ContinuePushdown, + StartPostSelection { selection: RowSelection }, + EnablePostFilter, +} + /// This is the inner state machine for reading a single row group. #[derive(Debug)] enum RowGroupDecoderState { @@ -272,7 +278,7 @@ pub(crate) struct RowGroupReaderBuilder { /// Optional filter filter: Option, - /// Shared filter state used once Auto fallback switches to post-filter. + /// Predicate state reused by later row groups once Auto fallback switches to post-filter. post_filter: Option>>, /// The size in bytes of the predicate cache to use @@ -287,7 +293,6 @@ pub(crate) struct RowGroupReaderBuilder { row_selection_policy: RowSelectionPolicy, /// Row-group-local fallback state used by Auto policy. - #[allow(dead_code)] fallback_state: RowGroupFallbackState, /// Whether this builder may switch Auto policy to post-filter fallback. @@ -355,6 +360,59 @@ impl RowGroupReaderBuilder { self.post_filter_fallback_enabled = false; } + fn ensure_post_filter_state(&mut self) -> Result<(), ParquetError> { + if self.post_filter.is_some() { + return Ok(()); + } + + let filter = self.filter.take().ok_or_else(|| { + ParquetError::General("post-filter fallback selected without a row filter".to_string()) + })?; + self.post_filter = Some(Arc::new(Mutex::new(filter))); + Ok(()) + } + + fn resolve_fallback_transition( + &mut self, + row_group_info: &RowGroupInfo, + cache_info: Option<&CacheInfo>, + ) -> Result { + if cache_info.is_none() + || !matches!(self.fallback_state, RowGroupFallbackState::Observing { .. 
}) + || !self.post_filter_fallback_supported(row_group_info.budget) + { + return Ok(FallbackTransition::ContinuePushdown); + } + + let decision = row_group_info + .plan_builder + .resolve_selection_strategy_decision(); + let observed_selection = row_group_info.plan_builder.selection().cloned(); + + self.observe_fallback_candidate(decision, row_group_info.row_count, row_group_info.budget); + + if matches!( + self.fallback_state, + RowGroupFallbackState::UsePostFilter { .. } + ) { + if row_group_info.base_selection.is_none() { + let selection = observed_selection.unwrap_or_else(|| { + RowSelection::from(vec![RowSelector::select(row_group_info.row_count)]) + }); + return Ok(FallbackTransition::StartPostSelection { selection }); + } + + self.ensure_post_filter_state()?; + self.metrics + .record_fallback_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); + return Ok(FallbackTransition::EnablePostFilter); + } + + self.metrics + .record_fallback_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); + Ok(FallbackTransition::ContinuePushdown) + } + /// take the current state, leaving None in its place. /// /// Returns an error if there the state wasn't put back after the previous @@ -889,57 +947,23 @@ impl RowGroupReaderBuilder { data_request, cache_info, } => { - if cache_info.is_some() - && matches!(self.fallback_state, RowGroupFallbackState::Observing { .. }) - && self.post_filter_fallback_supported(row_group_info.budget) - { - let decision = row_group_info - .plan_builder - .resolve_selection_strategy_decision(); - let fallback_selection = row_group_info.plan_builder.selection().cloned(); - self.observe_fallback_candidate( - decision, - row_group_info.row_count, - row_group_info.budget, - ); - - if matches!( - self.fallback_state, - RowGroupFallbackState::UsePostFilter { .. 
} - ) { - if row_group_info.base_selection.is_none() { - let selection = fallback_selection.unwrap_or_else(|| { - RowSelection::from(vec![RowSelector::select( - row_group_info.row_count, - )]) - }); - let column_chunks = data_request.into_dense_column_chunks(); - // Sparse predicate chunks may not cover the base - // selection. Dense chunks are safe to reuse and - // preserve predicate-cache IO behavior. - return self.start_post_selection_filter( - row_group_info, - selection, - cache_info, - column_chunks, - ); - } - - if self.post_filter.is_none() { - let filter = self.filter.take().ok_or_else(|| { - ParquetError::General( - "post-filter fallback selected without a row filter" - .to_string(), - ) - })?; - self.post_filter = Some(Arc::new(Mutex::new(filter))); - } + match self.resolve_fallback_transition(&row_group_info, cache_info.as_ref())? { + FallbackTransition::ContinuePushdown | FallbackTransition::EnablePostFilter => { + } + FallbackTransition::StartPostSelection { selection } => { + let column_chunks = data_request.into_dense_column_chunks(); + // The current row group already computed a pushdown selection. Apply that + // selection after decode instead of evaluating the predicates again. + // + // Sparse predicate chunks may not cover the base selection. Dense chunks + // are safe to reuse and preserve predicate-cache IO behavior. 
+ return self.start_post_selection_filter( + row_group_info, + selection, + cache_info, + column_chunks, + ); } - - self.metrics - .record_fallback_row_group(RowGroupExecutionMode::Pushdown( - decision.strategy, - )); } let needed_ranges = data_request.needed_ranges(&self.buffers); From d31e805d9abdeec3e27018ffac44a05f6fb92d95 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sat, 16 May 2026 21:39:36 +0800 Subject: [PATCH 09/32] fix(parquet): address row filter CI failures --- ...6-05-16-row-filter-fallback-readability.md | 308 ------------------ ...-row-filter-fallback-readability-design.md | 107 ------ parquet/src/arrow/push_decoder/mod.rs | 22 +- .../arrow/push_decoder/reader_builder/mod.rs | 8 +- parquet/src/arrow/push_decoder/remaining.rs | 2 +- 5 files changed, 23 insertions(+), 424 deletions(-) delete mode 100644 docs/superpowers/plans/2026-05-16-row-filter-fallback-readability.md delete mode 100644 docs/superpowers/specs/2026-05-16-row-filter-fallback-readability-design.md diff --git a/docs/superpowers/plans/2026-05-16-row-filter-fallback-readability.md b/docs/superpowers/plans/2026-05-16-row-filter-fallback-readability.md deleted file mode 100644 index 8afeb0a60711..000000000000 --- a/docs/superpowers/plans/2026-05-16-row-filter-fallback-readability.md +++ /dev/null @@ -1,308 +0,0 @@ -# Row Filter Fallback Readability Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Reduce the cognitive load in the async Parquet row-filter fallback state machine without changing behavior. - -**Architecture:** Keep the existing module layout. Add a small private transition enum and two private helper methods inside `push_decoder/reader_builder/mod.rs`, then replace the inline fallback block in the `WaitingOnData` state with a short high-level match. 
- -**Tech Stack:** Rust, Apache Arrow Rust Parquet reader internals, existing `cargo test` and `cargo fmt`. - ---- - -## File Structure - -- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs` - - Add `FallbackTransition`. - - Clarify the `post_filter` field comment. - - Add `ensure_post_filter_state`. - - Add `resolve_fallback_transition`. - - Replace the inline fallback block in `RowGroupDecoderState::WaitingOnData`. -- No new production modules. -- No new tests required because this is behavior-preserving refactoring; existing unit and async row-filter tests cover the current behavior. - -## Task 1: Add Transition Type and Post-Filter State Helper - -**Files:** -- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:54-62` -- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:272-276` -- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:352-356` - -- [ ] **Step 1: Add `FallbackTransition` after `RowGroupInfo`** - -Insert this enum immediately after `RowGroupInfo`: - -```rust -enum FallbackTransition { - ContinuePushdown, - StartPostSelection { selection: RowSelection }, - EnablePostFilter, -} -``` - -- [ ] **Step 2: Clarify the `post_filter` field comment** - -Replace the existing field comment: - -```rust -/// Shared filter state used once Auto fallback switches to post-filter. -post_filter: Option>>, -``` - -with: - -```rust -/// Predicate state reused by later row groups once Auto fallback switches to post-filter. 
-post_filter: Option>>, -``` - -- [ ] **Step 3: Add `ensure_post_filter_state`** - -Add this private method in `impl RowGroupReaderBuilder`, near `disable_post_filter_fallback`: - -```rust -fn ensure_post_filter_state(&mut self) -> Result<(), ParquetError> { - if self.post_filter.is_some() { - return Ok(()); - } - - let filter = self.filter.take().ok_or_else(|| { - ParquetError::General("post-filter fallback selected without a row filter".to_string()) - })?; - self.post_filter = Some(Arc::new(Mutex::new(filter))); - Ok(()) -} -``` - -- [ ] **Step 4: Run focused formatting check** - -Run: - -```bash -cargo fmt --all --check -``` - -Expected: this may fail until all refactor steps are complete. If it fails only because the new code needs formatting, continue and run `cargo fmt --all` after Task 3. - -## Task 2: Extract Fallback Transition Resolution - -**Files:** -- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:358-378` -- Reads existing logic at: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:892-943` - -- [ ] **Step 1: Add `resolve_fallback_transition`** - -Add this private method in `impl RowGroupReaderBuilder`, after `ensure_post_filter_state`: - -```rust -fn resolve_fallback_transition( - &mut self, - row_group_info: &RowGroupInfo, - cache_info: Option<&CacheInfo>, -) -> Result { - if cache_info.is_none() - || !matches!(self.fallback_state, RowGroupFallbackState::Observing { .. }) - || !self.post_filter_fallback_supported(row_group_info.budget) - { - return Ok(FallbackTransition::ContinuePushdown); - } - - let decision = row_group_info - .plan_builder - .resolve_selection_strategy_decision(); - let observed_selection = row_group_info.plan_builder.selection().cloned(); - - self.observe_fallback_candidate(decision, row_group_info.row_count, row_group_info.budget); - - if matches!( - self.fallback_state, - RowGroupFallbackState::UsePostFilter { .. 
} - ) { - if row_group_info.base_selection.is_none() { - let selection = observed_selection.unwrap_or_else(|| { - RowSelection::from(vec![RowSelector::select(row_group_info.row_count)]) - }); - return Ok(FallbackTransition::StartPostSelection { selection }); - } - - self.ensure_post_filter_state()?; - self.metrics - .record_fallback_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); - return Ok(FallbackTransition::EnablePostFilter); - } - - self.metrics - .record_fallback_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); - Ok(FallbackTransition::ContinuePushdown) -} -``` - -- [ ] **Step 2: Verify behavior preservation in the helper** - -Check these details manually before moving on: - -- `StartPostSelection` returns before recording `fallback_pushdown_row_group_count`, matching the old early return. -- `EnablePostFilter` records `Pushdown(decision.strategy)`, matching the old fall-through behavior. -- `ContinuePushdown` records the same metric only after an observation was made. -- The helper does not consume `data_request` or `cache_info`. - -## Task 3: Replace the Inline `WaitingOnData` Fallback Block - -**Files:** -- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs:887-945` - -- [ ] **Step 1: Replace the inline fallback block** - -Replace the entire initial fallback-observation block at the top of the `WaitingOnData` arm. It starts with: - -```rust -if cache_info.is_some() - && matches!(self.fallback_state, RowGroupFallbackState::Observing { .. }) - && self.post_filter_fallback_supported(row_group_info.budget) -{ -} -``` - -and ends immediately before: - -```rust -let needed_ranges = data_request.needed_ranges(&self.buffers); -``` - -with: - -```rust -match self.resolve_fallback_transition(&row_group_info, cache_info.as_ref())? 
{ - FallbackTransition::ContinuePushdown | FallbackTransition::EnablePostFilter => {} - FallbackTransition::StartPostSelection { selection } => { - let column_chunks = data_request.into_dense_column_chunks(); - // The current row group already computed a pushdown selection. Apply that - // selection after decode instead of evaluating the predicates again. - // - // Sparse predicate chunks may not cover the base selection. Dense chunks - // are safe to reuse and preserve predicate-cache IO behavior. - return self.start_post_selection_filter( - row_group_info, - selection, - cache_info, - column_chunks, - ); - } -} -``` - -- [ ] **Step 2: Run formatter** - -Run: - -```bash -cargo fmt --all -``` - -Expected: command exits 0. - -- [ ] **Step 3: Inspect the resulting `WaitingOnData` branch** - -Run: - -```bash -nl -ba parquet/src/arrow/push_decoder/reader_builder/mod.rs | sed -n '887,945p' -``` - -Expected: the branch reads in this order: - -1. `resolve_fallback_transition` match. -2. `needed_ranges` check. -3. row group destructuring. -4. normal row group reader build. - -## Task 4: Verification - -**Files:** -- Verify only; no source edits expected. - -- [ ] **Step 1: Run push decoder reader builder tests** - -Run: - -```bash -cargo test -p parquet --lib arrow::push_decoder::reader_builder::tests -``` - -Expected: all tests pass. - -- [ ] **Step 2: Run async row-filter integration tests** - -Run: - -```bash -cargo test -p parquet --test arrow_reader --features arrow,async row_filter::async -``` - -Expected: all matching async row-filter tests pass. If this command does not match tests in this workspace, run: - -```bash -cargo test -p parquet --test arrow_reader --features arrow,async row_filter -``` - -and report the exact command and result. - -- [ ] **Step 3: Run formatting check** - -Run: - -```bash -cargo fmt --all --check -``` - -Expected: command exits 0. 
- -- [ ] **Step 4: Review diff** - -Run: - -```bash -git diff -- parquet/src/arrow/push_decoder/reader_builder/mod.rs -``` - -Expected: diff only introduces the transition enum, helper methods, comment improvements, and the shorter `WaitingOnData` fallback match. - -## Task 5: Commit - -**Files:** -- Modify: `parquet/src/arrow/push_decoder/reader_builder/mod.rs` -- Include this plan file only if the team wants planning artifacts committed. - -- [ ] **Step 1: Check status** - -Run: - -```bash -git status --short -``` - -Expected: only intentional files are changed. - -- [ ] **Step 2: Stage implementation** - -Run: - -```bash -git add parquet/src/arrow/push_decoder/reader_builder/mod.rs -``` - -If committing the plan file, also run: - -```bash -git add docs/superpowers/plans/2026-05-16-row-filter-fallback-readability.md -``` - -- [ ] **Step 3: Commit** - -Run: - -```bash -git commit -m "refactor(parquet): clarify row filter fallback transition" -``` - -Expected: commit succeeds. diff --git a/docs/superpowers/specs/2026-05-16-row-filter-fallback-readability-design.md b/docs/superpowers/specs/2026-05-16-row-filter-fallback-readability-design.md deleted file mode 100644 index 0c55fe3a67c9..000000000000 --- a/docs/superpowers/specs/2026-05-16-row-filter-fallback-readability-design.md +++ /dev/null @@ -1,107 +0,0 @@ -# Row Filter Fallback Readability Refactor Design - -## Context - -The current branch adds adaptive post-filter fallback for the async Parquet row-filter path. The core design is already split into focused modules: - -- `arrow_reader/post_filter.rs` owns post-decode filtering. -- `push_decoder/reader_builder/fallback.rs` owns fallback eligibility and observation. -- `push_decoder/reader_builder/selection_policy.rs` owns row-selection policy resolution. - -The remaining readability issue is in `push_decoder/reader_builder/mod.rs`, especially the `WaitingOnData` branch. 
That branch currently mixes state-machine progression with fallback observation, metrics recording, post-selection handoff, and post-filter state initialization. - -## Goal - -Reduce the cognitive load in `push_decoder/reader_builder/mod.rs` without changing runtime behavior. - -This refactor should make the row-group state machine read in a high-level order: - -1. Resolve any fallback transition. -2. Return early if the current row group should switch to post-selection filtering. -3. Wait for missing data if necessary. -4. Build the normal pushdown reader. - -## Non-Goals - -- Do not change fallback heuristics or thresholds. -- Do not change metrics semantics or counter names. -- Do not change benchmark behavior. -- Do not move the full fallback implementation into a new module in this step. -- Do not broaden fallback eligibility. - -## Proposed Design - -Add a small private transition enum near `RowGroupReaderBuilder`: - -```rust -enum FallbackTransition { - ContinuePushdown, - StartPostSelection { - selection: RowSelection, - }, - EnablePostFilter, -} -``` - -Add two private helper methods on `RowGroupReaderBuilder`: - -```rust -fn resolve_fallback_transition( - &mut self, - row_group_info: &RowGroupInfo, - cache_info: Option<&CacheInfo>, -) -> Result -``` - -```rust -fn ensure_post_filter_state(&mut self) -> Result<(), ParquetError> -``` - -`resolve_fallback_transition` owns the current fallback block from the `WaitingOnData` branch: - -- Check whether fallback observation is active and supported. -- Resolve the `RowSelectionStrategyDecision`. -- Capture the observed pushdown selection. -- Call `observe_fallback_candidate`. -- If fallback switches on and there is no base selection, return `StartPostSelection`. -- If fallback switches on and there is a base selection, initialize post-filter state and return `EnablePostFilter`. -- Otherwise record the pushdown fallback metric and return `ContinuePushdown`. 
- -The helper should not consume `data_request` or `cache_info`. If it returns `StartPostSelection`, the caller remains responsible for moving the existing values into: - -```rust -self.start_post_selection_filter( - row_group_info, - selection, - cache_info, - data_request.into_dense_column_chunks(), -) -``` - -`ensure_post_filter_state` owns only the state transition from `self.filter` into `self.post_filter`. This keeps the main state-machine branch from handling predicate-state ownership directly. - -## Naming and Comments - -While extracting the helper, improve local names without broad churn: - -- Rename `fallback_selection` to `observed_selection` or `pushdown_selection`. -- Clarify that the `post_filter` field stores predicate state reused by later row groups after fallback selects post-filter execution. -- Add a short comment before `StartPostSelection` explaining that the current row group already computed a selection, so it applies that selection after decode instead of re-running predicates. - -## Expected Result - -The `WaitingOnData` branch should become shorter and easier to scan. Fallback details should be readable in one helper, while the main branch remains focused on the row-group state machine. - -The refactor should preserve the existing module structure and keep the diff reviewable. - -## Verification - -Run these commands after implementation: - -```bash -cargo test -p parquet --lib arrow::push_decoder::reader_builder::tests -cargo test -p parquet --test arrow_reader --features arrow,async row_filter::async -cargo fmt --all --check -``` - -If the second command is too broad or the local test target names differ, run the closest existing async row-filter integration test command and report the exact command used. 
diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 055b660e7647..0463e914a1f4 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1352,10 +1352,17 @@ mod test { }, ); + let mut row_selection = Vec::with_capacity(101); + for _ in 0..50 { + row_selection.push(RowSelector::select(1)); + row_selection.push(RowSelector::skip(1)); + } + row_selection.push(RowSelector::select(100)); + let mut decoder = builder .with_batch_size(100) .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) - .with_row_selection(RowSelection::from(vec![RowSelector::select(400)])) + .with_row_selection(RowSelection::from(row_selection)) .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) .with_metrics(metrics.clone()) @@ -1365,13 +1372,13 @@ mod test { let batch = next_batch_with_data(&mut decoder, data).unwrap(); assert_eq!( predicate_rows.load(Ordering::Relaxed), - 100, + 50, "fallback observation must not re-run the predicate for the same row group" ); - assert_eq!(batch, TEST_BATCH.slice(0, 100).project(&[2]).unwrap()); + assert_eq!(batch, expected_c_every_other(0, 100)); let batch = next_batch_with_data(&mut decoder, data).unwrap(); - assert_eq!(predicate_rows.load(Ordering::Relaxed), 200); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 150); assert_eq!(batch, TEST_BATCH.slice(100, 100).project(&[2]).unwrap()); assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); @@ -2001,6 +2008,13 @@ mod test { filter_record_batch(&projected, &filter).unwrap() } + fn expected_c_every_other(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = BooleanArray::from((0..len).map(|idx| idx % 2 == 0).collect::>()); + let projected = batch.project(&[2]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + fn next_reader_with_data( decoder: &mut 
ParquetPushDecoder, data: &Bytes, diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index 3cdf43084040..fb196d32e24b 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -219,7 +219,7 @@ pub(crate) enum RowGroupBuildResult { NeedsData(Vec>), /// The active row group produced a reader. Data { - batch_reader: ParquetRecordBatchReader, + batch_reader: Box, /// Budget remaining after applying this row group's selection. remaining_budget: RowBudget, }, @@ -870,7 +870,7 @@ impl RowGroupReaderBuilder { NextState::result( RowGroupDecoderState::Finished, RowGroupBuildResult::Data { - batch_reader: reader, + batch_reader: Box::new(reader), remaining_budget: budget, }, ) @@ -936,7 +936,7 @@ impl RowGroupReaderBuilder { NextState::result( RowGroupDecoderState::Finished, RowGroupBuildResult::Data { - batch_reader: reader, + batch_reader: Box::new(reader), remaining_budget: budget, }, ) @@ -1016,7 +1016,7 @@ impl RowGroupReaderBuilder { NextState::result( RowGroupDecoderState::Finished, RowGroupBuildResult::Data { - batch_reader: reader, + batch_reader: Box::new(reader), remaining_budget: budget, }, ) diff --git a/parquet/src/arrow/push_decoder/remaining.rs b/parquet/src/arrow/push_decoder/remaining.rs index 3b73f310666c..c65b0d9c62fe 100644 --- a/parquet/src/arrow/push_decoder/remaining.rs +++ b/parquet/src/arrow/push_decoder/remaining.rs @@ -280,7 +280,7 @@ impl RemainingRowGroups { self.frontier .update_budget_after_row_group(remaining_budget); // ready to read the row group - return Ok(DecodeResult::Data(batch_reader)); + return Ok(DecodeResult::Data(*batch_reader)); } } } From bbf706421584c9e646da46f76c048c6c13860a62 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sun, 17 May 2026 14:30:51 +0800 Subject: [PATCH 10/32] refactor(parquet): frame auto post-filter as cost model --- parquet/benches/arrow_reader_row_filter.rs | 10 +- 
parquet/src/arrow/arrow_reader/metrics.rs | 282 +++++++++++++----- parquet/src/arrow/arrow_reader/mod.rs | 45 ++- parquet/src/arrow/arrow_reader/post_filter.rs | 62 ++-- parquet/src/arrow/arrow_reader/read_plan.rs | 89 +++--- parquet/src/arrow/arrow_reader/selection.rs | 30 +- parquet/src/arrow/push_decoder/mod.rs | 269 ++++++++++++++--- .../{fallback.rs => cost_model.rs} | 106 +++++-- .../arrow/push_decoder/reader_builder/data.rs | 2 +- .../arrow/push_decoder/reader_builder/mod.rs | 262 +++++++++------- parquet/src/arrow/push_decoder/remaining.rs | 7 +- 11 files changed, 821 insertions(+), 343 deletions(-) rename parquet/src/arrow/push_decoder/reader_builder/{fallback.rs => cost_model.rs} (63%) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index b20781d82f2b..46ab8551d36b 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -244,7 +244,7 @@ impl std::fmt::Display for SyncStrategy { #[derive(Clone, Copy)] enum AsyncStrategy { FullPostFilter, - PushdownAutoFallback, + PushdownAutoCostModel, PushdownSelectors, PushdownMask, } @@ -253,7 +253,7 @@ impl std::fmt::Display for AsyncStrategy { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { AsyncStrategy::FullPostFilter => write!(f, "full_post_filter"), - AsyncStrategy::PushdownAutoFallback => write!(f, "pushdown_auto_fallback"), + AsyncStrategy::PushdownAutoCostModel => write!(f, "pushdown_auto_cost_model"), AsyncStrategy::PushdownSelectors => write!(f, "pushdown_selectors"), AsyncStrategy::PushdownMask => write!(f, "pushdown_mask"), } @@ -644,7 +644,7 @@ fn benchmark_sync_strategy_matrix(c: &mut Criterion) { } /// Compare async full scan plus post-filtering against async row-level pushdown -/// strategies. This is the matrix that exercises reader `Auto` fallback because +/// strategies. 
This is the matrix that exercises reader `Auto` cost modeling because /// the async stream is backed by the push decoder row-group pipeline. fn benchmark_async_strategy_matrix(c: &mut Criterion) { let parquet_file = Bytes::from(write_parquet_file()); @@ -656,7 +656,7 @@ fn benchmark_async_strategy_matrix(c: &mut Criterion) { ]; let strategies = [ AsyncStrategy::FullPostFilter, - AsyncStrategy::PushdownAutoFallback, + AsyncStrategy::PushdownAutoCostModel, AsyncStrategy::PushdownSelectors, AsyncStrategy::PushdownMask, ]; @@ -712,7 +712,7 @@ fn benchmark_async_strategy_matrix(c: &mut Criterion) { ) .await } - AsyncStrategy::PushdownAutoFallback => { + AsyncStrategy::PushdownAutoCostModel => { let row_filter = row_filter_for(filter_type, pred_mask); benchmark_async_reader_with_policy( reader, diff --git a/parquet/src/arrow/arrow_reader/metrics.rs b/parquet/src/arrow/arrow_reader/metrics.rs index 72ac792beb3a..382e26cb5828 100644 --- a/parquet/src/arrow/arrow_reader/metrics.rs +++ b/parquet/src/arrow/arrow_reader/metrics.rs @@ -18,11 +18,86 @@ //! 
[ArrowReaderMetrics] for collecting metrics about the Arrow reader use crate::arrow::arrow_reader::selection::{ - FallbackTriggerReason, RowGroupExecutionMode, RowSelectionStrategyDecision, + CostModelDecisionReason, RowGroupExecutionMode, RowSelectionStrategyDecision, RowSelectionStrategyReason, }; use std::sync::Arc; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::time::{Duration, Instant}; + +#[derive(Clone, Copy, Debug)] +pub(crate) enum ArrowReaderPhase { + PredicateRangePlanning, + PredicateDecode, + PredicateEvaluate, + PredicateSelectionBuild, + PredicateSelectionMerge, + OutputRangePlanning, + OutputSelectionResolve, + OutputMaskFilter, + PostFilterPredicateProject, + PostFilterPredicateEvaluate, + PostFilterApplyFilter, + PostFilterOutputProject, + PostSelectionApplyFilter, +} + +impl ArrowReaderPhase { + const COUNT: usize = 13; + #[cfg(test)] + const ALL: [Self; Self::COUNT] = [ + Self::PredicateRangePlanning, + Self::PredicateDecode, + Self::PredicateEvaluate, + Self::PredicateSelectionBuild, + Self::PredicateSelectionMerge, + Self::OutputRangePlanning, + Self::OutputSelectionResolve, + Self::OutputMaskFilter, + Self::PostFilterPredicateProject, + Self::PostFilterPredicateEvaluate, + Self::PostFilterApplyFilter, + Self::PostFilterOutputProject, + Self::PostSelectionApplyFilter, + ]; + + fn index(self) -> usize { + match self { + Self::PredicateRangePlanning => 0, + Self::PredicateDecode => 1, + Self::PredicateEvaluate => 2, + Self::PredicateSelectionBuild => 3, + Self::PredicateSelectionMerge => 4, + Self::OutputRangePlanning => 5, + Self::OutputSelectionResolve => 6, + Self::OutputMaskFilter => 7, + Self::PostFilterPredicateProject => 8, + Self::PostFilterPredicateEvaluate => 9, + Self::PostFilterApplyFilter => 10, + Self::PostFilterOutputProject => 11, + Self::PostSelectionApplyFilter => 12, + } + } + + #[cfg(test)] + fn name(self) -> &'static str { + match self { + 
Self::PredicateRangePlanning => "predicate_range_planning", + Self::PredicateDecode => "predicate_decode", + Self::PredicateEvaluate => "predicate_evaluate", + Self::PredicateSelectionBuild => "predicate_selection_build", + Self::PredicateSelectionMerge => "predicate_selection_merge", + Self::OutputRangePlanning => "output_range_planning", + Self::OutputSelectionResolve => "output_selection_resolve", + Self::OutputMaskFilter => "output_mask_filter", + Self::PostFilterPredicateProject => "post_filter_predicate_project", + Self::PostFilterPredicateEvaluate => "post_filter_predicate_evaluate", + Self::PostFilterApplyFilter => "post_filter_apply_filter", + Self::PostFilterOutputProject => "post_filter_output_project", + Self::PostSelectionApplyFilter => "post_selection_apply_filter", + } + } +} /// This enum represents the state of Arrow reader metrics collection. /// @@ -49,7 +124,12 @@ impl ArrowReaderMetrics { /// Creates a new instance of [`ArrowReaderMetrics::Enabled`] pub fn enabled() -> Self { - Self::Enabled(Arc::new(ArrowReaderMetricsInner::new())) + Self::Enabled(Arc::new(ArrowReaderMetricsInner::new(false))) + } + + #[cfg(test)] + pub(crate) fn enabled_with_phase_profile() -> Self { + Self::Enabled(Arc::new(ArrowReaderMetricsInner::new(true))) } /// Predicate Cache: number of records read directly from the inner reader @@ -165,49 +245,49 @@ impl ArrowReaderMetrics { self.load(|inner| &inner.row_selection_auto_selector_long_run_plan_count) } - /// Fallback: number of row groups included in the observation window - pub fn fallback_observed_row_group_count(&self) -> Option { - self.load(|inner| &inner.fallback_observed_row_group_count) + /// Cost model: number of row groups included in the observation window + pub fn cost_model_observed_row_group_count(&self) -> Option { + self.load(|inner| &inner.cost_model_observed_row_group_count) } - /// Fallback: number of row groups executed with pushdown - pub fn fallback_pushdown_row_group_count(&self) -> Option { - 
self.load(|inner| &inner.fallback_pushdown_row_group_count) + /// Cost model: number of row groups executed with pushdown + pub fn cost_model_pushdown_row_group_count(&self) -> Option { + self.load(|inner| &inner.cost_model_pushdown_row_group_count) } - /// Fallback: number of row groups executed with post-filter - pub fn fallback_post_filter_row_group_count(&self) -> Option { - self.load(|inner| &inner.fallback_post_filter_row_group_count) + /// Cost model: number of row groups executed with post-filter + pub fn cost_model_post_filter_row_group_count(&self) -> Option { + self.load(|inner| &inner.cost_model_post_filter_row_group_count) } - /// Fallback: number of times fallback was disabled by a forced policy - pub fn fallback_forced_policy_count(&self) -> Option { - self.load(|inner| &inner.fallback_forced_policy_count) + /// Cost model: number of times cost modeling was disabled by a forced policy + pub fn cost_model_forced_policy_count(&self) -> Option { + self.load(|inner| &inner.cost_model_forced_policy_count) } - /// Fallback: number of incomplete observation-window decisions - pub fn fallback_observation_incomplete_count(&self) -> Option { - self.load(|inner| &inner.fallback_observation_incomplete_count) + /// Cost model: number of incomplete observation-window decisions + pub fn cost_model_observation_incomplete_count(&self) -> Option { + self.load(|inner| &inner.cost_model_observation_incomplete_count) } - /// Fallback: number of times pushdown remained preferred - pub fn fallback_pushdown_still_preferred_count(&self) -> Option { - self.load(|inner| &inner.fallback_pushdown_still_preferred_count) + /// Cost model: number of times pushdown remained preferred + pub fn cost_model_pushdown_still_preferred_count(&self) -> Option { + self.load(|inner| &inner.cost_model_pushdown_still_preferred_count) } - /// Fallback: number of high-selectivity no-pruning triggers - pub fn fallback_high_selectivity_no_pruning_count(&self) -> Option { - self.load(|inner| 
&inner.fallback_high_selectivity_no_pruning_count) + /// Cost model: number of high-selectivity no-pruning triggers + pub fn cost_model_high_selectivity_no_pruning_count(&self) -> Option { + self.load(|inner| &inner.cost_model_high_selectivity_no_pruning_count) } - /// Fallback: number of fragmented moderate-selectivity triggers - pub fn fallback_fragmented_moderate_selectivity_count(&self) -> Option { - self.load(|inner| &inner.fallback_fragmented_moderate_selectivity_count) + /// Cost model: number of fragmented moderate-selectivity triggers + pub fn cost_model_fragmented_moderate_selectivity_count(&self) -> Option { + self.load(|inner| &inner.cost_model_fragmented_moderate_selectivity_count) } - /// Fallback: number of fragmented high-selectivity triggers - pub fn fallback_fragmented_high_selectivity_count(&self) -> Option { - self.load(|inner| &inner.fallback_fragmented_high_selectivity_count) + /// Cost model: number of fragmented high-selectivity triggers + pub fn cost_model_fragmented_high_selectivity_count(&self) -> Option { + self.load(|inner| &inner.cost_model_fragmented_high_selectivity_count) } /// Increments the count of records read from the inner reader @@ -287,53 +367,95 @@ impl ArrowReaderMetrics { decision_count.fetch_add(1, Ordering::Relaxed); } - pub(crate) fn record_fallback_observed_row_group(&self) { + pub(crate) fn record_cost_model_observed_row_group(&self) { let Self::Enabled(inner) = self else { return; }; inner - .fallback_observed_row_group_count + .cost_model_observed_row_group_count .fetch_add(1, Ordering::Relaxed); } - pub(crate) fn record_fallback_row_group(&self, mode: RowGroupExecutionMode) { + pub(crate) fn record_cost_model_row_group(&self, mode: RowGroupExecutionMode) { let Self::Enabled(inner) = self else { return; }; let counter = match mode { - RowGroupExecutionMode::Pushdown(_) => &inner.fallback_pushdown_row_group_count, - RowGroupExecutionMode::PostFilter => &inner.fallback_post_filter_row_group_count, + 
RowGroupExecutionMode::Pushdown(_) => &inner.cost_model_pushdown_row_group_count, + RowGroupExecutionMode::PostFilter => &inner.cost_model_post_filter_row_group_count, }; counter.fetch_add(1, Ordering::Relaxed); } - pub(crate) fn record_fallback_trigger(&self, reason: FallbackTriggerReason) { + pub(crate) fn record_cost_model_trigger(&self, reason: CostModelDecisionReason) { let Self::Enabled(inner) = self else { return; }; let counter = match reason { - FallbackTriggerReason::HighSelectivityNoPruning => { - &inner.fallback_high_selectivity_no_pruning_count + CostModelDecisionReason::HighSelectivityNoPruning => { + &inner.cost_model_high_selectivity_no_pruning_count } - FallbackTriggerReason::FragmentedModerateSelectivity => { - &inner.fallback_fragmented_moderate_selectivity_count + CostModelDecisionReason::FragmentedModerateSelectivity => { + &inner.cost_model_fragmented_moderate_selectivity_count } - FallbackTriggerReason::FragmentedHighSelectivity => { - &inner.fallback_fragmented_high_selectivity_count + CostModelDecisionReason::FragmentedHighSelectivity => { + &inner.cost_model_fragmented_high_selectivity_count } - FallbackTriggerReason::ObservationIncomplete => { - &inner.fallback_observation_incomplete_count + CostModelDecisionReason::ObservationIncomplete => { + &inner.cost_model_observation_incomplete_count } - FallbackTriggerReason::PushdownStillPreferred => { - &inner.fallback_pushdown_still_preferred_count + CostModelDecisionReason::PushdownStillPreferred => { + &inner.cost_model_pushdown_still_preferred_count } - FallbackTriggerReason::ForcedPolicy => &inner.fallback_forced_policy_count, + CostModelDecisionReason::ForcedPolicy => &inner.cost_model_forced_policy_count, }; counter.fetch_add(1, Ordering::Relaxed); } + pub(crate) fn time_phase(&self, phase: ArrowReaderPhase, f: impl FnOnce() -> T) -> T { + let Self::Enabled(inner) = self else { + return f(); + }; + if !inner.phase_profile_enabled { + return f(); + } + + let start = Instant::now(); + let 
result = f(); + inner.record_phase(phase, start.elapsed()); + result + } + + #[cfg(test)] + pub(crate) fn phase_profile_report(&self) -> Option { + let Self::Enabled(inner) = self else { + return None; + }; + if !inner.phase_profile_enabled { + return None; + } + + let mut lines = vec!["phase,total_ms,count,avg_us".to_string()]; + for phase in ArrowReaderPhase::ALL { + let idx = phase.index(); + let total_ns = inner.phase_ns[idx].load(Ordering::Relaxed); + let count = inner.phase_counts[idx].load(Ordering::Relaxed); + if count == 0 { + continue; + } + + let total_ms = total_ns as f64 / 1_000_000.0; + let avg_us = total_ns as f64 / count as f64 / 1_000.0; + lines.push(format!( + "{},{total_ms:.3},{count},{avg_us:.3}", + phase.name() + )); + } + Some(lines.join("\n")) + } + fn load(&self, metric: fn(&ArrowReaderMetricsInner) -> &AtomicUsize) -> Option { match self { Self::Disabled => None, @@ -382,29 +504,32 @@ pub struct ArrowReaderMetricsInner { row_selection_auto_selector_clustered_plan_count: AtomicUsize, /// Number of Auto plans choosing selectors for long runs row_selection_auto_selector_long_run_plan_count: AtomicUsize, - /// Number of row groups included in fallback observation - fallback_observed_row_group_count: AtomicUsize, - /// Number of fallback-capable row groups executed with pushdown - fallback_pushdown_row_group_count: AtomicUsize, + /// Number of row groups included in cost-model observation + cost_model_observed_row_group_count: AtomicUsize, + /// Number of cost-model eligible row groups executed with pushdown + cost_model_pushdown_row_group_count: AtomicUsize, /// Number of row groups executed with post-filter - fallback_post_filter_row_group_count: AtomicUsize, - /// Number of fallback decisions disabled by forced policy - fallback_forced_policy_count: AtomicUsize, - /// Number of incomplete fallback observations - fallback_observation_incomplete_count: AtomicUsize, - /// Number of fallback decisions that kept pushdown - 
fallback_pushdown_still_preferred_count: AtomicUsize, - /// Number of high-selectivity no-pruning fallback triggers - fallback_high_selectivity_no_pruning_count: AtomicUsize, - /// Number of fragmented moderate-selectivity fallback triggers - fallback_fragmented_moderate_selectivity_count: AtomicUsize, - /// Number of fragmented high-selectivity fallback triggers - fallback_fragmented_high_selectivity_count: AtomicUsize, + cost_model_post_filter_row_group_count: AtomicUsize, + /// Number of cost-model decisions disabled by forced policy + cost_model_forced_policy_count: AtomicUsize, + /// Number of incomplete cost-model observations + cost_model_observation_incomplete_count: AtomicUsize, + /// Number of cost-model decisions that kept pushdown + cost_model_pushdown_still_preferred_count: AtomicUsize, + /// Number of high-selectivity no-pruning cost-model triggers + cost_model_high_selectivity_no_pruning_count: AtomicUsize, + /// Number of fragmented moderate-selectivity cost-model triggers + cost_model_fragmented_moderate_selectivity_count: AtomicUsize, + /// Number of fragmented high-selectivity cost-model triggers + cost_model_fragmented_high_selectivity_count: AtomicUsize, + phase_profile_enabled: bool, + phase_ns: [AtomicU64; ArrowReaderPhase::COUNT], + phase_counts: [AtomicUsize; ArrowReaderPhase::COUNT], } impl ArrowReaderMetricsInner { /// Creates a new instance of `ArrowReaderMetricsInner` - pub(crate) fn new() -> Self { + pub(crate) fn new(phase_profile_enabled: bool) -> Self { Self { records_read_from_inner: AtomicUsize::new(0), records_read_from_cache: AtomicUsize::new(0), @@ -423,15 +548,24 @@ impl ArrowReaderMetricsInner { row_selection_auto_mask_high_ratio_plan_count: AtomicUsize::new(0), row_selection_auto_selector_clustered_plan_count: AtomicUsize::new(0), row_selection_auto_selector_long_run_plan_count: AtomicUsize::new(0), - fallback_observed_row_group_count: AtomicUsize::new(0), - fallback_pushdown_row_group_count: AtomicUsize::new(0), - 
fallback_post_filter_row_group_count: AtomicUsize::new(0), - fallback_forced_policy_count: AtomicUsize::new(0), - fallback_observation_incomplete_count: AtomicUsize::new(0), - fallback_pushdown_still_preferred_count: AtomicUsize::new(0), - fallback_high_selectivity_no_pruning_count: AtomicUsize::new(0), - fallback_fragmented_moderate_selectivity_count: AtomicUsize::new(0), - fallback_fragmented_high_selectivity_count: AtomicUsize::new(0), + cost_model_observed_row_group_count: AtomicUsize::new(0), + cost_model_pushdown_row_group_count: AtomicUsize::new(0), + cost_model_post_filter_row_group_count: AtomicUsize::new(0), + cost_model_forced_policy_count: AtomicUsize::new(0), + cost_model_observation_incomplete_count: AtomicUsize::new(0), + cost_model_pushdown_still_preferred_count: AtomicUsize::new(0), + cost_model_high_selectivity_no_pruning_count: AtomicUsize::new(0), + cost_model_fragmented_moderate_selectivity_count: AtomicUsize::new(0), + cost_model_fragmented_high_selectivity_count: AtomicUsize::new(0), + phase_profile_enabled, + phase_ns: std::array::from_fn(|_| AtomicU64::new(0)), + phase_counts: std::array::from_fn(|_| AtomicUsize::new(0)), } } + + fn record_phase(&self, phase: ArrowReaderPhase, duration: Duration) { + let idx = phase.index(); + self.phase_ns[idx].fetch_add(duration.as_nanos() as u64, Ordering::Relaxed); + self.phase_counts[idx].fetch_add(1, Ordering::Relaxed); + } } diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 9877417f0884..c02789be6fa0 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -50,7 +50,7 @@ use crate::file::metadata::{ use crate::file::reader::{ChunkReader, SerializedPageReader}; use crate::schema::types::SchemaDescriptor; -use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; +use crate::arrow::arrow_reader::metrics::{ArrowReaderMetrics, ArrowReaderPhase}; // Exposed so integration tests and benchmarks can temporarily 
override the threshold. pub use read_plan::{PredicateOptions, ReadPlan, ReadPlanBuilder}; @@ -1228,7 +1228,10 @@ impl ParquetRecordBatchReaderBuilder { .with_parquet_metadata(&reader.metadata) .build_array_reader(fields.as_deref(), predicate.projection())?; - plan_builder = plan_builder.with_predicate(array_reader, predicate.as_mut())?; + plan_builder = plan_builder.with_predicate_options( + PredicateOptions::new(array_reader, predicate.as_mut()) + .with_metrics(metrics.clone()), + )?; } } @@ -1244,7 +1247,11 @@ impl ParquetRecordBatchReaderBuilder { .build_limited() .build_with_metrics(&metrics); - Ok(ParquetRecordBatchReader::new(array_reader, read_plan)) + Ok(ParquetRecordBatchReader::new_with_metrics( + array_reader, + read_plan, + metrics, + )) } } @@ -1347,6 +1354,7 @@ pub struct ParquetRecordBatchReader { array_reader_position: usize, schema: SchemaRef, read_plan: ReadPlan, + metrics: ArrowReaderMetrics, post_filter: Option, post_selection_filter: Option, buffered_batches: Option>, @@ -1484,8 +1492,11 @@ impl ParquetRecordBatchReader { ) })?; - let filtered_batch = - filter_record_batch(&RecordBatch::from(struct_array), &mask)?; + let filtered_batch = self + .metrics + .time_phase(ArrowReaderPhase::OutputMaskFilter, || { + filter_record_batch(&RecordBatch::from(struct_array), &mask) + })?; if filtered_batch.num_rows() != selected_rows { return Err(general_err!( @@ -1568,8 +1579,11 @@ impl ParquetRecordBatchReader { ) })?; - let filtered_batch = - filter_record_batch(&RecordBatch::from(struct_array), &mask)?; + let filtered_batch = self + .metrics + .time_phase(ArrowReaderPhase::OutputMaskFilter, || { + filter_record_batch(&RecordBatch::from(struct_array), &mask) + })?; if filtered_batch.num_rows() != mask_chunk.selected_rows { return Err(general_err!( @@ -1709,6 +1723,7 @@ impl ParquetRecordBatchReader { array_reader_position: 0, schema: Arc::new(Schema::new(levels.fields.clone())), read_plan, + metrics, post_filter: None, post_selection_filter: None, 
buffered_batches: None, @@ -1719,6 +1734,14 @@ impl ParquetRecordBatchReader { /// a time from [`ArrayReader`] based on the configured `selection`. If `selection` is `None` /// all rows will be returned pub(crate) fn new(array_reader: Box, read_plan: ReadPlan) -> Self { + Self::new_with_metrics(array_reader, read_plan, ArrowReaderMetrics::disabled()) + } + + pub(crate) fn new_with_metrics( + array_reader: Box, + read_plan: ReadPlan, + metrics: ArrowReaderMetrics, + ) -> Self { let schema = match array_reader.get_data_type() { ArrowType::Struct(fields) => Schema::new(fields.clone()), _ => unreachable!("Struct array reader's data type is not struct!"), @@ -1729,6 +1752,7 @@ impl ParquetRecordBatchReader { array_reader_position: 0, schema: Arc::new(schema), read_plan, + metrics, post_filter: None, post_selection_filter: None, buffered_batches: None, @@ -1739,6 +1763,7 @@ impl ParquetRecordBatchReader { array_reader: Box, read_plan: ReadPlan, selection: RowSelection, + metrics: ArrowReaderMetrics, ) -> Self { let schema = match array_reader.get_data_type() { ArrowType::Struct(fields) => Schema::new(fields.clone()), @@ -1750,8 +1775,9 @@ impl ParquetRecordBatchReader { array_reader_position: 0, schema: Arc::new(schema), read_plan, + metrics: metrics.clone(), post_filter: None, - post_selection_filter: Some(PostSelectionFilterState::new(selection)), + post_selection_filter: Some(PostSelectionFilterState::new(selection, metrics)), buffered_batches: None, } } @@ -1763,6 +1789,7 @@ impl ParquetRecordBatchReader { parquet_schema: &SchemaDescriptor, read_projection: &ProjectionMask, output_projection: &ProjectionMask, + metrics: ArrowReaderMetrics, ) -> Result { let read_schema = match array_reader.get_data_type() { ArrowType::Struct(fields) => Schema::new(fields.clone()), @@ -1770,6 +1797,7 @@ impl ParquetRecordBatchReader { }; let post_filter = PostFilterState::try_new( filter, + metrics.clone(), parquet_schema, &read_schema, read_projection, @@ -1782,6 +1810,7 @@ impl 
ParquetRecordBatchReader { array_reader_position: 0, schema, read_plan, + metrics, post_filter: Some(post_filter), post_selection_filter: None, buffered_batches: None, diff --git a/parquet/src/arrow/arrow_reader/post_filter.rs b/parquet/src/arrow/arrow_reader/post_filter.rs index 7e533703c4e7..a6777a337368 100644 --- a/parquet/src/arrow/arrow_reader/post_filter.rs +++ b/parquet/src/arrow/arrow_reader/post_filter.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. -//! Post-decode filtering support for parquet row-filter fallback. +//! Post-decode filtering support for parquet row filters. //! //! Normal predicate pushdown decodes predicate columns first, builds a //! `RowSelection`, and then decodes output columns for selected rows. The -//! fallback path in this module instead decodes the union of predicate and +//! post-filter path in this module instead decodes the union of predicate and //! output columns once and applies predicates after decode. //! //! ```text @@ -39,6 +39,7 @@ //! and little pruning, especially fragmented high-selectivity selections. 
use crate::arrow::ProjectionMask; +use crate::arrow::arrow_reader::metrics::{ArrowReaderMetrics, ArrowReaderPhase}; use crate::arrow::arrow_reader::{RowFilter, RowSelection}; use crate::errors::{ParquetError, Result}; use crate::schema::types::SchemaDescriptor; @@ -51,6 +52,7 @@ use std::sync::{Arc, Mutex}; #[derive(Debug)] pub(super) struct PostFilterState { filter: Arc>, + metrics: ArrowReaderMetrics, predicate_projection_indices: Vec>, predicate_projection_schemas: Vec, output_projection_indices: Vec, @@ -60,6 +62,7 @@ pub(super) struct PostFilterState { impl PostFilterState { pub(super) fn try_new( filter: Arc>, + metrics: ArrowReaderMetrics, parquet_schema: &SchemaDescriptor, read_schema: &Schema, read_projection: &ProjectionMask, @@ -92,6 +95,7 @@ impl PostFilterState { Ok(Self { filter, + metrics, predicate_projection_indices, predicate_projection_schemas, output_projection_indices, @@ -114,12 +118,20 @@ impl PostFilterState { .enumerate() { let input_rows = batch.num_rows(); - let predicate_batch = project_record_batch( - &batch, - projection_indices, - Arc::clone(&self.predicate_projection_schemas[predicate_idx]), - )?; - let predicate_filter = predicate.evaluate(predicate_batch)?; + let predicate_batch = + self.metrics + .time_phase(ArrowReaderPhase::PostFilterPredicateProject, || { + project_record_batch( + &batch, + projection_indices, + Arc::clone(&self.predicate_projection_schemas[predicate_idx]), + ) + })?; + let predicate_filter = self + .metrics + .time_phase(ArrowReaderPhase::PostFilterPredicateEvaluate, || { + predicate.evaluate(predicate_batch) + })?; if predicate_filter.len() != input_rows { return Err(general_err!( @@ -128,17 +140,25 @@ impl PostFilterState { )); } - batch = filter_record_batch(&batch, &predicate_filter)?; + batch = self + .metrics + .time_phase(ArrowReaderPhase::PostFilterApplyFilter, || { + filter_record_batch(&batch, &predicate_filter) + })?; if batch.num_rows() == 0 { break; } } - Ok(project_record_batch( - &batch, - 
&self.output_projection_indices, - Arc::clone(&self.output_schema), - )?) + Ok(self + .metrics + .time_phase(ArrowReaderPhase::PostFilterOutputProject, || { + project_record_batch( + &batch, + &self.output_projection_indices, + Arc::clone(&self.output_schema), + ) + })?) } } @@ -146,20 +166,22 @@ impl PostFilterState { pub(super) struct PostSelectionFilterState { mask: BooleanBuffer, position: usize, + metrics: ArrowReaderMetrics, } impl PostSelectionFilterState { - pub(super) fn new(selection: RowSelection) -> Self { + pub(super) fn new(selection: RowSelection, metrics: ArrowReaderMetrics) -> Self { Self { mask: selection.boolean_mask(), position: 0, + metrics, } } pub(super) fn apply(&mut self, batch: RecordBatch) -> Result { // This path is not predicate post-filtering. It is used after pushdown // has already computed a final RowSelection for the current row group, - // but fallback chooses to decode the base selection and apply that + // but the post-filter path decodes the base selection and applies that // already-computed selection after decode. let input_rows = batch.num_rows(); let end = self.position.saturating_add(input_rows); @@ -172,7 +194,11 @@ impl PostSelectionFilterState { let filter = BooleanArray::from(self.mask.slice(self.position, input_rows)); self.position = end; - Ok(filter_record_batch(&batch, &filter)?) + Ok(self + .metrics + .time_phase(ArrowReaderPhase::PostSelectionApplyFilter, || { + filter_record_batch(&batch, &filter) + })?) 
} } @@ -232,7 +258,7 @@ fn projection_indices( let root = parquet_schema.get_column_root(leaf_idx); if !root.is_primitive() { return Err(general_err!( - "post-filter fallback does not support nested read column {}", + "post-filter cost model does not support nested read column {}", root.name() )); } diff --git a/parquet/src/arrow/arrow_reader/read_plan.rs b/parquet/src/arrow/arrow_reader/read_plan.rs index aafe6166994b..bc423d81d1ac 100644 --- a/parquet/src/arrow/arrow_reader/read_plan.rs +++ b/parquet/src/arrow/arrow_reader/read_plan.rs @@ -19,7 +19,7 @@ //! from a Parquet file use crate::arrow::array_reader::ArrayReader; -use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; +use crate::arrow::arrow_reader::metrics::{ArrowReaderMetrics, ArrowReaderPhase}; use crate::arrow::arrow_reader::selection::{ LoadedRowRanges, RowSelectionPolicy, RowSelectionShape, RowSelectionStrategy, RowSelectionStrategyDecision, RowSelectionStrategyReason, @@ -45,6 +45,7 @@ pub struct PredicateOptions<'a> { predicate: &'a mut dyn ArrowPredicate, limit: Option, total_rows: usize, + metrics: ArrowReaderMetrics, } impl<'a> PredicateOptions<'a> { @@ -60,6 +61,7 @@ impl<'a> PredicateOptions<'a> { predicate, limit: None, total_rows: 0, + metrics: ArrowReaderMetrics::disabled(), } } @@ -83,6 +85,11 @@ impl<'a> PredicateOptions<'a> { self.total_rows = total_rows; self } + + pub(crate) fn with_metrics(mut self, metrics: ArrowReaderMetrics) -> Self { + self.metrics = metrics; + self + } } /// A builder for [`ReadPlan`] @@ -226,6 +233,7 @@ impl ReadPlanBuilder { predicate, limit, total_rows, + metrics, } = options; // Target length for the concatenated filter output: @@ -239,14 +247,21 @@ impl ReadPlanBuilder { None => limit.map(|_| total_rows), }; - let reader = ParquetRecordBatchReader::new(array_reader, self.clone().build()); + let mut reader = ParquetRecordBatchReader::new(array_reader, self.clone().build()); let mut filters = vec![]; let mut processed_rows: usize = 0; let mut 
matched_rows: usize = 0; - for maybe_batch in reader { + loop { + let maybe_batch = + metrics.time_phase(ArrowReaderPhase::PredicateDecode, || reader.next()); + let Some(maybe_batch) = maybe_batch else { + break; + }; let maybe_batch = maybe_batch?; let input_rows = maybe_batch.num_rows(); - let filter = predicate.evaluate(maybe_batch)?; + let filter = metrics.time_phase(ArrowReaderPhase::PredicateEvaluate, || { + predicate.evaluate(maybe_batch) + })?; // Since user supplied predicate, check error here to catch bugs quickly if filter.len() != input_rows { return Err(arrow_err!( @@ -294,9 +309,15 @@ impl ReadPlanBuilder { if all_selected && self.selection.is_none() { return Ok(self); } - let raw = RowSelection::from_filters(&filters); + let raw = metrics.time_phase(ArrowReaderPhase::PredicateSelectionBuild, || { + RowSelection::from_filters(&filters) + }); self.selection = match self.selection.take() { - Some(selection) => Some(selection.and_then(&raw)), + Some(selection) => Some( + metrics.time_phase(ArrowReaderPhase::PredicateSelectionMerge, || { + selection.and_then(&raw) + }), + ), None => Some(raw), }; Ok(self) @@ -663,12 +684,12 @@ mod tests { } #[test] - fn fallback_classifier_triggers_for_fragmented_high_selectivity() { + fn cost_model_classifier_triggers_for_fragmented_high_selectivity() { use crate::arrow::arrow_reader::selection::{ - FallbackObservation, FallbackTriggerReason, RowSelectionShape, + CostModelDecisionReason, CostModelObservation, RowSelectionShape, }; - let observation = FallbackObservation { + let observation = CostModelObservation { observed_row_groups: 2, shape: RowSelectionShape { selected_rows: 128, @@ -681,17 +702,17 @@ mod tests { assert_eq!( observation.trigger_reason(), - FallbackTriggerReason::FragmentedHighSelectivity + CostModelDecisionReason::FragmentedHighSelectivity ); } #[test] - fn fallback_classifier_waits_for_observation_window() { + fn cost_model_classifier_waits_for_observation_window() { use 
crate::arrow::arrow_reader::selection::{ - FallbackObservation, FallbackTriggerReason, RowSelectionShape, + CostModelDecisionReason, CostModelObservation, RowSelectionShape, }; - let observation = FallbackObservation { + let observation = CostModelObservation { observed_row_groups: 0, shape: RowSelectionShape { selected_rows: 64, @@ -704,17 +725,17 @@ mod tests { assert_eq!( observation.trigger_reason(), - FallbackTriggerReason::ObservationIncomplete + CostModelDecisionReason::ObservationIncomplete ); } #[test] - fn fallback_classifier_triggers_for_high_selectivity_without_pruning() { + fn cost_model_classifier_triggers_for_high_selectivity_without_pruning() { use crate::arrow::arrow_reader::selection::{ - FallbackObservation, FallbackTriggerReason, RowSelectionShape, + CostModelDecisionReason, CostModelObservation, RowSelectionShape, }; - let observation = FallbackObservation { + let observation = CostModelObservation { observed_row_groups: 2, shape: RowSelectionShape { selected_rows: 200, @@ -727,17 +748,17 @@ mod tests { assert_eq!( observation.trigger_reason(), - FallbackTriggerReason::HighSelectivityNoPruning + CostModelDecisionReason::HighSelectivityNoPruning ); } #[test] - fn fallback_classifier_triggers_for_fragmented_moderate_selectivity() { + fn cost_model_classifier_triggers_for_fragmented_moderate_selectivity() { use crate::arrow::arrow_reader::selection::{ - FallbackObservation, FallbackTriggerReason, RowSelectionShape, + CostModelDecisionReason, CostModelObservation, RowSelectionShape, }; - let observation = FallbackObservation { + let observation = CostModelObservation { observed_row_groups: 2, shape: RowSelectionShape { selected_rows: 30, @@ -750,17 +771,17 @@ mod tests { assert_eq!( observation.trigger_reason(), - FallbackTriggerReason::FragmentedModerateSelectivity + CostModelDecisionReason::FragmentedModerateSelectivity ); } #[test] - fn fallback_classifier_triggers_for_fragmented_near_ten_percent_selectivity() { + fn 
cost_model_classifier_triggers_for_fragmented_near_ten_percent_selectivity() { use crate::arrow::arrow_reader::selection::{ - FallbackObservation, FallbackTriggerReason, RowSelectionShape, + CostModelDecisionReason, CostModelObservation, RowSelectionShape, }; - let observation = FallbackObservation { + let observation = CostModelObservation { observed_row_groups: 1, shape: RowSelectionShape { selected_rows: 9, @@ -773,17 +794,17 @@ mod tests { assert_eq!( observation.trigger_reason(), - FallbackTriggerReason::FragmentedModerateSelectivity + CostModelDecisionReason::FragmentedModerateSelectivity ); } #[test] - fn fallback_classifier_keeps_q38_like_low_selectivity_fragmented_pushdown() { + fn cost_model_classifier_keeps_q38_like_low_selectivity_fragmented_pushdown() { use crate::arrow::arrow_reader::selection::{ - FallbackObservation, FallbackTriggerReason, RowSelectionShape, + CostModelDecisionReason, CostModelObservation, RowSelectionShape, }; - let observation = FallbackObservation { + let observation = CostModelObservation { observed_row_groups: 1, shape: RowSelectionShape { selected_rows: 4_870, @@ -796,17 +817,17 @@ mod tests { assert_eq!( observation.trigger_reason(), - FallbackTriggerReason::PushdownStillPreferred + CostModelDecisionReason::PushdownStillPreferred ); } #[test] - fn fallback_classifier_keeps_low_selectivity_fragmented_pushdown() { + fn cost_model_classifier_keeps_low_selectivity_fragmented_pushdown() { use crate::arrow::arrow_reader::selection::{ - FallbackObservation, FallbackTriggerReason, RowSelectionShape, + CostModelDecisionReason, CostModelObservation, RowSelectionShape, }; - let observation = FallbackObservation { + let observation = CostModelObservation { observed_row_groups: 1, shape: RowSelectionShape { selected_rows: 4, @@ -819,7 +840,7 @@ mod tests { assert_eq!( observation.trigger_reason(), - FallbackTriggerReason::PushdownStillPreferred + CostModelDecisionReason::PushdownStillPreferred ); } diff --git 
a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 72c44e169b4b..7bb835aaac60 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -174,7 +174,7 @@ impl RowSelectionShape { #[allow(dead_code)] #[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub(crate) enum FallbackTriggerReason { +pub(crate) enum CostModelDecisionReason { /// Predicate pushdown kept almost everything and did not produce useful pruning. HighSelectivityNoPruning, /// Fragmented runs with moderate selectivity often pay many small skip/read costs. @@ -190,7 +190,7 @@ pub(crate) enum FallbackTriggerReason { } /// Aggregate row-selection shape observed while deciding whether Auto should -/// continue predicate pushdown or fall back to post-filter execution. +/// continue predicate pushdown or switch to post-filter execution. /// /// The classifier looks for shapes where row-level pushdown is unlikely to /// recover its own overhead: @@ -201,49 +201,49 @@ pub(crate) enum FallbackTriggerReason { /// high selected ratio -> most output rows are decoded anyway /// ``` #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] -pub(crate) struct FallbackObservation { +pub(crate) struct CostModelObservation { pub(crate) observed_row_groups: usize, pub(crate) shape: RowSelectionShape, } -impl FallbackObservation { +impl CostModelObservation { pub(crate) const OBSERVATION_ROW_GROUPS: usize = 1; const FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO: f64 = 0.08; - pub(crate) fn trigger_reason(self) -> FallbackTriggerReason { + pub(crate) fn trigger_reason(self) -> CostModelDecisionReason { if self.observed_row_groups < Self::OBSERVATION_ROW_GROUPS { - return FallbackTriggerReason::ObservationIncomplete; + return CostModelDecisionReason::ObservationIncomplete; } let shape = self.shape; if shape.total_rows() > 0 && shape.skipped_rows == 0 && shape.selected_ratio() >= 0.95 { - return 
FallbackTriggerReason::HighSelectivityNoPruning; + return CostModelDecisionReason::HighSelectivityNoPruning; } let fragmented = shape.average_selected_run_length() <= 4.0 && shape.run_density() >= 0.01; if !fragmented { - return FallbackTriggerReason::PushdownStillPreferred; + return CostModelDecisionReason::PushdownStillPreferred; } let selected_ratio = shape.selected_ratio(); if (Self::FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO..0.50).contains(&selected_ratio) { - return FallbackTriggerReason::FragmentedModerateSelectivity; + return CostModelDecisionReason::FragmentedModerateSelectivity; } if selected_ratio < 0.50 { - return FallbackTriggerReason::PushdownStillPreferred; + return CostModelDecisionReason::PushdownStillPreferred; } - FallbackTriggerReason::FragmentedHighSelectivity + CostModelDecisionReason::FragmentedHighSelectivity } #[allow(dead_code)] - pub(crate) fn should_fallback(self) -> bool { + pub(crate) fn prefers_post_filter(self) -> bool { matches!( self.trigger_reason(), - FallbackTriggerReason::HighSelectivityNoPruning - | FallbackTriggerReason::FragmentedModerateSelectivity - | FallbackTriggerReason::FragmentedHighSelectivity + CostModelDecisionReason::HighSelectivityNoPruning + | CostModelDecisionReason::FragmentedModerateSelectivity + | CostModelDecisionReason::FragmentedHighSelectivity ) } } diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 0463e914a1f4..a6c8b84a4eaf 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -404,7 +404,7 @@ impl ParquetDecoderState { ) -> Result<(Self, DecodeResult), ParquetError> { let mut current_state = self; loop { - current_state.disable_post_filter_fallback(); + current_state.disable_post_filter_cost_model(); let (next_state, decode_result) = current_state.transition()?; // if more data is needed to transition, can't proceed further without it match decode_result { @@ -424,7 +424,7 @@ impl ParquetDecoderState { } => { // 
The reader API can advance to future row groups before // the returned reader is consumed. Disable post-filter - // fallback before building row groups for this API; this + // cost modeling before building row groups for this API; this // materialization remains only as a guard for mixed API use // where a post-filter reader was already active. record_batch_reader.materialize_post_filter()?; @@ -441,12 +441,12 @@ impl ParquetDecoderState { } } - fn disable_post_filter_fallback(&mut self) { + fn disable_post_filter_cost_model(&mut self) { if let Self::ReadingRowGroup { remaining_row_groups, } = self { - remaining_row_groups.disable_post_filter_fallback(); + remaining_row_groups.disable_post_filter_cost_model(); } } @@ -623,21 +623,29 @@ mod test { use crate::DecodeResult; use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; use crate::arrow::arrow_reader::{ - ArrowPredicateFn, ParquetRecordBatchReader, RowFilter, RowSelection, RowSelectionPolicy, - RowSelector, + ArrowPredicateFn, ArrowReaderOptions, ParquetRecordBatchReader, RowFilter, RowSelection, + RowSelectionPolicy, RowSelector, }; + use crate::arrow::async_reader::AsyncFileReader; use crate::arrow::push_decoder::{ParquetPushDecoder, ParquetPushDecoderBuilder}; - use crate::arrow::{ArrowWriter, ProjectionMask}; + use crate::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask}; use crate::errors::ParquetError; - use crate::file::metadata::ParquetMetaDataPushDecoder; + use crate::file::metadata::{ + PageIndexPolicy, ParquetMetaData, ParquetMetaDataPushDecoder, ParquetMetaDataReader, + }; use crate::file::properties::WriterProperties; - use arrow::compute::kernels::cmp::{gt, lt}; + use arrow::compute::kernels::cmp::{gt, lt, neq}; + use arrow_array::builder::{ArrayBuilder, StringViewBuilder}; use arrow_array::cast::AsArray; use arrow_array::types::Int64Type; use arrow_array::{ArrayRef, BooleanArray, Int64Array, RecordBatch, StringViewArray}; + use arrow_schema::{DataType, Field, Schema}; 
use arrow_select::concat::concat_batches; use arrow_select::filter::filter_record_batch; use bytes::Bytes; + use futures::future::BoxFuture; + use futures::{FutureExt, StreamExt}; + use rand::{Rng, SeedableRng, rngs::StdRng}; use std::fmt::Debug; use std::ops::Range; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -1080,8 +1088,8 @@ mod test { } #[test] - fn test_decoder_auto_fallback_uses_post_filter_after_observation() { - let data = &FALLBACK_TEST_FILE_DATA; + fn test_decoder_auto_cost_model_uses_post_filter_after_observation() { + let data = &COST_MODEL_TEST_FILE_DATA; let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); @@ -1120,7 +1128,7 @@ mod test { assert_eq!( predicate_rows.load(Ordering::Relaxed), 300, - "fallback should evaluate predicates while producing the current row group" + "cost model should evaluate predicates while producing the current row group" ); assert_eq!(batch, TEST_BATCH.slice(200, 100).project(&[2]).unwrap()); assert_eq!(predicate_rows.load(Ordering::Relaxed), 300); @@ -1130,19 +1138,19 @@ mod test { assert_eq!(batch, TEST_BATCH.slice(300, 100).project(&[2]).unwrap()); assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); - assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); - assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(0)); - assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(4)); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); assert_eq!( - metrics.fallback_high_selectivity_no_pruning_count(), + metrics.cost_model_high_selectivity_no_pruning_count(), Some(1) ); assert!(next_batch_with_data(&mut decoder, data).is_none()); } #[test] - fn test_decoder_try_next_reader_skips_post_filter_fallback() { - let 
data = &FALLBACK_TEST_FILE_DATA; + fn test_decoder_try_next_reader_skips_post_filter_cost_model() { + let data = &COST_MODEL_TEST_FILE_DATA; let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); @@ -1178,13 +1186,13 @@ mod test { assert!(reader.next().is_none()); } - assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); assert!(next_reader_with_data(&mut decoder, data).is_none()); } #[test] - fn test_decoder_auto_fallback_post_filter_applies_fragmented_filter() { - let data = &FALLBACK_TEST_FILE_DATA; + fn test_decoder_auto_cost_model_post_filter_applies_fragmented_filter() { + let data = &COST_MODEL_TEST_FILE_DATA; let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); @@ -1226,7 +1234,7 @@ mod test { assert_eq!( predicate_rows.load(Ordering::Relaxed), (row_group_idx + 1) * 100, - "fallback should evaluate predicates while producing the current row group" + "cost model should evaluate predicates while producing the current row group" ); assert_eq!( batch, @@ -1238,19 +1246,19 @@ mod test { ); } - assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); - assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(0)); - assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(4)); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); assert_eq!( - metrics.fallback_fragmented_high_selectivity_count(), + metrics.cost_model_fragmented_high_selectivity_count(), Some(1) ); assert!(next_batch_with_data(&mut decoder, data).is_none()); } #[test] - fn 
test_decoder_auto_fallback_records_fragmented_moderate_selectivity() { - let data = &FALLBACK_TEST_FILE_DATA; + fn test_decoder_auto_cost_model_records_fragmented_moderate_selectivity() { + let data = &COST_MODEL_TEST_FILE_DATA; let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); @@ -1288,19 +1296,19 @@ mod test { ); } - assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); - assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(0)); - assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(4)); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); assert_eq!( - metrics.fallback_fragmented_moderate_selectivity_count(), + metrics.cost_model_fragmented_moderate_selectivity_count(), Some(1) ); assert!(next_batch_with_data(&mut decoder, data).is_none()); } #[test] - fn test_decoder_auto_fallback_current_row_uses_predicate_cache() { - let data = &FALLBACK_TEST_FILE_DATA; + fn test_decoder_auto_cost_model_current_row_uses_predicate_cache() { + let data = &COST_MODEL_TEST_FILE_DATA; let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); @@ -1327,14 +1335,14 @@ mod test { let batch = next_batch_with_data(&mut decoder, data).unwrap(); assert_eq!(batch, TEST_BATCH.slice(0, 100).project(&[0, 2]).unwrap()); - assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); - assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(1)); assert_eq!(metrics.records_read_from_cache(), Some(100)); } #[test] - fn 
test_decoder_auto_fallback_with_row_selection_does_not_evaluate_current_row_group_twice() { - let data = &FALLBACK_TEST_FILE_DATA; + fn test_decoder_auto_cost_model_with_row_selection_does_not_evaluate_current_row_group_twice() { + let data = &COST_MODEL_TEST_FILE_DATA; let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); @@ -1373,7 +1381,7 @@ mod test { assert_eq!( predicate_rows.load(Ordering::Relaxed), 50, - "fallback observation must not re-run the predicate for the same row group" + "cost-model observation must not re-run the predicate for the same row group" ); assert_eq!(batch, expected_c_every_other(0, 100)); @@ -1381,9 +1389,9 @@ mod test { assert_eq!(predicate_rows.load(Ordering::Relaxed), 150); assert_eq!(batch, TEST_BATCH.slice(100, 100).project(&[2]).unwrap()); - assert_eq!(metrics.fallback_observed_row_group_count(), Some(1)); - assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(1)); - assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(1)); } #[test] @@ -1491,11 +1499,11 @@ mod test { ); } - /// Auto post-filter fallback is disabled for `LIMIT` because the limit is + /// Auto post-filter cost modeling is disabled for `LIMIT` because the limit is /// applied during row-group planning. Limit scans should therefore avoid - /// fallback observation bookkeeping entirely. + /// cost-model observation bookkeeping entirely. 
#[test] - fn test_decoder_filter_with_limit_skips_auto_fallback_observation() { + fn test_decoder_filter_with_limit_skips_auto_cost_model_observation() { let builder = ParquetPushDecoderBuilder::try_new_decoder(test_file_parquet_metadata()).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); @@ -1530,9 +1538,9 @@ mod test { assert_eq!(batch, expected); expect_finished(decoder.try_decode()); - assert_eq!(metrics.fallback_observed_row_group_count(), Some(0)); - assert_eq!(metrics.fallback_pushdown_row_group_count(), Some(0)); - assert_eq!(metrics.fallback_post_filter_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); } /// Once the limit has been satisfied by a prior row group, subsequent @@ -1849,6 +1857,60 @@ mod test { expect_finished(decoder.try_decode()); } + #[test] + #[ignore = "local profiling aid for row-filter phase breakdowns"] + fn profile_utf8_view_row_filter_phases() { + const TOTAL_ROWS: usize = 500_000; + const ROW_GROUP_SIZE: usize = 100_000; + + let parquet_file = Bytes::from(write_utf8_profile_parquet_file(TOTAL_ROWS, ROW_GROUP_SIZE)); + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + for (name, policy) in [ + ("auto", RowSelectionPolicy::default()), + ("mask", RowSelectionPolicy::Mask), + ("selectors", RowSelectionPolicy::Selectors), + ] { + let reader = ProfileInMemoryReader::try_new(&parquet_file).unwrap(); + let schema_descr = reader.metadata().file_metadata().schema_descr(); + let projection = ProjectionMask::roots(schema_descr, [0, 1, 2, 3]); + let predicate_projection = ProjectionMask::roots(schema_descr, [2]); + let row_filter = RowFilter::new(vec![Box::new(ArrowPredicateFn::new( + predicate_projection, + |batch| { + let array = 
batch.column(batch.schema().index_of("utf8View")?); + neq(array, &StringViewArray::new_scalar("")) + }, + ))]); + let metrics = ArrowReaderMetrics::enabled_with_phase_profile(); + + runtime.block_on(async { + let mut stream = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .with_batch_size(8192) + .with_projection(projection) + .with_row_filter(row_filter) + .with_row_selection_policy(policy) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + let mut rows = 0; + while let Some(batch) = stream.next().await { + rows += batch.unwrap().num_rows(); + } + assert!(rows > 0 && rows < TOTAL_ROWS); + }); + + println!("phase profile: {name}"); + println!("{}", metrics.phase_profile_report().unwrap()); + } + } + /// Returns a batch with 400 rows, with 3 columns: "a", "b", "c" /// /// Note c is a different types (so the data page sizes will be different) @@ -1893,7 +1955,7 @@ mod test { /// c | "string_300".."string_399" | 2 | 1 static TEST_FILE_DATA: LazyLock = LazyLock::new(|| write_test_file(200, 100)); - static FALLBACK_TEST_FILE_DATA: LazyLock = LazyLock::new(|| write_test_file(100, 50)); + static COST_MODEL_TEST_FILE_DATA: LazyLock = LazyLock::new(|| write_test_file(100, 50)); fn write_test_file(max_row_group_row_count: usize, data_page_row_count_limit: usize) -> Bytes { let input_batch = &TEST_BATCH; @@ -1919,6 +1981,115 @@ mod test { Bytes::from(output) } + fn write_utf8_profile_parquet_file(total_rows: usize, row_group_size: usize) -> Vec { + let batch = create_utf8_profile_batch(total_rows); + let props = WriterProperties::builder() + .set_compression(crate::basic::Compression::SNAPPY) + .set_max_row_group_row_count(Some(row_group_size)) + .build(); + let mut buffer = vec![]; + let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + buffer + } + + fn create_utf8_profile_batch(size: usize) -> RecordBatch { + let schema = 
Arc::new(Schema::new(vec![ + Field::new("int64", DataType::Int64, false), + Field::new("float64", DataType::Float64, false), + Field::new("utf8View", DataType::Utf8View, true), + Field::new("ts", DataType::Int64, false), + ])); + + let int64 = Arc::new(Int64Array::from_iter_values(0..size as i64)) as ArrayRef; + let float64 = Arc::new(arrow_array::Float64Array::from_iter_values( + (0..size).map(|i| (i % 100) as f64), + )) as ArrayRef; + let utf8 = create_profile_utf8_view_array(size); + let ts = Arc::new(Int64Array::from_iter_values( + (0..size).map(|i| (i % 10_000) as i64), + )) as ArrayRef; + + RecordBatch::try_new(schema, vec![int64, float64, utf8, ts]).unwrap() + } + + fn create_profile_utf8_view_array(size: usize) -> ArrayRef { + const AVG_RUN_LENGTH: usize = 4; + const EMPTY_DENSITY: u32 = 85; + + let mut builder = StringViewBuilder::with_capacity(size); + let mut rng = StdRng::seed_from_u64(44); + while builder.len() < size { + let mut run_length = rng.random_range(1..AVG_RUN_LENGTH); + if builder.len() + run_length > size { + run_length = size - builder.len(); + } + + if rng.random_range(0..100) < EMPTY_DENSITY { + for _ in 0..run_length { + builder.append_value(""); + } + } else { + for _ in 0..run_length { + builder.append_value(random_profile_string(&mut rng)); + } + } + } + Arc::new(builder.finish()) as ArrayRef + } + + fn random_profile_string(rng: &mut StdRng) -> String { + let charset = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + let len = if rng.random_bool(0.5) { + rng.random_range(13..21) + } else { + rng.random_range(3..12) + }; + (0..len) + .map(|_| charset[rng.random_range(0..charset.len())] as char) + .collect() + } + + #[derive(Debug, Clone)] + struct ProfileInMemoryReader { + inner: Bytes, + metadata: Arc, + } + + impl ProfileInMemoryReader { + fn try_new(inner: &Bytes) -> crate::errors::Result { + let mut metadata_reader = + ParquetMetaDataReader::new().with_page_index_policy(PageIndexPolicy::Required); + 
metadata_reader.try_parse(inner)?; + let metadata = metadata_reader.finish().map(Arc::new)?; + + Ok(Self { + inner: inner.clone(), + metadata, + }) + } + + fn metadata(&self) -> &Arc { + &self.metadata + } + } + + impl AsyncFileReader for ProfileInMemoryReader { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, crate::errors::Result> { + let data = self.inner.slice(range.start as usize..range.end as usize); + async move { Ok(data) }.boxed() + } + + fn get_metadata<'a>( + &'a mut self, + _options: Option<&'a ArrowReaderOptions>, + ) -> BoxFuture<'a, crate::errors::Result>> { + let metadata = Arc::clone(&self.metadata); + async move { Ok(metadata) }.boxed() + } + } + /// Return the length of [`TEST_FILE_DATA`], in bytes fn test_file_len() -> u64 { TEST_FILE_DATA.len() as u64 diff --git a/parquet/src/arrow/push_decoder/reader_builder/fallback.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs similarity index 63% rename from parquet/src/arrow/push_decoder/reader_builder/fallback.rs rename to parquet/src/arrow/push_decoder/reader_builder/cost_model.rs index 9e050223fff7..cf4b440c1b4b 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/fallback.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! Runtime post-filter fallback decisions for push decoder row groups. +//! Runtime post-filter cost decisions for push decoder row groups. //! -//! The fallback is intentionally adaptive rather than purely static. The first +//! The cost model is intentionally adaptive rather than purely static. The first //! eligible row group is evaluated with predicate pushdown so the reader can //! observe the actual `RowSelection` shape produced by the predicate chain. //! Later row groups may then switch to post-filter execution if the observed @@ -31,10 +31,10 @@ //! | //! +-- pushdown still preferred ------> UsePushdown //! | -//! 
+-- fallback trigger + supported --> UsePostFilter +//! +-- post-filter preferred + supported --> UsePostFilter //! ``` //! -//! Fallback only applies to `Auto`. Explicit `Mask` and `Selectors` are treated +//! The cost model only applies to `Auto`. Explicit `Mask` and `Selectors` are treated //! as user intent and are not overridden here. use super::{RowBudget, RowGroupReaderBuilder}; @@ -42,31 +42,32 @@ use crate::arrow::ProjectionMask; use crate::arrow::arrow_reader::RowFilter; use crate::arrow::arrow_reader::RowSelectionPolicy; use crate::arrow::arrow_reader::selection::{ - FallbackObservation, FallbackTriggerReason, RowSelectionShape, RowSelectionStrategyDecision, + CostModelDecisionReason, CostModelObservation, RowSelectionShape, RowSelectionStrategyDecision, }; use crate::arrow::schema::{ParquetField, ParquetFieldType}; +use crate::basic::Type as PhysicalType; #[allow(dead_code)] #[derive(Debug)] -pub(super) enum RowGroupFallbackState { +pub(super) enum RowGroupCostModelState { /// Collect row-selection shape from early row groups before choosing a mode. - Observing { observation: FallbackObservation }, + Observing { observation: CostModelObservation }, /// Predicate pushdown remains the execution mode for this reader. UsePushdown, /// Later row groups should decode once and evaluate predicates after decode. - UsePostFilter { reason: FallbackTriggerReason }, + UsePostFilter { reason: CostModelDecisionReason }, } -impl Default for RowGroupFallbackState { +impl Default for RowGroupCostModelState { fn default() -> Self { Self::Observing { - observation: FallbackObservation::default(), + observation: CostModelObservation::default(), } } } impl RowGroupReaderBuilder { - pub(super) fn should_use_post_filter_fallback(&self, budget: RowBudget) -> bool { + pub(super) fn should_use_post_filter_by_cost(&self, budget: RowBudget) -> bool { // Keep the runtime switch narrow: // // * `Auto` means the caller allowed the reader to choose. 
@@ -75,9 +76,9 @@ impl RowGroupReaderBuilder { // * virtual columns are not read from Parquet pages and need their // existing projection path. matches!( - self.fallback_state, - RowGroupFallbackState::UsePostFilter { .. } - ) && self.post_filter_fallback_enabled + self.cost_model_state, + RowGroupCostModelState::UsePostFilter { .. } + ) && self.post_filter_cost_model_enabled && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) && budget.is_unbounded() && !self.has_virtual_columns() @@ -88,13 +89,42 @@ impl RowGroupReaderBuilder { filter: &RowFilter, budget: RowBudget, ) -> Option { - if !self.should_use_post_filter_fallback(budget) { + if !self.should_use_post_filter_by_cost(budget) { return None; } self.build_post_filter_read_projection(filter) } + pub(super) fn post_filter_read_projection_for_filter( + &self, + filter: &RowFilter, + budget: RowBudget, + ) -> Option { + if !self.post_filter_supports_filter(filter, budget) { + return None; + } + + self.build_post_filter_read_projection(filter) + } + + pub(super) fn should_start_with_post_filter_for_predicate_cost( + &self, + filter: &RowFilter, + row_group_idx: usize, + budget: RowBudget, + ) -> bool { + if !self.post_filter_supports_filter(filter, budget) { + return false; + } + + let Some(predicate_projection) = filter.union_projection() else { + return false; + }; + + self.projection_has_variable_width_leaf(row_group_idx, &predicate_projection) + } + fn build_post_filter_read_projection(&self, filter: &RowFilter) -> Option { // Post-filter execution decodes each row once, so it needs both: // @@ -116,15 +146,27 @@ impl RowGroupReaderBuilder { fn post_filter_supports_projection(&self, projection: &ProjectionMask) -> bool { // The post-filter reader currently projects record batches by parquet // leaf column position. Nested roots can span multiple leaves and need - // the existing array-reader projection machinery, so keep fallback to - // primitive roots only. 
+ // the existing array-reader projection machinery, so allow the + // post-filter cost path for primitive roots only. let schema = self.metadata.file_metadata().schema_descr(); (0..schema.num_columns()).all(|leaf_idx| { !projection.leaf_included(leaf_idx) || schema.get_column_root(leaf_idx).is_primitive() }) } - pub(super) fn observe_fallback_candidate( + fn projection_has_variable_width_leaf( + &self, + row_group_idx: usize, + projection: &ProjectionMask, + ) -> bool { + let row_group = self.metadata.row_group(row_group_idx); + (0..row_group.num_columns()).any(|leaf_idx| { + projection.leaf_included(leaf_idx) + && row_group.column(leaf_idx).column_type() == PhysicalType::BYTE_ARRAY + }) + } + + pub(super) fn observe_cost_model_candidate( &mut self, decision: RowSelectionStrategyDecision, row_count: usize, @@ -134,14 +176,14 @@ impl RowGroupReaderBuilder { return; } - let RowGroupFallbackState::Observing { observation } = &mut self.fallback_state else { + let RowGroupCostModelState::Observing { observation } = &mut self.cost_model_state else { return; }; let mut shape = decision.shape; if shape.total_rows() == 0 { // `None` selection means the predicate kept the whole row group. - // Represent it as one selected run so the fallback classifier can + // Represent it as one selected run so the cost model can // treat "no pruning" as an observed high-selectivity case. 
shape = RowSelectionShape { selected_rows: row_count, @@ -154,29 +196,33 @@ impl RowGroupReaderBuilder { observation.observed_row_groups += 1; observation.shape.add_assign(shape); - self.metrics.record_fallback_observed_row_group(); + self.metrics.record_cost_model_observed_row_group(); let reason = observation.trigger_reason(); - if matches!(reason, FallbackTriggerReason::ObservationIncomplete) { - self.metrics.record_fallback_trigger(reason); + if matches!(reason, CostModelDecisionReason::ObservationIncomplete) { + self.metrics.record_cost_model_trigger(reason); return; } - let should_fallback = observation.should_fallback(); - self.metrics.record_fallback_trigger(reason); + let prefers_post_filter = observation.prefers_post_filter(); + self.metrics.record_cost_model_trigger(reason); - if should_fallback && self.post_filter_fallback_supported(budget) { - self.fallback_state = RowGroupFallbackState::UsePostFilter { reason }; + if prefers_post_filter && self.post_filter_cost_model_supported(budget) { + self.cost_model_state = RowGroupCostModelState::UsePostFilter { reason }; } else { - self.fallback_state = RowGroupFallbackState::UsePushdown; + self.cost_model_state = RowGroupCostModelState::UsePushdown; } } - pub(super) fn post_filter_fallback_supported(&self, budget: RowBudget) -> bool { + pub(super) fn post_filter_cost_model_supported(&self, budget: RowBudget) -> bool { let Some(filter) = self.filter.as_ref() else { return false; }; - self.post_filter_fallback_enabled + self.post_filter_supports_filter(filter, budget) + } + + fn post_filter_supports_filter(&self, filter: &RowFilter, budget: RowBudget) -> bool { + self.post_filter_cost_model_enabled && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. 
}) && budget.is_unbounded() && !self.has_virtual_columns() diff --git a/parquet/src/arrow/push_decoder/reader_builder/data.rs b/parquet/src/arrow/push_decoder/reader_builder/data.rs index 498f47981864..04b048ac5763 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/data.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/data.rs @@ -114,7 +114,7 @@ impl DataRequest { /// Return previously loaded column chunks if they are all dense. /// /// Sparse chunks may only contain pages for the predicate selection and are - /// unsafe to reuse for a fallback read over the base selection. + /// unsafe to reuse for a post-filter read over the base selection. pub fn into_dense_column_chunks(self) -> Option>>> { self.column_chunks .iter() diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index fb196d32e24b..8ac1452e4ea6 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -15,22 +15,22 @@ // specific language governing permissions and limitations // under the License. 
+mod cost_model; mod data; -mod fallback; mod filter; mod selection_policy; use crate::arrow::ProjectionMask; use crate::arrow::array_reader::{ArrayReaderBuilder, CacheOptions, RowGroupCache}; -use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; +use crate::arrow::arrow_reader::metrics::{ArrowReaderMetrics, ArrowReaderPhase}; use crate::arrow::arrow_reader::selection::RowGroupExecutionMode; use crate::arrow::arrow_reader::{ ParquetRecordBatchReader, PredicateOptions, ReadPlanBuilder, RowFilter, RowSelection, RowSelectionPolicy, RowSelector, }; use crate::arrow::in_memory_row_group::ColumnChunkData; +use crate::arrow::push_decoder::reader_builder::cost_model::RowGroupCostModelState; use crate::arrow::push_decoder::reader_builder::data::DataRequestBuilder; -use crate::arrow::push_decoder::reader_builder::fallback::RowGroupFallbackState; use crate::arrow::push_decoder::reader_builder::filter::CacheInfo; use crate::arrow::push_decoder::reader_builder::selection_policy::{ ExpensiveOutputProfile, resolve_selection_policy_for_expensive_output, @@ -61,7 +61,7 @@ struct RowGroupInfo { budget: RowBudget, } -enum FallbackTransition { +enum CostModelTransition { ContinuePushdown, StartPostSelection { selection: RowSelection }, EnablePostFilter, @@ -102,7 +102,7 @@ enum RowGroupDecoderState { filter: Arc>, }, /// Needs data to read the row group once and apply an already-computed - /// fallback selection after decode. + /// selection after decode. WaitingOnPostSelectionData { row_group_info: RowGroupInfo, data_request: DataRequest, @@ -278,7 +278,7 @@ pub(crate) struct RowGroupReaderBuilder { /// Optional filter filter: Option, - /// Predicate state reused by later row groups once Auto fallback switches to post-filter. + /// Predicate state reused by later row groups once Auto chooses post-filter. 
post_filter: Option>>, /// The size in bytes of the predicate cache to use @@ -292,11 +292,11 @@ pub(crate) struct RowGroupReaderBuilder { /// Strategy for materialising row selections row_selection_policy: RowSelectionPolicy, - /// Row-group-local fallback state used by Auto policy. - fallback_state: RowGroupFallbackState, + /// Row-group-local cost-model state used by Auto policy. + cost_model_state: RowGroupCostModelState, - /// Whether this builder may switch Auto policy to post-filter fallback. - post_filter_fallback_enabled: bool, + /// Whether this builder may switch Auto policy to post-filter by cost. + post_filter_cost_model_enabled: bool, /// Current state of the decoder. /// @@ -332,8 +332,8 @@ impl RowGroupReaderBuilder { metrics, max_predicate_cache_size, row_selection_policy, - fallback_state: RowGroupFallbackState::default(), - post_filter_fallback_enabled: true, + cost_model_state: RowGroupCostModelState::default(), + post_filter_cost_model_enabled: true, state: Some(RowGroupDecoderState::Finished), buffers, } @@ -354,10 +354,10 @@ impl RowGroupReaderBuilder { self.buffers.clear_all_ranges(); } - /// Disable post-filter fallback for APIs that hand row-group readers back to + /// Disable post-filter cost modeling for APIs that hand row-group readers back to /// callers before they are consumed. 
- pub(crate) fn disable_post_filter_fallback(&mut self) { - self.post_filter_fallback_enabled = false; + pub(crate) fn disable_post_filter_cost_model(&mut self) { + self.post_filter_cost_model_enabled = false; } fn ensure_post_filter_state(&mut self) -> Result<(), ParquetError> { @@ -366,22 +366,27 @@ impl RowGroupReaderBuilder { } let filter = self.filter.take().ok_or_else(|| { - ParquetError::General("post-filter fallback selected without a row filter".to_string()) + ParquetError::General( + "post-filter cost model selected without a row filter".to_string(), + ) })?; self.post_filter = Some(Arc::new(Mutex::new(filter))); Ok(()) } - fn resolve_fallback_transition( + fn resolve_cost_model_transition( &mut self, row_group_info: &RowGroupInfo, cache_info: Option<&CacheInfo>, - ) -> Result { + ) -> Result { if cache_info.is_none() - || !matches!(self.fallback_state, RowGroupFallbackState::Observing { .. }) - || !self.post_filter_fallback_supported(row_group_info.budget) + || !matches!( + self.cost_model_state, + RowGroupCostModelState::Observing { .. } + ) + || !self.post_filter_cost_model_supported(row_group_info.budget) { - return Ok(FallbackTransition::ContinuePushdown); + return Ok(CostModelTransition::ContinuePushdown); } let decision = row_group_info @@ -389,28 +394,32 @@ impl RowGroupReaderBuilder { .resolve_selection_strategy_decision(); let observed_selection = row_group_info.plan_builder.selection().cloned(); - self.observe_fallback_candidate(decision, row_group_info.row_count, row_group_info.budget); + self.observe_cost_model_candidate( + decision, + row_group_info.row_count, + row_group_info.budget, + ); if matches!( - self.fallback_state, - RowGroupFallbackState::UsePostFilter { .. } + self.cost_model_state, + RowGroupCostModelState::UsePostFilter { .. 
} ) { if row_group_info.base_selection.is_none() { let selection = observed_selection.unwrap_or_else(|| { RowSelection::from(vec![RowSelector::select(row_group_info.row_count)]) }); - return Ok(FallbackTransition::StartPostSelection { selection }); + return Ok(CostModelTransition::StartPostSelection { selection }); } self.ensure_post_filter_state()?; self.metrics - .record_fallback_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); - return Ok(FallbackTransition::EnablePostFilter); + .record_cost_model_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); + return Ok(CostModelTransition::EnablePostFilter); } self.metrics - .record_fallback_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); - Ok(FallbackTransition::ContinuePushdown) + .record_cost_model_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); + Ok(CostModelTransition::ContinuePushdown) } /// take the current state, leaving None in its place. @@ -538,7 +547,17 @@ impl RowGroupReaderBuilder { })); }; - if self.should_use_post_filter_fallback(row_group_info.budget) { + if self.should_start_with_post_filter_for_predicate_cost( + &filter, + row_group_info.row_group_idx, + row_group_info.budget, + ) { + let filter = Arc::new(Mutex::new(filter)); + self.post_filter = Some(Arc::clone(&filter)); + return self.start_post_filter(row_group_info, filter); + } + + if self.should_use_post_filter_by_cost(row_group_info.budget) { if self .post_filter_read_projection(&filter, row_group_info.budget) .is_some() @@ -548,7 +567,7 @@ impl RowGroupReaderBuilder { return self.start_post_filter(row_group_info, filter); } - self.fallback_state = RowGroupFallbackState::UsePushdown; + self.cost_model_state = RowGroupCostModelState::UsePushdown; } // we have predicates to evaluate @@ -601,18 +620,22 @@ impl RowGroupReaderBuilder { // need to fetch pages the column needs for decoding, figure // that out based on the current selection and projection - let data_request = 
DataRequestBuilder::new( - row_group_idx, - row_count, - self.batch_size, - &self.metadata, - predicate.projection(), // use the predicate's projection - ) - .with_selection(plan_builder.selection()) - // Fetch predicate columns; expand selection only for cached predicate columns - .with_cache_projection(Some(filter_info.cache_projection())) - .with_column_chunks(column_chunks) - .build(); + let data_request = + self.metrics + .time_phase(ArrowReaderPhase::PredicateRangePlanning, || { + DataRequestBuilder::new( + row_group_idx, + row_count, + self.batch_size, + &self.metadata, + predicate.projection(), // use the predicate's projection + ) + .with_selection(plan_builder.selection()) + // Fetch predicate columns; expand selection only for cached predicate columns + .with_cache_projection(Some(filter_info.cache_projection())) + .with_column_chunks(column_chunks) + .build() + }); let row_group_info = RowGroupInfo { row_group_idx, @@ -699,7 +722,8 @@ impl RowGroupReaderBuilder { // early termination when this is the final predicate and an // output limit was set. 
let mut predicate_options = - PredicateOptions::new(array_reader, filter_info.current_mut()); + PredicateOptions::new(array_reader, filter_info.current_mut()) + .with_metrics(self.metrics.clone()); if let Some(limit) = predicate_limit { predicate_options = predicate_options.with_limit(limit, row_count); } @@ -774,31 +798,39 @@ impl RowGroupReaderBuilder { )); } - let data_request = DataRequestBuilder::new( - row_group_idx, - row_count, - self.batch_size, - &self.metadata, - &self.projection, - ) - .with_selection(plan_builder.selection()) - .with_column_chunks(column_chunks) - // Final projection fetch shouldn't expand selection for cache - // so don't call with_cache_projection here - .build(); + let data_request = + self.metrics + .time_phase(ArrowReaderPhase::OutputRangePlanning, || { + DataRequestBuilder::new( + row_group_idx, + row_count, + self.batch_size, + &self.metadata, + &self.projection, + ) + .with_selection(plan_builder.selection()) + .with_column_chunks(column_chunks) + // Final projection fetch shouldn't expand selection for cache + // so don't call with_cache_projection here + .build() + }); plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); - plan_builder = resolve_selection_policy_for_expensive_output( - plan_builder, - &self.projection, - self.row_group_offset_index(row_group_idx), - row_count, - ExpensiveOutputProfile::from_row_group( - self.metadata.row_group(row_group_idx), - &self.projection, - row_count, - ), - ); + plan_builder = + self.metrics + .time_phase(ArrowReaderPhase::OutputSelectionResolve, || { + resolve_selection_policy_for_expensive_output( + plan_builder, + &self.projection, + self.row_group_offset_index(row_group_idx), + row_count, + ExpensiveOutputProfile::from_row_group( + self.metadata.row_group(row_group_idx), + &self.projection, + row_count, + ), + ) + }); let row_group_info = RowGroupInfo { row_group_idx, @@ -862,10 +894,11 @@ impl RowGroupReaderBuilder { 
self.metadata.file_metadata().schema_descr(), &read_projection, &self.projection, + self.metrics.clone(), )?; self.metrics - .record_fallback_row_group(RowGroupExecutionMode::PostFilter); + .record_cost_model_row_group(RowGroupExecutionMode::PostFilter); NextState::result( RowGroupDecoderState::Finished, @@ -928,10 +961,11 @@ impl RowGroupReaderBuilder { array_reader, plan, selection, + self.metrics.clone(), ); self.metrics - .record_fallback_row_group(RowGroupExecutionMode::PostFilter); + .record_cost_model_row_group(RowGroupExecutionMode::PostFilter); NextState::result( RowGroupDecoderState::Finished, @@ -947,10 +981,10 @@ impl RowGroupReaderBuilder { data_request, cache_info, } => { - match self.resolve_fallback_transition(&row_group_info, cache_info.as_ref())? { - FallbackTransition::ContinuePushdown | FallbackTransition::EnablePostFilter => { - } - FallbackTransition::StartPostSelection { selection } => { + match self.resolve_cost_model_transition(&row_group_info, cache_info.as_ref())? { + CostModelTransition::ContinuePushdown + | CostModelTransition::EnablePostFilter => {} + CostModelTransition::StartPostSelection { selection } => { let column_chunks = data_request.into_dense_column_chunks(); // The current row group already computed a pushdown selection. Apply that // selection after decode instead of evaluating the predicates again. 
@@ -1012,7 +1046,11 @@ impl RowGroupReaderBuilder { .build_array_reader(self.fields.as_deref(), &self.projection) }?; - let reader = ParquetRecordBatchReader::new(array_reader, plan); + let reader = ParquetRecordBatchReader::new_with_metrics( + array_reader, + plan, + self.metrics.clone(), + ); NextState::result( RowGroupDecoderState::Finished, RowGroupBuildResult::Data { @@ -1060,36 +1098,44 @@ impl RowGroupReaderBuilder { let filter = filter.lock().map_err(|_| { ParquetError::General("post-filter predicate state was poisoned".to_string()) })?; - self.post_filter_read_projection(&filter, budget) + self.post_filter_read_projection_for_filter(&filter, budget) .ok_or_else(|| { ParquetError::General( - "post-filter fallback selected an unsupported projection".to_string(), + "post-filter cost model selected an unsupported projection".to_string(), ) })? }; - let data_request = DataRequestBuilder::new( - row_group_idx, - row_count, - self.batch_size, - &self.metadata, - &read_projection, - ) - .with_selection(plan_builder.selection()) - .build(); + let data_request = self + .metrics + .time_phase(ArrowReaderPhase::OutputRangePlanning, || { + DataRequestBuilder::new( + row_group_idx, + row_count, + self.batch_size, + &self.metadata, + &read_projection, + ) + .with_selection(plan_builder.selection()) + .build() + }); plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); - plan_builder = resolve_selection_policy_for_expensive_output( - plan_builder, - &read_projection, - self.row_group_offset_index(row_group_idx), - row_count, - ExpensiveOutputProfile::from_row_group( - self.metadata.row_group(row_group_idx), - &read_projection, - row_count, - ), - ); + plan_builder = self + .metrics + .time_phase(ArrowReaderPhase::OutputSelectionResolve, || { + resolve_selection_policy_for_expensive_output( + plan_builder, + &read_projection, + self.row_group_offset_index(row_group_idx), + row_count, + ExpensiveOutputProfile::from_row_group( + 
self.metadata.row_group(row_group_idx), + &read_projection, + row_count, + ), + ) + }); let row_group_info = RowGroupInfo { row_group_idx, @@ -1128,16 +1174,20 @@ impl RowGroupReaderBuilder { .with_selection(base_selection) .with_row_selection_policy(self.row_selection_policy); - let data_request = DataRequestBuilder::new( - row_group_idx, - row_count, - self.batch_size, - &self.metadata, - &self.projection, - ) - .with_selection(plan_builder.selection()) - .with_column_chunks(column_chunks) - .build(); + let data_request = self + .metrics + .time_phase(ArrowReaderPhase::OutputRangePlanning, || { + DataRequestBuilder::new( + row_group_idx, + row_count, + self.batch_size, + &self.metadata, + &self.projection, + ) + .with_selection(plan_builder.selection()) + .with_column_chunks(column_chunks) + .build() + }); let row_group_info = RowGroupInfo { row_group_idx, diff --git a/parquet/src/arrow/push_decoder/remaining.rs b/parquet/src/arrow/push_decoder/remaining.rs index c65b0d9c62fe..688d81d58bc6 100644 --- a/parquet/src/arrow/push_decoder/remaining.rs +++ b/parquet/src/arrow/push_decoder/remaining.rs @@ -228,10 +228,11 @@ impl RemainingRowGroups { self.row_group_reader_builder.clear_all_ranges(); } - /// Prevent Auto selection from switching to post-filter fallback for reader + /// Prevent Auto selection from switching to post-filter by cost for reader /// handoff APIs. 
- pub(crate) fn disable_post_filter_fallback(&mut self) { - self.row_group_reader_builder.disable_post_filter_fallback(); + pub(crate) fn disable_post_filter_cost_model(&mut self) { + self.row_group_reader_builder + .disable_post_filter_cost_model(); } /// returns [`ParquetRecordBatchReader`] suitable for reading the next From 55341f4a6d2d3e31c7f2d15f2a95e2d13cc73ed0 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sun, 17 May 2026 14:46:50 +0800 Subject: [PATCH 11/32] fix(parquet): gate row filter profiling by async feature --- parquet/src/arrow/arrow_reader/metrics.rs | 8 +++--- parquet/src/arrow/push_decoder/mod.rs | 34 ++++++++++++++++++----- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/metrics.rs b/parquet/src/arrow/arrow_reader/metrics.rs index 382e26cb5828..0ca8cbf7a316 100644 --- a/parquet/src/arrow/arrow_reader/metrics.rs +++ b/parquet/src/arrow/arrow_reader/metrics.rs @@ -44,7 +44,7 @@ pub(crate) enum ArrowReaderPhase { impl ArrowReaderPhase { const COUNT: usize = 13; - #[cfg(test)] + #[cfg(all(test, feature = "async"))] const ALL: [Self; Self::COUNT] = [ Self::PredicateRangePlanning, Self::PredicateDecode, @@ -79,7 +79,7 @@ impl ArrowReaderPhase { } } - #[cfg(test)] + #[cfg(all(test, feature = "async"))] fn name(self) -> &'static str { match self { Self::PredicateRangePlanning => "predicate_range_planning", @@ -127,7 +127,7 @@ impl ArrowReaderMetrics { Self::Enabled(Arc::new(ArrowReaderMetricsInner::new(false))) } - #[cfg(test)] + #[cfg(all(test, feature = "async"))] pub(crate) fn enabled_with_phase_profile() -> Self { Self::Enabled(Arc::new(ArrowReaderMetricsInner::new(true))) } @@ -428,7 +428,7 @@ impl ArrowReaderMetrics { result } - #[cfg(test)] + #[cfg(all(test, feature = "async"))] pub(crate) fn phase_profile_report(&self) -> Option { let Self::Enabled(inner) = self else { return None; diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 
a6c8b84a4eaf..8e5fa1e4987e 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -621,30 +621,42 @@ impl ParquetDecoderState { mod test { use super::*; use crate::DecodeResult; + #[cfg(feature = "async")] + use crate::arrow::ParquetRecordBatchStreamBuilder; + #[cfg(feature = "async")] + use crate::arrow::arrow_reader::ArrowReaderOptions; use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; use crate::arrow::arrow_reader::{ - ArrowPredicateFn, ArrowReaderOptions, ParquetRecordBatchReader, RowFilter, RowSelection, - RowSelectionPolicy, RowSelector, + ArrowPredicateFn, ParquetRecordBatchReader, RowFilter, RowSelection, RowSelectionPolicy, + RowSelector, }; + #[cfg(feature = "async")] use crate::arrow::async_reader::AsyncFileReader; use crate::arrow::push_decoder::{ParquetPushDecoder, ParquetPushDecoderBuilder}; - use crate::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask}; + use crate::arrow::{ArrowWriter, ProjectionMask}; use crate::errors::ParquetError; - use crate::file::metadata::{ - PageIndexPolicy, ParquetMetaData, ParquetMetaDataPushDecoder, ParquetMetaDataReader, - }; + use crate::file::metadata::ParquetMetaDataPushDecoder; + #[cfg(feature = "async")] + use crate::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader}; use crate::file::properties::WriterProperties; - use arrow::compute::kernels::cmp::{gt, lt, neq}; + #[cfg(feature = "async")] + use arrow::compute::kernels::cmp::neq; + use arrow::compute::kernels::cmp::{gt, lt}; + #[cfg(feature = "async")] use arrow_array::builder::{ArrayBuilder, StringViewBuilder}; use arrow_array::cast::AsArray; use arrow_array::types::Int64Type; use arrow_array::{ArrayRef, BooleanArray, Int64Array, RecordBatch, StringViewArray}; + #[cfg(feature = "async")] use arrow_schema::{DataType, Field, Schema}; use arrow_select::concat::concat_batches; use arrow_select::filter::filter_record_batch; use bytes::Bytes; + #[cfg(feature = "async")] 
use futures::future::BoxFuture; + #[cfg(feature = "async")] use futures::{FutureExt, StreamExt}; + #[cfg(feature = "async")] use rand::{Rng, SeedableRng, rngs::StdRng}; use std::fmt::Debug; use std::ops::Range; @@ -1857,6 +1869,7 @@ mod test { expect_finished(decoder.try_decode()); } + #[cfg(feature = "async")] #[test] #[ignore = "local profiling aid for row-filter phase breakdowns"] fn profile_utf8_view_row_filter_phases() { @@ -1981,6 +1994,7 @@ mod test { Bytes::from(output) } + #[cfg(feature = "async")] fn write_utf8_profile_parquet_file(total_rows: usize, row_group_size: usize) -> Vec { let batch = create_utf8_profile_batch(total_rows); let props = WriterProperties::builder() @@ -1994,6 +2008,7 @@ mod test { buffer } + #[cfg(feature = "async")] fn create_utf8_profile_batch(size: usize) -> RecordBatch { let schema = Arc::new(Schema::new(vec![ Field::new("int64", DataType::Int64, false), @@ -2014,6 +2029,7 @@ mod test { RecordBatch::try_new(schema, vec![int64, float64, utf8, ts]).unwrap() } + #[cfg(feature = "async")] fn create_profile_utf8_view_array(size: usize) -> ArrayRef { const AVG_RUN_LENGTH: usize = 4; const EMPTY_DENSITY: u32 = 85; @@ -2039,6 +2055,7 @@ mod test { Arc::new(builder.finish()) as ArrayRef } + #[cfg(feature = "async")] fn random_profile_string(rng: &mut StdRng) -> String { let charset = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; let len = if rng.random_bool(0.5) { @@ -2052,11 +2069,13 @@ mod test { } #[derive(Debug, Clone)] + #[cfg(feature = "async")] struct ProfileInMemoryReader { inner: Bytes, metadata: Arc, } + #[cfg(feature = "async")] impl ProfileInMemoryReader { fn try_new(inner: &Bytes) -> crate::errors::Result { let mut metadata_reader = @@ -2075,6 +2094,7 @@ mod test { } } + #[cfg(feature = "async")] impl AsyncFileReader for ProfileInMemoryReader { fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, crate::errors::Result> { let data = self.inner.slice(range.start as usize..range.end as usize); From 
235bf059f4ff9194a9e77127d772b1a3ba7a476e Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sun, 17 May 2026 16:42:26 +0800 Subject: [PATCH 12/32] bench(parquet): add focused row filter cost model cases --- parquet/benches/arrow_reader_row_filter.rs | 134 ++++++++++++++++++++- 1 file changed, 133 insertions(+), 1 deletion(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 46ab8551d36b..677059130205 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -207,7 +207,7 @@ fn write_parquet_file() -> Vec { /// ProjectionCase defines the projection mode for the benchmark: /// either projecting all columns or excluding the column that is used for filtering. -#[derive(Clone)] +#[derive(Clone, Copy)] enum ProjectionCase { AllColumns, ExcludeFilterColumn, @@ -751,6 +751,137 @@ fn benchmark_async_strategy_matrix(c: &mut Criterion) { } } +/// A small async-only matrix that isolates the cases most relevant to the +/// row-filter cost model. This is intentionally narrower than +/// [`benchmark_async_strategy_matrix`]: it keeps the benchmark output focused +/// on cases where `Auto` should either switch to post-filter execution or +/// explicitly keep predicate pushdown. 
+fn benchmark_async_cost_model_focus(c: &mut Criterion) { + let parquet_file = Bytes::from(write_parquet_file()); + let cases = [ + ( + "utf8_non_empty", + FilterType::Utf8ViewNonEmpty, + ProjectionCase::ExcludeFilterColumn, + ), + ( + "utf8_non_empty", + FilterType::Utf8ViewNonEmpty, + ProjectionCase::AllColumns, + ), + ( + "high_selectivity_float64", + FilterType::UnselectiveUnclustered, + ProjectionCase::ExcludeFilterColumn, + ), + ( + "high_selectivity_ts_clustered", + FilterType::UnselectiveClustered, + ProjectionCase::ExcludeFilterColumn, + ), + ( + "fragmented_int64_10pct", + FilterType::ModeratelySelectiveUnclustered, + ProjectionCase::ExcludeFilterColumn, + ), + ( + "selective_float64_1pct", + FilterType::SelectiveUnclustered, + ProjectionCase::ExcludeFilterColumn, + ), + ]; + let strategies = [ + AsyncStrategy::FullPostFilter, + AsyncStrategy::PushdownAutoCostModel, + AsyncStrategy::PushdownMask, + AsyncStrategy::PushdownSelectors, + ]; + + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let mut group = c.benchmark_group("arrow_reader_row_filter_async_cost_model_focus"); + + for (case_name, filter_type, projection_case) in cases { + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = metadata.file_metadata().schema_descr(); + let output_projection = output_projection_for(filter_type, &projection_case); + let read_projection = full_post_filter_read_projection(filter_type, &output_projection); + let output_column_names = projection_names(&output_projection); + let projection_mask = ProjectionMask::roots(schema_descr, output_projection); + let read_projection_mask = ProjectionMask::roots(schema_descr, read_projection); + let pred_mask = ProjectionMask::roots( + schema_descr, + filter_type.filter_projection().iter().copied(), + ); + + for strategy in strategies { + let bench_id = BenchmarkId::new( + 
format!("{case_name}/{projection_case}"), + strategy.to_string(), + ); + let rt_captured = rt.handle().clone(); + + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let pred_mask = pred_mask.clone(); + let projection_mask = projection_mask.clone(); + let read_projection_mask = read_projection_mask.clone(); + let output_column_names = output_column_names.clone(); + + rt_captured.block_on(async { + match strategy { + AsyncStrategy::FullPostFilter => { + benchmark_async_reader_post_filter( + reader, + read_projection_mask, + output_column_names, + filter_type, + ) + .await + } + AsyncStrategy::PushdownAutoCostModel => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::default(), + ) + .await + } + AsyncStrategy::PushdownSelectors => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Selectors, + ) + .await + } + AsyncStrategy::PushdownMask => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Mask, + ) + .await + } + } + }) + }); + }); + } + } +} + fn output_projection_for(filter_type: FilterType, projection_case: &ProjectionCase) -> Vec { let filter_columns = filter_type.filter_projection(); COLUMN_NAMES @@ -1066,6 +1197,7 @@ criterion_group!( benchmark_filters_and_projections, benchmark_sync_strategy_matrix, benchmark_async_strategy_matrix, + benchmark_async_cost_model_focus, benchmark_filters_with_limit, ); criterion_main!(benches); From d03f9209c31d712c2d1760c5c753e3ac90050e19 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sun, 17 May 2026 18:31:46 +0800 Subject: [PATCH 13/32] Optimize post-filter selection resolve --- parquet/src/arrow/push_decoder/mod.rs | 57 +++++++++++++++++++ 
.../arrow/push_decoder/reader_builder/mod.rs | 34 +++++------ 2 files changed, 75 insertions(+), 16 deletions(-) diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 8e5fa1e4987e..7c7f2747b75a 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1202,6 +1202,48 @@ mod test { assert!(next_reader_with_data(&mut decoder, data).is_none()); } + #[test] + #[cfg(feature = "async")] + fn test_decoder_post_filter_without_base_selection_skips_output_selection_resolve() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled_with_phase_profile(); + + let row_filter_c = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["c"]), + move |batch: RecordBatch| Ok(BooleanArray::from(vec![true; batch.num_rows()])), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_c)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + TEST_BATCH + .slice(row_group_idx * 100, 100) + .project(&[0]) + .unwrap() + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + + let report = metrics.phase_profile_report().unwrap(); + assert_eq!(phase_profile_count(&report, "output_selection_resolve"), 0); + } + #[test] fn test_decoder_auto_cost_model_post_filter_applies_fragmented_filter() { let data = 
&COST_MODEL_TEST_FILE_DATA; @@ -2167,6 +2209,21 @@ mod test { decoder.push_ranges(ranges, data).unwrap(); } + #[cfg(feature = "async")] + fn phase_profile_count(report: &str, phase: &str) -> usize { + report + .lines() + .skip(1) + .find_map(|line| { + let mut fields = line.split(','); + let name = fields.next()?; + let _total_ms = fields.next()?; + let count = fields.next()?; + (name == phase).then(|| count.parse().unwrap()) + }) + .unwrap_or(0) + } + fn not_multiple_of_three_filter(batch: &RecordBatch) -> BooleanArray { let column = batch.column(0).as_primitive::(); BooleanArray::from( diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index 8ac1452e4ea6..26adcc2191f6 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -1120,22 +1120,24 @@ impl RowGroupReaderBuilder { .build() }); - plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); - plan_builder = self - .metrics - .time_phase(ArrowReaderPhase::OutputSelectionResolve, || { - resolve_selection_policy_for_expensive_output( - plan_builder, - &read_projection, - self.row_group_offset_index(row_group_idx), - row_count, - ExpensiveOutputProfile::from_row_group( - self.metadata.row_group(row_group_idx), - &read_projection, - row_count, - ), - ) - }); + if plan_builder.selection().is_some() { + plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); + plan_builder = + self.metrics + .time_phase(ArrowReaderPhase::OutputSelectionResolve, || { + resolve_selection_policy_for_expensive_output( + plan_builder, + &read_projection, + self.row_group_offset_index(row_group_idx), + row_count, + ExpensiveOutputProfile::from_row_group( + self.metadata.row_group(row_group_idx), + &read_projection, + row_count, + ), + ) + }); + } let row_group_info = RowGroupInfo { row_group_idx, From 61078c94f8590440bae8b16fe2a276c22c26a017 Mon Sep 
17 00:00:00 2001 From: Qiwei Huang Date: Sun, 17 May 2026 23:50:29 +0800 Subject: [PATCH 14/32] bench(parquet): add row filter cost model focus cases --- parquet/benches/arrow_reader_row_filter.rs | 318 ++++++++++++++------- 1 file changed, 212 insertions(+), 106 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 677059130205..9cdd46122c2a 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -60,7 +60,9 @@ use arrow::record_batch::RecordBatch; use arrow_array::StringViewArray; use arrow_array::builder::{ArrayBuilder, StringViewBuilder}; use bytes::Bytes; -use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use criterion::{ + BenchmarkGroup, BenchmarkId, Criterion, criterion_group, criterion_main, measurement::WallTime, +}; use futures::future::BoxFuture; use futures::{FutureExt, StreamExt}; use parquet::arrow::arrow_reader::{ @@ -190,11 +192,17 @@ const ROW_GROUP_SIZE: usize = 100_000; /// Writes the RecordBatch to an in memory buffer, returning the buffer fn write_parquet_file() -> Vec { - let batch = create_record_batch(TOTAL_ROWS); + write_parquet_file_with_rows(TOTAL_ROWS, ROW_GROUP_SIZE) +} + +/// Writes a RecordBatch with a configurable shape to an in memory buffer, +/// returning the buffer. 
+fn write_parquet_file_with_rows(total_rows: usize, row_group_size: usize) -> Vec { + let batch = create_record_batch(total_rows); let schema = batch.schema(); let props = WriterProperties::builder() .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(ROW_GROUP_SIZE)) + .set_max_row_group_row_count(Some(row_group_size)) .build(); let mut buffer = vec![]; { @@ -211,6 +219,8 @@ fn write_parquet_file() -> Vec { enum ProjectionCase { AllColumns, ExcludeFilterColumn, + FilterColumnsOnly, + Utf8Only, } impl std::fmt::Display for ProjectionCase { @@ -218,6 +228,8 @@ impl std::fmt::Display for ProjectionCase { match self { ProjectionCase::AllColumns => write!(f, "all_columns"), ProjectionCase::ExcludeFilterColumn => write!(f, "exclude_filter_column"), + ProjectionCase::FilterColumnsOnly => write!(f, "filter_columns_only"), + ProjectionCase::Utf8Only => write!(f, "utf8_only"), } } } @@ -369,6 +381,21 @@ enum FilterType { /// [ClickBench]: https://github.com/ClickHouse/ClickBench /// [Q21-Q27]: https://github.com/apache/datafusion/blob/b7177234e65cbbb2dcc04c252f6acd80bb026362/benchmarks/queries/clickbench/queries.sql#L22-L28 Utf8ViewNonEmpty, + /// Scalar-only part of ClickBench Q37: + /// + /// ```sql + /// WHERE CounterID = 62 + /// AND EventDate BETWEEN ... + /// AND DontCountHits = 0 + /// AND IsRefresh = 0 + /// AND Title <> '' + /// ``` + /// + /// DataFusion `Auto` does not push down the `Title <> ''` string predicate, + /// but it can push down the scalar prefix to defer decoding `Title`. + /// This synthetic predicate keeps that reader-level shape: cheap scalar + /// filter columns protect an expensive `Utf8View` output column. 
+ ClickBenchQ37ScalarPrefix, } impl std::fmt::Display for FilterType { @@ -382,6 +409,7 @@ impl std::fmt::Display for FilterType { FilterType::UnselectiveClustered => "ts < 9000", FilterType::Composite => "float64 > 99.0 AND ts >= 9000", FilterType::Utf8ViewNonEmpty => "utf8View <> ''", + FilterType::ClickBenchQ37ScalarPrefix => "int64 == 62 AND ts < 9000", }; write!(f, "{s}") } @@ -436,6 +464,15 @@ impl FilterType { let scalar = StringViewArray::new_scalar(""); neq(array, &scalar) } + // ClickBenchQ37ScalarPrefix: a cheap fragmented scalar predicate + // evaluated before decoding a variable-width output column. + FilterType::ClickBenchQ37ScalarPrefix => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let ts = batch.column(batch.schema().index_of("ts")?); + let counter_match = eq(int64, &Int64Array::new_scalar(62))?; + let date_like_range = lt(ts, &TimestampMillisecondArray::new_scalar(9000))?; + and(&counter_match, &date_like_range) + } } } @@ -450,6 +487,7 @@ impl FilterType { FilterType::UnselectiveClustered => &[3], FilterType::Composite => &[1, 3], // Use float64 column and ts column as representative for composite FilterType::Utf8ViewNonEmpty => &[2], + FilterType::ClickBenchQ37ScalarPrefix => &[0, 3], } } } @@ -484,17 +522,8 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { for filter_type in filter_types { for proj_case in &projection_cases { - // All indices corresponding to the 10 columns. - let all_indices = vec![0, 1, 2, 3]; let filter_col = filter_type.filter_projection().to_vec(); - // For the projection, either select all columns or exclude the filter column(s). 
- let output_projection: Vec = match proj_case { - ProjectionCase::AllColumns => all_indices.clone(), - ProjectionCase::ExcludeFilterColumn => all_indices - .into_iter() - .filter(|i| !filter_col.contains(i)) - .collect(), - }; + let output_projection = output_projection_for(filter_type, proj_case); let reader = InMemoryReader::try_new(&parquet_file).unwrap(); let metadata = Arc::clone(reader.metadata()); @@ -756,39 +785,80 @@ fn benchmark_async_strategy_matrix(c: &mut Criterion) { /// [`benchmark_async_strategy_matrix`]: it keeps the benchmark output focused /// on cases where `Auto` should either switch to post-filter execution or /// explicitly keep predicate pushdown. +/// +/// The `profile_*` cases are derived from DataFusion ClickBench and TPC-DS +/// comparisons. They keep the reader-level shapes worth tracking while +/// excluding query regressions that did not construct a Parquet `RowFilter`. fn benchmark_async_cost_model_focus(c: &mut Criterion) { + const SMALL_TOTAL_ROWS: usize = 20_000; + const SMALL_ROW_GROUP_SIZE: usize = 5_000; + let parquet_file = Bytes::from(write_parquet_file()); + let small_parquet_file = Bytes::from(write_parquet_file_with_rows( + SMALL_TOTAL_ROWS, + SMALL_ROW_GROUP_SIZE, + )); let cases = [ - ( + AsyncFocusCase::new( "utf8_non_empty", + parquet_file.clone(), FilterType::Utf8ViewNonEmpty, ProjectionCase::ExcludeFilterColumn, ), - ( + AsyncFocusCase::new( "utf8_non_empty", + parquet_file.clone(), FilterType::Utf8ViewNonEmpty, ProjectionCase::AllColumns, ), - ( + AsyncFocusCase::new( "high_selectivity_float64", + parquet_file.clone(), FilterType::UnselectiveUnclustered, ProjectionCase::ExcludeFilterColumn, ), - ( + AsyncFocusCase::new( "high_selectivity_ts_clustered", + parquet_file.clone(), FilterType::UnselectiveClustered, ProjectionCase::ExcludeFilterColumn, ), - ( + AsyncFocusCase::new( "fragmented_int64_10pct", + parquet_file.clone(), FilterType::ModeratelySelectiveUnclustered, ProjectionCase::ExcludeFilterColumn, ), - ( + 
AsyncFocusCase::new( "selective_float64_1pct", + parquet_file.clone(), FilterType::SelectiveUnclustered, ProjectionCase::ExcludeFilterColumn, ), + AsyncFocusCase::new( + "profile_q37_scalar_utf8", + parquet_file.clone(), + FilterType::ClickBenchQ37ScalarPrefix, + ProjectionCase::Utf8Only, + ), + AsyncFocusCase::new( + "profile_q19_no_defer", + parquet_file, + FilterType::PointLookup, + ProjectionCase::FilterColumnsOnly, + ), + AsyncFocusCase::new( + "profile_small_scalar_no_defer", + small_parquet_file.clone(), + FilterType::ModeratelySelectiveUnclustered, + ProjectionCase::FilterColumnsOnly, + ), + AsyncFocusCase::new( + "profile_small_q37_scalar_utf8", + small_parquet_file, + FilterType::ClickBenchQ37ScalarPrefix, + ProjectionCase::Utf8Only, + ), ]; let strategies = [ AsyncStrategy::FullPostFilter, @@ -804,94 +874,138 @@ fn benchmark_async_cost_model_focus(c: &mut Criterion) { let mut group = c.benchmark_group("arrow_reader_row_filter_async_cost_model_focus"); - for (case_name, filter_type, projection_case) in cases { - let reader = InMemoryReader::try_new(&parquet_file).unwrap(); - let metadata = Arc::clone(reader.metadata()); - let schema_descr = metadata.file_metadata().schema_descr(); - let output_projection = output_projection_for(filter_type, &projection_case); - let read_projection = full_post_filter_read_projection(filter_type, &output_projection); - let output_column_names = projection_names(&output_projection); - let projection_mask = ProjectionMask::roots(schema_descr, output_projection); - let read_projection_mask = ProjectionMask::roots(schema_descr, read_projection); - let pred_mask = ProjectionMask::roots( - schema_descr, - filter_type.filter_projection().iter().copied(), - ); + for case in cases { + benchmark_async_focus_case(&mut group, &rt, case, &strategies); + } +} - for strategy in strategies { - let bench_id = BenchmarkId::new( - format!("{case_name}/{projection_case}"), - strategy.to_string(), - ); - let rt_captured = rt.handle().clone(); 
+struct AsyncFocusCase { + case_name: &'static str, + parquet_file: Bytes, + filter_type: FilterType, + projection_case: ProjectionCase, +} - group.bench_function(bench_id, |b| { - b.iter(|| { - let reader = reader.clone(); - let pred_mask = pred_mask.clone(); - let projection_mask = projection_mask.clone(); - let read_projection_mask = read_projection_mask.clone(); - let output_column_names = output_column_names.clone(); +impl AsyncFocusCase { + fn new( + case_name: &'static str, + parquet_file: Bytes, + filter_type: FilterType, + projection_case: ProjectionCase, + ) -> Self { + Self { + case_name, + parquet_file, + filter_type, + projection_case, + } + } +} - rt_captured.block_on(async { - match strategy { - AsyncStrategy::FullPostFilter => { - benchmark_async_reader_post_filter( - reader, - read_projection_mask, - output_column_names, - filter_type, - ) - .await - } - AsyncStrategy::PushdownAutoCostModel => { - let row_filter = row_filter_for(filter_type, pred_mask); - benchmark_async_reader_with_policy( - reader, - projection_mask, - row_filter, - RowSelectionPolicy::default(), - ) - .await - } - AsyncStrategy::PushdownSelectors => { - let row_filter = row_filter_for(filter_type, pred_mask); - benchmark_async_reader_with_policy( - reader, - projection_mask, - row_filter, - RowSelectionPolicy::Selectors, - ) - .await - } - AsyncStrategy::PushdownMask => { - let row_filter = row_filter_for(filter_type, pred_mask); - benchmark_async_reader_with_policy( - reader, - projection_mask, - row_filter, - RowSelectionPolicy::Mask, - ) - .await - } +fn benchmark_async_focus_case( + group: &mut BenchmarkGroup<'_, WallTime>, + rt: &tokio::runtime::Runtime, + case: AsyncFocusCase, + strategies: &[AsyncStrategy], +) { + let AsyncFocusCase { + case_name, + parquet_file, + filter_type, + projection_case, + } = case; + + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = 
metadata.file_metadata().schema_descr(); + let output_projection = output_projection_for(filter_type, &projection_case); + let read_projection = full_post_filter_read_projection(filter_type, &output_projection); + let output_column_names = projection_names(&output_projection); + let projection_mask = ProjectionMask::roots(schema_descr, output_projection); + let read_projection_mask = ProjectionMask::roots(schema_descr, read_projection); + let pred_mask = ProjectionMask::roots( + schema_descr, + filter_type.filter_projection().iter().copied(), + ); + + for strategy in strategies.iter().copied() { + let bench_id = BenchmarkId::new( + format!("{case_name}/{projection_case}"), + strategy.to_string(), + ); + let rt_captured = rt.handle().clone(); + + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let pred_mask = pred_mask.clone(); + let projection_mask = projection_mask.clone(); + let read_projection_mask = read_projection_mask.clone(); + let output_column_names = output_column_names.clone(); + + rt_captured.block_on(async { + match strategy { + AsyncStrategy::FullPostFilter => { + benchmark_async_reader_post_filter( + reader, + read_projection_mask, + output_column_names, + filter_type, + ) + .await } - }) - }); + AsyncStrategy::PushdownAutoCostModel => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::default(), + ) + .await + } + AsyncStrategy::PushdownSelectors => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Selectors, + ) + .await + } + AsyncStrategy::PushdownMask => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Mask, + ) + .await + } + } + }) }); - } + }); } } fn 
output_projection_for(filter_type: FilterType, projection_case: &ProjectionCase) -> Vec { let filter_columns = filter_type.filter_projection(); - COLUMN_NAMES - .iter() - .enumerate() - .map(|(idx, _)| idx) - .filter(move |idx| { - matches!(projection_case, ProjectionCase::AllColumns) || !filter_columns.contains(idx) - }) - .collect() + match projection_case { + ProjectionCase::AllColumns | ProjectionCase::ExcludeFilterColumn => COLUMN_NAMES + .iter() + .enumerate() + .map(|(idx, _)| idx) + .filter(move |idx| { + matches!(projection_case, ProjectionCase::AllColumns) + || !filter_columns.contains(idx) + }) + .collect(), + ProjectionCase::FilterColumnsOnly => filter_columns.to_vec(), + ProjectionCase::Utf8Only => vec![2], + } } fn full_post_filter_read_projection( @@ -1135,7 +1249,6 @@ fn benchmark_filters_with_limit(c: &mut Criterion) { ProjectionCase::AllColumns, ProjectionCase::ExcludeFilterColumn, ]; - let all_indices = vec![0, 1, 2, 3]; let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -1147,14 +1260,7 @@ fn benchmark_filters_with_limit(c: &mut Criterion) { for filter_type in filter_types { for proj_case in &projection_cases { let filter_col = filter_type.filter_projection().to_vec(); - let output_projection: Vec = match proj_case { - ProjectionCase::AllColumns => all_indices.clone(), - ProjectionCase::ExcludeFilterColumn => all_indices - .iter() - .copied() - .filter(|i| !filter_col.contains(i)) - .collect(), - }; + let output_projection = output_projection_for(filter_type, proj_case); let reader = InMemoryReader::try_new(&parquet_file).unwrap(); let metadata = Arc::clone(reader.metadata()); From 6a0c4f60a929c6a01982b51e481ddac575fea15d Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Tue, 19 May 2026 09:33:29 +0800 Subject: [PATCH 15/32] refactor(parquet): clarify reader cost model flow --- parquet/src/arrow/arrow_reader/metrics.rs | 9 - parquet/src/arrow/arrow_reader/post_filter.rs | 2 +- parquet/src/arrow/arrow_reader/selection.rs | 1119 
+----------- .../arrow/arrow_reader/selection/strategy.rs | 269 +++ .../src/arrow/arrow_reader/selection/tests.rs | 887 +++++++++ .../push_decoder/reader_builder/cost_model.rs | 26 +- .../arrow/push_decoder/reader_builder/mod.rs | 1603 +++++++---------- .../reader_builder/selection_policy.rs | 359 ++++ 8 files changed, 2173 insertions(+), 2101 deletions(-) create mode 100644 parquet/src/arrow/arrow_reader/selection/strategy.rs create mode 100644 parquet/src/arrow/arrow_reader/selection/tests.rs diff --git a/parquet/src/arrow/arrow_reader/metrics.rs b/parquet/src/arrow/arrow_reader/metrics.rs index 0ca8cbf7a316..ccc57f7b0f9d 100644 --- a/parquet/src/arrow/arrow_reader/metrics.rs +++ b/parquet/src/arrow/arrow_reader/metrics.rs @@ -260,11 +260,6 @@ impl ArrowReaderMetrics { self.load(|inner| &inner.cost_model_post_filter_row_group_count) } - /// Cost model: number of times cost modeling was disabled by a forced policy - pub fn cost_model_forced_policy_count(&self) -> Option { - self.load(|inner| &inner.cost_model_forced_policy_count) - } - /// Cost model: number of incomplete observation-window decisions pub fn cost_model_observation_incomplete_count(&self) -> Option { self.load(|inner| &inner.cost_model_observation_incomplete_count) @@ -409,7 +404,6 @@ impl ArrowReaderMetrics { CostModelDecisionReason::PushdownStillPreferred => { &inner.cost_model_pushdown_still_preferred_count } - CostModelDecisionReason::ForcedPolicy => &inner.cost_model_forced_policy_count, }; counter.fetch_add(1, Ordering::Relaxed); } @@ -510,8 +504,6 @@ pub struct ArrowReaderMetricsInner { cost_model_pushdown_row_group_count: AtomicUsize, /// Number of row groups executed with post-filter cost_model_post_filter_row_group_count: AtomicUsize, - /// Number of cost-model decisions disabled by forced policy - cost_model_forced_policy_count: AtomicUsize, /// Number of incomplete cost-model observations cost_model_observation_incomplete_count: AtomicUsize, /// Number of cost-model decisions that kept 
pushdown @@ -551,7 +543,6 @@ impl ArrowReaderMetricsInner { cost_model_observed_row_group_count: AtomicUsize::new(0), cost_model_pushdown_row_group_count: AtomicUsize::new(0), cost_model_post_filter_row_group_count: AtomicUsize::new(0), - cost_model_forced_policy_count: AtomicUsize::new(0), cost_model_observation_incomplete_count: AtomicUsize::new(0), cost_model_pushdown_still_preferred_count: AtomicUsize::new(0), cost_model_high_selectivity_no_pruning_count: AtomicUsize::new(0), diff --git a/parquet/src/arrow/arrow_reader/post_filter.rs b/parquet/src/arrow/arrow_reader/post_filter.rs index a6777a337368..d7d02774e05c 100644 --- a/parquet/src/arrow/arrow_reader/post_filter.rs +++ b/parquet/src/arrow/arrow_reader/post_filter.rs @@ -19,7 +19,7 @@ //! //! Normal predicate pushdown decodes predicate columns first, builds a //! `RowSelection`, and then decodes output columns for selected rows. The -//! The post-filter path in this module instead decodes the union of predicate and +//! post-filter path in this module instead decodes the union of predicate and //! output columns once and applies predicates after decode. //! //! ```text diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 7bb835aaac60..a7f45cba479b 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -15,6 +15,13 @@ // specific language governing permissions and limitations // under the License. +mod strategy; + +pub(crate) use strategy::{ + CostModelDecisionReason, CostModelObservation, RowGroupExecutionMode, RowSelectionShape, + RowSelectionStrategy, RowSelectionStrategyDecision, RowSelectionStrategyReason, +}; + use crate::errors::ParquetError; use crate::file::page_index::offset_index::PageLocation; use arrow_array::{Array, BooleanArray}; @@ -47,245 +54,6 @@ impl Default for RowSelectionPolicy { } } -/// Fully resolved strategy for materializing [`RowSelection`] during execution. 
-/// -/// This is determined from a combination of user preference (via [`RowSelectionPolicy`]) -/// and safety considerations (e.g. page skipping). -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub(crate) enum RowSelectionStrategy { - /// Use a queue of [`RowSelector`] values - Selectors, - /// Use a boolean mask to materialise the selection - Mask, -} - -#[allow(dead_code)] -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub(crate) enum RowGroupExecutionMode { - Pushdown(RowSelectionStrategy), - PostFilter, -} - -impl std::fmt::Display for RowGroupExecutionMode { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Pushdown(RowSelectionStrategy::Mask) => f.write_str("Pushdown(Mask)"), - Self::Pushdown(RowSelectionStrategy::Selectors) => f.write_str("Pushdown(Selectors)"), - Self::PostFilter => f.write_str("PostFilter"), - } - } -} - -/// Why a final row-selection read plan used masks or selectors. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub(crate) enum RowSelectionStrategyReason { - /// The caller explicitly requested masks. - ForcedMask, - /// The caller explicitly requested selectors. - ForcedSelectors, - /// Auto chose masks because the selection has no non-empty selectors. - AutoMaskEmptySelection, - /// Auto chose masks because average selector length is below the threshold. - AutoMaskShortRuns, - /// Auto chose masks because selected rows are fragmented into many short runs. - AutoMaskFragmentedSelection, - /// Auto chose masks because most rows are selected and selector skipping is unlikely to pay off. - AutoMaskHighSelectedRatio, - /// Auto chose selectors because selected rows are clustered into long runs. - AutoSelectorClusteredSelection, - /// Auto chose selectors because average selector length reaches the threshold. - AutoSelectorLongRuns, -} - -/// Shape summary for a [`RowSelection`]. 
-#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] -pub(crate) struct RowSelectionShape { - pub(crate) selected_rows: usize, - pub(crate) skipped_rows: usize, - pub(crate) selector_count: usize, - pub(crate) selected_run_count: usize, - pub(crate) skipped_run_count: usize, -} - -impl RowSelectionShape { - pub(crate) fn from_selection(selection: Option<&RowSelection>) -> Self { - let Some(selection) = selection else { - return Self::default(); - }; - - selection - .iter() - .fold(Self::default(), |mut shape, selector| { - if selector.row_count == 0 { - return shape; - } - - shape.selector_count += 1; - if selector.skip { - shape.skipped_rows += selector.row_count; - shape.skipped_run_count += 1; - } else { - shape.selected_rows += selector.row_count; - shape.selected_run_count += 1; - } - shape - }) - } - - pub(crate) fn total_rows(self) -> usize { - self.selected_rows + self.skipped_rows - } - - pub(crate) fn selected_ratio(self) -> f64 { - let total = self.total_rows(); - if total == 0 { - 0.0 - } else { - self.selected_rows as f64 / total as f64 - } - } - - #[allow(dead_code)] - pub(crate) fn run_density(self) -> f64 { - let total = self.total_rows(); - if total == 0 { - 0.0 - } else { - self.selector_count as f64 / total as f64 - } - } - - pub(crate) fn average_selected_run_length(self) -> f64 { - average_run_length(self.selected_rows, self.selected_run_count) - } - - pub(crate) fn average_skipped_run_length(self) -> f64 { - average_run_length(self.skipped_rows, self.skipped_run_count) - } - - pub(crate) fn add_assign(&mut self, other: Self) { - self.selected_rows += other.selected_rows; - self.skipped_rows += other.skipped_rows; - self.selector_count += other.selector_count; - self.selected_run_count += other.selected_run_count; - self.skipped_run_count += other.skipped_run_count; - } -} - -#[allow(dead_code)] -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub(crate) enum CostModelDecisionReason { - /// Predicate pushdown kept almost everything and did not 
produce useful pruning. - HighSelectivityNoPruning, - /// Fragmented runs with moderate selectivity often pay many small skip/read costs. - FragmentedModerateSelectivity, - /// Fragmented runs with high selectivity usually decode most rows plus pay pushdown overhead. - FragmentedHighSelectivity, - /// Not enough row groups have been observed to classify the scan. - ObservationIncomplete, - /// The observed shape still looks suitable for predicate pushdown. - PushdownStillPreferred, - /// The caller forced a concrete row-selection policy. - ForcedPolicy, -} - -/// Aggregate row-selection shape observed while deciding whether Auto should -/// continue predicate pushdown or switch to post-filter execution. -/// -/// The classifier looks for shapes where row-level pushdown is unlikely to -/// recover its own overhead: -/// -/// ```text -/// no skipped rows -> predicate did not prune -/// tiny selected runs + many runs -> fragmented skip/read pattern -/// high selected ratio -> most output rows are decoded anyway -/// ``` -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] -pub(crate) struct CostModelObservation { - pub(crate) observed_row_groups: usize, - pub(crate) shape: RowSelectionShape, -} - -impl CostModelObservation { - pub(crate) const OBSERVATION_ROW_GROUPS: usize = 1; - const FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO: f64 = 0.08; - - pub(crate) fn trigger_reason(self) -> CostModelDecisionReason { - if self.observed_row_groups < Self::OBSERVATION_ROW_GROUPS { - return CostModelDecisionReason::ObservationIncomplete; - } - - let shape = self.shape; - if shape.total_rows() > 0 && shape.skipped_rows == 0 && shape.selected_ratio() >= 0.95 { - return CostModelDecisionReason::HighSelectivityNoPruning; - } - - let fragmented = shape.average_selected_run_length() <= 4.0 && shape.run_density() >= 0.01; - - if !fragmented { - return CostModelDecisionReason::PushdownStillPreferred; - } - - let selected_ratio = shape.selected_ratio(); - if 
(Self::FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO..0.50).contains(&selected_ratio) { - return CostModelDecisionReason::FragmentedModerateSelectivity; - } - if selected_ratio < 0.50 { - return CostModelDecisionReason::PushdownStillPreferred; - } - - CostModelDecisionReason::FragmentedHighSelectivity - } - - #[allow(dead_code)] - pub(crate) fn prefers_post_filter(self) -> bool { - matches!( - self.trigger_reason(), - CostModelDecisionReason::HighSelectivityNoPruning - | CostModelDecisionReason::FragmentedModerateSelectivity - | CostModelDecisionReason::FragmentedHighSelectivity - ) - } -} - -/// Fully resolved decision for materializing a [`RowSelection`]. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub(crate) struct RowSelectionStrategyDecision { - pub(crate) strategy: RowSelectionStrategy, - pub(crate) reason: RowSelectionStrategyReason, - pub(crate) shape: RowSelectionShape, -} - -impl RowSelectionStrategyDecision { - pub(crate) fn new( - strategy: RowSelectionStrategy, - reason: RowSelectionStrategyReason, - shape: RowSelectionShape, - ) -> Self { - Self { - strategy, - reason, - shape, - } - } - - pub(crate) fn with_shape(self, shape: RowSelectionShape) -> Self { - Self { shape, ..self } - } - - pub(crate) fn uses_mask(self) -> bool { - matches!(self.strategy, RowSelectionStrategy::Mask) - } -} - -fn average_run_length(rows: usize, runs: usize) -> f64 { - if runs == 0 { - 0.0 - } else { - rows as f64 / runs as f64 - } -} - /// [`RowSelection`] is a collection of [`RowSelector`] used to skip rows when /// scanning a parquet file #[derive(Debug, Clone, Copy, Eq, PartialEq)] @@ -1385,875 +1153,4 @@ fn boolean_mask_from_selectors(selectors: &[RowSelector]) -> BooleanBuffer { } #[cfg(test)] -mod tests { - use super::*; - use rand::{Rng, rng}; - - #[test] - fn test_loaded_row_ranges_detects_sparse_ranges() { - assert!(!LoadedRowRanges::new(std::iter::once(0..6).collect(), 6).is_sparse()); - assert!(!LoadedRowRanges::new(vec![], 0).is_sparse()); - 
assert!(LoadedRowRanges::new(vec![0..2, 4..6], 6).is_sparse()); - assert!(LoadedRowRanges::new(std::iter::once(1..6).collect(), 6).is_sparse()); - } - - #[test] - fn test_sparse_mask_cursor_skips_unloaded_ranges() { - let selection = RowSelection::from(vec![ - RowSelector::select(1), - RowSelector::skip(4), - RowSelector::select(1), - ]); - - let loaded = LoadedRowRanges::new(vec![0..2, 4..6], 6); - let selectors: Vec = selection.into(); - let mut cursor = SparseMaskCursor::new(selectors, loaded); - - let chunk = cursor.next_sparse_mask_chunk(1024).unwrap().unwrap(); - assert_eq!(chunk.selected_rows, 2); - assert_eq!( - chunk.segments, - vec![ - MaskSegment { - row_range: 0..1, - mask_start: 0, - mask_len: 1, - }, - MaskSegment { - row_range: 5..6, - mask_start: 5, - mask_len: 1, - }, - ] - ); - assert!(cursor.is_empty()); - } - - #[test] - fn test_sparse_mask_cursor_errors_selected_rows_after_loaded_ranges() { - let selection = RowSelection::from(vec![RowSelector::skip(5), RowSelector::select(1)]); - - let loaded = LoadedRowRanges::new(std::iter::once(0..2).collect(), 6); - let selectors: Vec = selection.into(); - let mut cursor = SparseMaskCursor::new(selectors, loaded); - - let err = cursor.next_sparse_mask_chunk(1024).unwrap_err(); - assert!( - err.to_string() - .contains("sparse mask selected row 5 outside loaded row ranges"), - "{err}" - ); - } - - #[test] - fn test_sparse_mask_cursor_exhausts_empty_loaded_ranges() { - let selection = RowSelection::from(vec![RowSelector::select(6)]); - - let loaded = LoadedRowRanges::new(vec![], 6); - let selectors: Vec = selection.into(); - let mut cursor = SparseMaskCursor::new(selectors, loaded); - - let err = cursor.next_sparse_mask_chunk(1024).unwrap_err(); - assert!( - err.to_string() - .contains("sparse mask selected row 0 outside loaded row ranges"), - "{err}" - ); - } - - #[test] - fn test_from_filters() { - let filters = vec![ - BooleanArray::from(vec![false, false, false, true, true, true, true]), - 
BooleanArray::from(vec![true, true, false, false, true, true, true]), - BooleanArray::from(vec![false, false, false, false]), - BooleanArray::from(Vec::::new()), - ]; - - let selection = RowSelection::from_filters(&filters[..1]); - assert!(selection.selects_any()); - assert_eq!( - selection.selectors, - vec![RowSelector::skip(3), RowSelector::select(4)] - ); - - let selection = RowSelection::from_filters(&filters[..2]); - assert!(selection.selects_any()); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(3), - RowSelector::select(6), - RowSelector::skip(2), - RowSelector::select(3) - ] - ); - - let selection = RowSelection::from_filters(&filters); - assert!(selection.selects_any()); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(3), - RowSelector::select(6), - RowSelector::skip(2), - RowSelector::select(3), - RowSelector::skip(4) - ] - ); - - let selection = RowSelection::from_filters(&filters[2..3]); - assert!(!selection.selects_any()); - assert_eq!(selection.selectors, vec![RowSelector::skip(4)]); - } - - #[test] - fn test_split_off() { - let mut selection = RowSelection::from(vec![ - RowSelector::skip(34), - RowSelector::select(12), - RowSelector::skip(3), - RowSelector::select(35), - ]); - - let split = selection.split_off(34); - assert_eq!(split.selectors, vec![RowSelector::skip(34)]); - assert_eq!( - selection.selectors, - vec![ - RowSelector::select(12), - RowSelector::skip(3), - RowSelector::select(35) - ] - ); - - let split = selection.split_off(5); - assert_eq!(split.selectors, vec![RowSelector::select(5)]); - assert_eq!( - selection.selectors, - vec![ - RowSelector::select(7), - RowSelector::skip(3), - RowSelector::select(35) - ] - ); - - let split = selection.split_off(8); - assert_eq!( - split.selectors, - vec![RowSelector::select(7), RowSelector::skip(1)] - ); - assert_eq!( - selection.selectors, - vec![RowSelector::skip(2), RowSelector::select(35)] - ); - - let split = selection.split_off(200); - assert_eq!( - 
split.selectors, - vec![RowSelector::skip(2), RowSelector::select(35)] - ); - assert!(selection.selectors.is_empty()); - } - - #[test] - fn test_offset() { - let selection = RowSelection::from(vec![ - RowSelector::select(5), - RowSelector::skip(23), - RowSelector::select(7), - RowSelector::skip(33), - RowSelector::select(6), - ]); - - let selection = selection.offset(2); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(2), - RowSelector::select(3), - RowSelector::skip(23), - RowSelector::select(7), - RowSelector::skip(33), - RowSelector::select(6), - ] - ); - - let selection = selection.offset(5); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(30), - RowSelector::select(5), - RowSelector::skip(33), - RowSelector::select(6), - ] - ); - - let selection = selection.offset(3); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(33), - RowSelector::select(2), - RowSelector::skip(33), - RowSelector::select(6), - ] - ); - - let selection = selection.offset(2); - assert_eq!( - selection.selectors, - vec![RowSelector::skip(68), RowSelector::select(6),] - ); - - let selection = selection.offset(3); - assert_eq!( - selection.selectors, - vec![RowSelector::skip(71), RowSelector::select(3),] - ); - } - - #[test] - fn test_and() { - let mut a = RowSelection::from(vec![ - RowSelector::skip(12), - RowSelector::select(23), - RowSelector::skip(3), - RowSelector::select(5), - ]); - - let b = RowSelection::from(vec![ - RowSelector::select(5), - RowSelector::skip(4), - RowSelector::select(15), - RowSelector::skip(4), - ]); - - let mut expected = RowSelection::from(vec![ - RowSelector::skip(12), - RowSelector::select(5), - RowSelector::skip(4), - RowSelector::select(14), - RowSelector::skip(3), - RowSelector::select(1), - RowSelector::skip(4), - ]); - - assert_eq!(a.and_then(&b), expected); - - a.split_off(7); - expected.split_off(7); - assert_eq!(a.and_then(&b), expected); - - let a = RowSelection::from(vec![RowSelector::select(5), 
RowSelector::skip(3)]); - - let b = RowSelection::from(vec![ - RowSelector::select(2), - RowSelector::skip(1), - RowSelector::select(1), - RowSelector::skip(1), - ]); - - assert_eq!( - a.and_then(&b).selectors, - vec![ - RowSelector::select(2), - RowSelector::skip(1), - RowSelector::select(1), - RowSelector::skip(4) - ] - ); - } - - #[test] - fn test_combine() { - let a = vec![ - RowSelector::skip(3), - RowSelector::skip(3), - RowSelector::select(10), - RowSelector::skip(4), - ]; - - let b = vec![ - RowSelector::skip(3), - RowSelector::skip(3), - RowSelector::select(10), - RowSelector::skip(4), - RowSelector::skip(0), - ]; - - let c = vec![ - RowSelector::skip(2), - RowSelector::skip(4), - RowSelector::select(3), - RowSelector::select(3), - RowSelector::select(4), - RowSelector::skip(3), - RowSelector::skip(1), - RowSelector::skip(0), - ]; - - let expected = RowSelection::from(vec![ - RowSelector::skip(6), - RowSelector::select(10), - RowSelector::skip(4), - ]); - - assert_eq!(RowSelection::from_iter(a), expected); - assert_eq!(RowSelection::from_iter(b), expected); - assert_eq!(RowSelection::from_iter(c), expected); - } - - #[test] - fn test_combine_2elements() { - let a = vec![RowSelector::select(10), RowSelector::select(5)]; - let a_expect = vec![RowSelector::select(15)]; - assert_eq!(RowSelection::from_iter(a).selectors, a_expect); - - let b = vec![RowSelector::select(10), RowSelector::skip(5)]; - let b_expect = vec![RowSelector::select(10), RowSelector::skip(5)]; - assert_eq!(RowSelection::from_iter(b).selectors, b_expect); - - let c = vec![RowSelector::skip(10), RowSelector::select(5)]; - let c_expect = vec![RowSelector::skip(10), RowSelector::select(5)]; - assert_eq!(RowSelection::from_iter(c).selectors, c_expect); - - let d = vec![RowSelector::skip(10), RowSelector::skip(5)]; - let d_expect = vec![RowSelector::skip(15)]; - assert_eq!(RowSelection::from_iter(d).selectors, d_expect); - } - - #[test] - fn test_from_one_and_empty() { - let a = 
vec![RowSelector::select(10)]; - let selection1 = RowSelection::from(a.clone()); - assert_eq!(selection1.selectors, a); - - let b = vec![]; - let selection1 = RowSelection::from(b.clone()); - assert_eq!(selection1.selectors, b) - } - - #[test] - #[should_panic(expected = "selection exceeds the number of selected rows")] - fn test_and_longer() { - let a = RowSelection::from(vec![ - RowSelector::select(3), - RowSelector::skip(33), - RowSelector::select(3), - RowSelector::skip(33), - ]); - let b = RowSelection::from(vec![RowSelector::select(36)]); - a.and_then(&b); - } - - #[test] - #[should_panic(expected = "selection contains less than the number of selected rows")] - fn test_and_shorter() { - let a = RowSelection::from(vec![ - RowSelector::select(3), - RowSelector::skip(33), - RowSelector::select(3), - RowSelector::skip(33), - ]); - let b = RowSelection::from(vec![RowSelector::select(3)]); - a.and_then(&b); - } - - #[test] - fn test_intersect_row_selection_and_combine() { - // a size equal b size - let a = vec![ - RowSelector::select(5), - RowSelector::skip(4), - RowSelector::select(1), - ]; - let b = vec![ - RowSelector::select(8), - RowSelector::skip(1), - RowSelector::select(1), - ]; - - let res = intersect_row_selections(&a, &b); - assert_eq!( - res.selectors, - vec![ - RowSelector::select(5), - RowSelector::skip(4), - RowSelector::select(1), - ], - ); - - // a size larger than b size - let a = vec![ - RowSelector::select(3), - RowSelector::skip(33), - RowSelector::select(3), - RowSelector::skip(33), - ]; - let b = vec![RowSelector::select(36), RowSelector::skip(36)]; - let res = intersect_row_selections(&a, &b); - assert_eq!( - res.selectors, - vec![RowSelector::select(3), RowSelector::skip(69)] - ); - - // a size less than b size - let a = vec![RowSelector::select(3), RowSelector::skip(7)]; - let b = vec![ - RowSelector::select(2), - RowSelector::skip(2), - RowSelector::select(2), - RowSelector::skip(2), - RowSelector::select(2), - ]; - let res = 
intersect_row_selections(&a, &b); - assert_eq!( - res.selectors, - vec![RowSelector::select(2), RowSelector::skip(8)] - ); - - let a = vec![RowSelector::select(3), RowSelector::skip(7)]; - let b = vec![ - RowSelector::select(2), - RowSelector::skip(2), - RowSelector::select(2), - RowSelector::skip(2), - RowSelector::select(2), - ]; - let res = intersect_row_selections(&a, &b); - assert_eq!( - res.selectors, - vec![RowSelector::select(2), RowSelector::skip(8)] - ); - } - - #[test] - fn test_and_fuzz() { - let mut rand = rng(); - for _ in 0..100 { - let a_len = rand.random_range(10..100); - let a_bools: Vec<_> = (0..a_len).map(|_| rand.random_bool(0.2)).collect(); - let a = RowSelection::from_filters(&[BooleanArray::from(a_bools.clone())]); - - let b_len: usize = a_bools.iter().map(|x| *x as usize).sum(); - let b_bools: Vec<_> = (0..b_len).map(|_| rand.random_bool(0.8)).collect(); - let b = RowSelection::from_filters(&[BooleanArray::from(b_bools.clone())]); - - let mut expected_bools = vec![false; a_len]; - - let mut iter_b = b_bools.iter(); - for (idx, b) in a_bools.iter().enumerate() { - if *b && *iter_b.next().unwrap() { - expected_bools[idx] = true; - } - } - - let expected = RowSelection::from_filters(&[BooleanArray::from(expected_bools)]); - - let total_rows: usize = expected.selectors.iter().map(|s| s.row_count).sum(); - assert_eq!(a_len, total_rows); - - assert_eq!(a.and_then(&b), expected); - } - } - - #[test] - fn test_iter() { - // use the iter() API to show it does what is expected and - // avoid accidental deletion - let selectors = vec![ - RowSelector::select(3), - RowSelector::skip(33), - RowSelector::select(4), - ]; - - let round_tripped = RowSelection::from(selectors.clone()) - .iter() - .cloned() - .collect::>(); - assert_eq!(selectors, round_tripped); - } - - #[test] - fn test_limit() { - // Limit to existing limit should no-op - let selection = RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(90)]); - let limited = 
selection.limit(10); - assert_eq!(RowSelection::from(vec![RowSelector::select(10)]), limited); - - let selection = RowSelection::from(vec![ - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - ]); - - let limited = selection.clone().limit(5); - let expected = vec![RowSelector::select(5)]; - assert_eq!(limited.selectors, expected); - - let limited = selection.clone().limit(15); - let expected = vec![ - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(5), - ]; - assert_eq!(limited.selectors, expected); - - let limited = selection.clone().limit(0); - let expected = vec![]; - assert_eq!(limited.selectors, expected); - - let limited = selection.clone().limit(30); - let expected = vec![ - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - ]; - assert_eq!(limited.selectors, expected); - - let limited = selection.limit(100); - let expected = vec![ - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - ]; - assert_eq!(limited.selectors, expected); - } - - #[test] - fn test_scan_ranges() { - let index = vec![ - PageLocation { - offset: 0, - compressed_page_size: 10, - first_row_index: 0, - }, - PageLocation { - offset: 10, - compressed_page_size: 10, - first_row_index: 10, - }, - PageLocation { - offset: 20, - compressed_page_size: 10, - first_row_index: 20, - }, - PageLocation { - offset: 30, - compressed_page_size: 10, - first_row_index: 30, - }, - PageLocation { - offset: 40, - compressed_page_size: 10, - first_row_index: 40, - }, - PageLocation { - offset: 50, - compressed_page_size: 10, - first_row_index: 50, - }, - PageLocation { - offset: 60, - compressed_page_size: 10, - first_row_index: 60, - }, - ]; - - let selection = RowSelection::from(vec![ - // Skip first page - RowSelector::skip(10), - // Multiple 
selects in same page - RowSelector::select(3), - RowSelector::skip(3), - RowSelector::select(4), - // Select to page boundary - RowSelector::skip(5), - RowSelector::select(5), - // Skip full page past page boundary - RowSelector::skip(12), - // Select across page boundaries - RowSelector::select(12), - // Skip final page - RowSelector::skip(12), - ]); - - let ranges = selection.scan_ranges(&index); - - // assert_eq!(mask, vec![false, true, true, false, true, true, false]); - assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60]); - - let selection = RowSelection::from(vec![ - // Skip first page - RowSelector::skip(10), - // Multiple selects in same page - RowSelector::select(3), - RowSelector::skip(3), - RowSelector::select(4), - // Select to page boundary - RowSelector::skip(5), - RowSelector::select(5), - // Skip full page past page boundary - RowSelector::skip(12), - // Select across page boundaries - RowSelector::select(12), - RowSelector::skip(1), - // Select across page boundaries including final page - RowSelector::select(8), - ]); - - let ranges = selection.scan_ranges(&index); - - // assert_eq!(mask, vec![false, true, true, false, true, true, true]); - assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60, 60..70]); - - let selection = RowSelection::from(vec![ - // Skip first page - RowSelector::skip(10), - // Multiple selects in same page - RowSelector::select(3), - RowSelector::skip(3), - RowSelector::select(4), - // Select to page boundary - RowSelector::skip(5), - RowSelector::select(5), - // Skip full page past page boundary - RowSelector::skip(12), - // Select to final page boundary - RowSelector::select(12), - RowSelector::skip(1), - // Skip across final page boundary - RowSelector::skip(8), - // Select from final page - RowSelector::select(4), - ]); - - let ranges = selection.scan_ranges(&index); - - // assert_eq!(mask, vec![false, true, true, false, true, true, true]); - assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60, 60..70]); - - let 
selection = RowSelection::from(vec![ - // Skip first page - RowSelector::skip(10), - // Multiple selects in same page - RowSelector::select(3), - RowSelector::skip(3), - RowSelector::select(4), - // Select to remaining in page and first row of next page - RowSelector::skip(5), - RowSelector::select(6), - // Skip remaining - RowSelector::skip(50), - ]); - - let ranges = selection.scan_ranges(&index); - - // assert_eq!(mask, vec![false, true, true, false, true, true, true]); - assert_eq!(ranges, vec![10..20, 20..30, 30..40]); - } - - #[test] - fn test_selected_page_row_ranges() { - let selection = RowSelection::from(vec![ - RowSelector::select(1), - RowSelector::skip(4), - RowSelector::select(1), - ]); - let pages = vec![ - PageLocation { - offset: 0, - compressed_page_size: 10, - first_row_index: 0, - }, - PageLocation { - offset: 10, - compressed_page_size: 10, - first_row_index: 2, - }, - PageLocation { - offset: 20, - compressed_page_size: 10, - first_row_index: 4, - }, - ]; - - assert_eq!( - selection.selected_page_row_ranges(&pages, 6), - vec![0..2, 4..6] - ); - } - - #[test] - fn test_from_ranges() { - let ranges = [1..3, 4..6, 6..6, 8..8, 9..10]; - let selection = RowSelection::from_consecutive_ranges(ranges.into_iter(), 10); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(1), - RowSelector::select(2), - RowSelector::skip(1), - RowSelector::select(2), - RowSelector::skip(3), - RowSelector::select(1) - ] - ); - - let out_of_order_ranges = [1..3, 8..10, 4..7]; - let result = std::panic::catch_unwind(|| { - RowSelection::from_consecutive_ranges(out_of_order_ranges.into_iter(), 10) - }); - assert!(result.is_err()); - } - - #[test] - fn test_empty_selector() { - let selection = RowSelection::from(vec![ - RowSelector::skip(0), - RowSelector::select(2), - RowSelector::skip(0), - RowSelector::select(2), - ]); - assert_eq!(selection.selectors, vec![RowSelector::select(4)]); - - let selection = RowSelection::from(vec![ - RowSelector::select(0), - 
RowSelector::skip(2), - RowSelector::select(0), - RowSelector::skip(2), - ]); - assert_eq!(selection.selectors, vec![RowSelector::skip(4)]); - } - - #[test] - fn test_intersection() { - let selection = RowSelection::from(vec![RowSelector::select(1048576)]); - let result = selection.intersection(&selection); - assert_eq!(result, selection); - - let a = RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(20), - ]); - - let b = RowSelection::from(vec![ - RowSelector::skip(20), - RowSelector::select(20), - RowSelector::skip(10), - ]); - - let result = a.intersection(&b); - assert_eq!( - result.selectors, - vec![ - RowSelector::skip(30), - RowSelector::select(10), - RowSelector::skip(10) - ] - ); - } - - #[test] - fn test_union() { - let selection = RowSelection::from(vec![RowSelector::select(1048576)]); - let result = selection.union(&selection); - assert_eq!(result, selection); - - // NYNYY - let a = RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(20), - ]); - - // NNYYNYN - let b = RowSelection::from(vec![ - RowSelector::skip(20), - RowSelector::select(20), - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - ]); - - let result = a.union(&b); - - // NYYYYYN - assert_eq!( - result.iter().collect::>(), - vec![ - &RowSelector::skip(10), - &RowSelector::select(50), - &RowSelector::skip(10), - ] - ); - } - - #[test] - fn test_row_count() { - let selection = RowSelection::from(vec![ - RowSelector::skip(34), - RowSelector::select(12), - RowSelector::skip(3), - RowSelector::select(35), - ]); - - assert_eq!(selection.row_count(), 12 + 35); - assert_eq!(selection.skipped_row_count(), 34 + 3); - - let selection = RowSelection::from(vec![RowSelector::select(12), RowSelector::select(35)]); - - assert_eq!(selection.row_count(), 12 + 35); - assert_eq!(selection.skipped_row_count(), 0); - - let selection = 
RowSelection::from(vec![RowSelector::skip(34), RowSelector::skip(3)]); - - assert_eq!(selection.row_count(), 0); - assert_eq!(selection.skipped_row_count(), 34 + 3); - - let selection = RowSelection::from(vec![]); - - assert_eq!(selection.row_count(), 0); - assert_eq!(selection.skipped_row_count(), 0); - } - - #[test] - fn test_trim() { - let selection = RowSelection::from(vec![ - RowSelector::skip(34), - RowSelector::select(12), - RowSelector::skip(3), - RowSelector::select(35), - ]); - - let expected = vec![ - RowSelector::skip(34), - RowSelector::select(12), - RowSelector::skip(3), - RowSelector::select(35), - ]; - - assert_eq!(selection.trim().selectors, expected); - - let selection = RowSelection::from(vec![ - RowSelector::skip(34), - RowSelector::select(12), - RowSelector::skip(3), - ]); - - let expected = vec![RowSelector::skip(34), RowSelector::select(12)]; - - assert_eq!(selection.trim().selectors, expected); - } -} +mod tests; diff --git a/parquet/src/arrow/arrow_reader/selection/strategy.rs b/parquet/src/arrow/arrow_reader/selection/strategy.rs new file mode 100644 index 000000000000..795b5d0fd07c --- /dev/null +++ b/parquet/src/arrow/arrow_reader/selection/strategy.rs @@ -0,0 +1,269 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Concepts used to choose how a [`RowSelection`](super::RowSelection) is executed. +//! +//! The row-filter reader makes two related but separate decisions: +//! +//! ```text +//! RowSelection materialization: +//! RowSelectionPolicy::Auto --> Mask or Selectors +//! +//! Row-group execution: +//! Predicate pushdown --> decode predicates, build RowSelection, decode output +//! Post-filter --> decode output + predicates once, then filter +//! ``` +//! +//! This module keeps the vocabulary for those decisions in one place. The +//! low-level cursors live in `selection.rs`; the push decoder cost model and +//! metrics use the summaries here to explain why a plan was chosen. + +use super::RowSelection; + +/// Fully resolved strategy for materializing [`RowSelection`] during execution. +/// +/// This is determined from a combination of user preference (via +/// [`super::RowSelectionPolicy`]) and safety considerations (for example, page +/// pruning can force a sparse mask representation). +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum RowSelectionStrategy { + /// Use a queue of [`super::RowSelector`] values. + Selectors, + /// Use a boolean mask to materialize the selection. + Mask, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum RowGroupExecutionMode { + Pushdown(RowSelectionStrategy), + PostFilter, +} + +impl std::fmt::Display for RowGroupExecutionMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Pushdown(RowSelectionStrategy::Mask) => f.write_str("Pushdown(Mask)"), + Self::Pushdown(RowSelectionStrategy::Selectors) => f.write_str("Pushdown(Selectors)"), + Self::PostFilter => f.write_str("PostFilter"), + } + } +} + +/// Why a final row-selection read plan used masks or selectors. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum RowSelectionStrategyReason { + /// The caller explicitly requested masks. + ForcedMask, + /// The caller explicitly requested selectors. + ForcedSelectors, + /// Auto chose masks because the selection has no non-empty selectors. + AutoMaskEmptySelection, + /// Auto chose masks because average selector length is below the threshold. + AutoMaskShortRuns, + /// Auto chose masks because selected rows are fragmented into many short runs. + AutoMaskFragmentedSelection, + /// Auto chose masks because most rows are selected and selector skipping is unlikely to pay off. + AutoMaskHighSelectedRatio, + /// Auto chose selectors because selected rows are clustered into long runs. + AutoSelectorClusteredSelection, + /// Auto chose selectors because average selector length reaches the threshold. + AutoSelectorLongRuns, +} + +/// Shape summary for a [`RowSelection`]. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct RowSelectionShape { + pub(crate) selected_rows: usize, + pub(crate) skipped_rows: usize, + pub(crate) selector_count: usize, + pub(crate) selected_run_count: usize, + pub(crate) skipped_run_count: usize, +} + +impl RowSelectionShape { + pub(crate) fn from_selection(selection: Option<&RowSelection>) -> Self { + let Some(selection) = selection else { + return Self::default(); + }; + + selection + .iter() + .fold(Self::default(), |mut shape, selector| { + if selector.row_count == 0 { + return shape; + } + + shape.selector_count += 1; + if selector.skip { + shape.skipped_rows += selector.row_count; + shape.skipped_run_count += 1; + } else { + shape.selected_rows += selector.row_count; + shape.selected_run_count += 1; + } + shape + }) + } + + pub(crate) fn total_rows(self) -> usize { + self.selected_rows + self.skipped_rows + } + + pub(crate) fn selected_ratio(self) -> f64 { + let total = self.total_rows(); + if total == 0 { + 0.0 + } else { + self.selected_rows as f64 / total as f64 + } 
+ } + + pub(crate) fn run_density(self) -> f64 { + let total = self.total_rows(); + if total == 0 { + 0.0 + } else { + self.selector_count as f64 / total as f64 + } + } + + pub(crate) fn average_selected_run_length(self) -> f64 { + average_run_length(self.selected_rows, self.selected_run_count) + } + + pub(crate) fn average_skipped_run_length(self) -> f64 { + average_run_length(self.skipped_rows, self.skipped_run_count) + } + + pub(crate) fn add_assign(&mut self, other: Self) { + self.selected_rows += other.selected_rows; + self.skipped_rows += other.skipped_rows; + self.selector_count += other.selector_count; + self.selected_run_count += other.selected_run_count; + self.skipped_run_count += other.skipped_run_count; + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum CostModelDecisionReason { + /// Predicate pushdown kept almost everything and did not produce useful pruning. + HighSelectivityNoPruning, + /// Fragmented runs with moderate selectivity often pay many small skip/read costs. + FragmentedModerateSelectivity, + /// Fragmented runs with high selectivity usually decode most rows plus pay pushdown overhead. + FragmentedHighSelectivity, + /// Not enough row groups have been observed to classify the scan. + ObservationIncomplete, + /// The observed shape still looks suitable for predicate pushdown. + PushdownStillPreferred, +} + +/// Aggregate row-selection shape observed while deciding whether Auto should +/// continue predicate pushdown or switch to post-filter execution. 
+/// +/// The classifier looks for shapes where row-level pushdown is unlikely to +/// recover its own overhead: +/// +/// ```text +/// no skipped rows -> predicate did not prune +/// tiny selected runs + many runs -> fragmented skip/read pattern +/// high selected ratio -> most output rows are decoded anyway +/// ``` +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct CostModelObservation { + pub(crate) observed_row_groups: usize, + pub(crate) shape: RowSelectionShape, +} + +impl CostModelObservation { + pub(crate) const OBSERVATION_ROW_GROUPS: usize = 1; + const FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO: f64 = 0.08; + + pub(crate) fn trigger_reason(self) -> CostModelDecisionReason { + if self.observed_row_groups < Self::OBSERVATION_ROW_GROUPS { + return CostModelDecisionReason::ObservationIncomplete; + } + + let shape = self.shape; + if shape.total_rows() > 0 && shape.skipped_rows == 0 && shape.selected_ratio() >= 0.95 { + return CostModelDecisionReason::HighSelectivityNoPruning; + } + + let fragmented = shape.average_selected_run_length() <= 4.0 && shape.run_density() >= 0.01; + + if !fragmented { + return CostModelDecisionReason::PushdownStillPreferred; + } + + let selected_ratio = shape.selected_ratio(); + if (Self::FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO..0.50).contains(&selected_ratio) { + return CostModelDecisionReason::FragmentedModerateSelectivity; + } + if selected_ratio < 0.50 { + return CostModelDecisionReason::PushdownStillPreferred; + } + + CostModelDecisionReason::FragmentedHighSelectivity + } + + pub(crate) fn prefers_post_filter(self) -> bool { + matches!( + self.trigger_reason(), + CostModelDecisionReason::HighSelectivityNoPruning + | CostModelDecisionReason::FragmentedModerateSelectivity + | CostModelDecisionReason::FragmentedHighSelectivity + ) + } +} + +/// Fully resolved decision for materializing a [`RowSelection`]. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) struct RowSelectionStrategyDecision { + pub(crate) strategy: RowSelectionStrategy, + pub(crate) reason: RowSelectionStrategyReason, + pub(crate) shape: RowSelectionShape, +} + +impl RowSelectionStrategyDecision { + pub(crate) fn new( + strategy: RowSelectionStrategy, + reason: RowSelectionStrategyReason, + shape: RowSelectionShape, + ) -> Self { + Self { + strategy, + reason, + shape, + } + } + + pub(crate) fn with_shape(self, shape: RowSelectionShape) -> Self { + Self { shape, ..self } + } + + pub(crate) fn uses_mask(self) -> bool { + matches!(self.strategy, RowSelectionStrategy::Mask) + } +} + +fn average_run_length(rows: usize, runs: usize) -> f64 { + if runs == 0 { + 0.0 + } else { + rows as f64 / runs as f64 + } +} diff --git a/parquet/src/arrow/arrow_reader/selection/tests.rs b/parquet/src/arrow/arrow_reader/selection/tests.rs new file mode 100644 index 000000000000..da1ca7ed1fa5 --- /dev/null +++ b/parquet/src/arrow/arrow_reader/selection/tests.rs @@ -0,0 +1,887 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use super::*; +use rand::{Rng, rng}; + +#[test] +fn test_loaded_row_ranges_detects_sparse_ranges() { + assert!(!LoadedRowRanges::new(std::iter::once(0..6).collect(), 6).is_sparse()); + assert!(!LoadedRowRanges::new(vec![], 0).is_sparse()); + assert!(LoadedRowRanges::new(vec![0..2, 4..6], 6).is_sparse()); + assert!(LoadedRowRanges::new(std::iter::once(1..6).collect(), 6).is_sparse()); +} + +#[test] +fn test_sparse_mask_cursor_skips_unloaded_ranges() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(4), + RowSelector::select(1), + ]); + + let loaded = LoadedRowRanges::new(vec![0..2, 4..6], 6); + let selectors: Vec = selection.into(); + let mut cursor = SparseMaskCursor::new(selectors, loaded); + + let chunk = cursor.next_sparse_mask_chunk(1024).unwrap().unwrap(); + assert_eq!(chunk.selected_rows, 2); + assert_eq!( + chunk.segments, + vec![ + MaskSegment { + row_range: 0..1, + mask_start: 0, + mask_len: 1, + }, + MaskSegment { + row_range: 5..6, + mask_start: 5, + mask_len: 1, + }, + ] + ); + assert!(cursor.is_empty()); +} + +#[test] +fn test_sparse_mask_cursor_errors_selected_rows_after_loaded_ranges() { + let selection = RowSelection::from(vec![RowSelector::skip(5), RowSelector::select(1)]); + + let loaded = LoadedRowRanges::new(std::iter::once(0..2).collect(), 6); + let selectors: Vec = selection.into(); + let mut cursor = SparseMaskCursor::new(selectors, loaded); + + let err = cursor.next_sparse_mask_chunk(1024).unwrap_err(); + assert!( + err.to_string() + .contains("sparse mask selected row 5 outside loaded row ranges"), + "{err}" + ); +} + +#[test] +fn test_sparse_mask_cursor_exhausts_empty_loaded_ranges() { + let selection = RowSelection::from(vec![RowSelector::select(6)]); + + let loaded = LoadedRowRanges::new(vec![], 6); + let selectors: Vec = selection.into(); + let mut cursor = SparseMaskCursor::new(selectors, loaded); + + let err = cursor.next_sparse_mask_chunk(1024).unwrap_err(); + assert!( + err.to_string() + 
.contains("sparse mask selected row 0 outside loaded row ranges"), + "{err}" + ); +} + +#[test] +fn test_from_filters() { + let filters = vec![ + BooleanArray::from(vec![false, false, false, true, true, true, true]), + BooleanArray::from(vec![true, true, false, false, true, true, true]), + BooleanArray::from(vec![false, false, false, false]), + BooleanArray::from(Vec::::new()), + ]; + + let selection = RowSelection::from_filters(&filters[..1]); + assert!(selection.selects_any()); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(3), RowSelector::select(4)] + ); + + let selection = RowSelection::from_filters(&filters[..2]); + assert!(selection.selects_any()); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(3), + RowSelector::select(6), + RowSelector::skip(2), + RowSelector::select(3) + ] + ); + + let selection = RowSelection::from_filters(&filters); + assert!(selection.selects_any()); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(3), + RowSelector::select(6), + RowSelector::skip(2), + RowSelector::select(3), + RowSelector::skip(4) + ] + ); + + let selection = RowSelection::from_filters(&filters[2..3]); + assert!(!selection.selects_any()); + assert_eq!(selection.selectors, vec![RowSelector::skip(4)]); +} + +#[test] +fn test_split_off() { + let mut selection = RowSelection::from(vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35), + ]); + + let split = selection.split_off(34); + assert_eq!(split.selectors, vec![RowSelector::skip(34)]); + assert_eq!( + selection.selectors, + vec![ + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35) + ] + ); + + let split = selection.split_off(5); + assert_eq!(split.selectors, vec![RowSelector::select(5)]); + assert_eq!( + selection.selectors, + vec![ + RowSelector::select(7), + RowSelector::skip(3), + RowSelector::select(35) + ] + ); + + let split = selection.split_off(8); + assert_eq!( + split.selectors, + 
vec![RowSelector::select(7), RowSelector::skip(1)] + ); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(2), RowSelector::select(35)] + ); + + let split = selection.split_off(200); + assert_eq!( + split.selectors, + vec![RowSelector::skip(2), RowSelector::select(35)] + ); + assert!(selection.selectors.is_empty()); +} + +#[test] +fn test_offset() { + let selection = RowSelection::from(vec![ + RowSelector::select(5), + RowSelector::skip(23), + RowSelector::select(7), + RowSelector::skip(33), + RowSelector::select(6), + ]); + + let selection = selection.offset(2); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(2), + RowSelector::select(3), + RowSelector::skip(23), + RowSelector::select(7), + RowSelector::skip(33), + RowSelector::select(6), + ] + ); + + let selection = selection.offset(5); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(30), + RowSelector::select(5), + RowSelector::skip(33), + RowSelector::select(6), + ] + ); + + let selection = selection.offset(3); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(33), + RowSelector::select(2), + RowSelector::skip(33), + RowSelector::select(6), + ] + ); + + let selection = selection.offset(2); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(68), RowSelector::select(6),] + ); + + let selection = selection.offset(3); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(71), RowSelector::select(3),] + ); +} + +#[test] +fn test_and() { + let mut a = RowSelection::from(vec![ + RowSelector::skip(12), + RowSelector::select(23), + RowSelector::skip(3), + RowSelector::select(5), + ]); + + let b = RowSelection::from(vec![ + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(15), + RowSelector::skip(4), + ]); + + let mut expected = RowSelection::from(vec![ + RowSelector::skip(12), + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(14), + RowSelector::skip(3), + RowSelector::select(1), + 
RowSelector::skip(4), + ]); + + assert_eq!(a.and_then(&b), expected); + + a.split_off(7); + expected.split_off(7); + assert_eq!(a.and_then(&b), expected); + + let a = RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(3)]); + + let b = RowSelection::from(vec![ + RowSelector::select(2), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(1), + ]); + + assert_eq!( + a.and_then(&b).selectors, + vec![ + RowSelector::select(2), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(4) + ] + ); +} + +#[test] +fn test_combine() { + let a = vec![ + RowSelector::skip(3), + RowSelector::skip(3), + RowSelector::select(10), + RowSelector::skip(4), + ]; + + let b = vec![ + RowSelector::skip(3), + RowSelector::skip(3), + RowSelector::select(10), + RowSelector::skip(4), + RowSelector::skip(0), + ]; + + let c = vec![ + RowSelector::skip(2), + RowSelector::skip(4), + RowSelector::select(3), + RowSelector::select(3), + RowSelector::select(4), + RowSelector::skip(3), + RowSelector::skip(1), + RowSelector::skip(0), + ]; + + let expected = RowSelection::from(vec![ + RowSelector::skip(6), + RowSelector::select(10), + RowSelector::skip(4), + ]); + + assert_eq!(RowSelection::from_iter(a), expected); + assert_eq!(RowSelection::from_iter(b), expected); + assert_eq!(RowSelection::from_iter(c), expected); +} + +#[test] +fn test_combine_2elements() { + let a = vec![RowSelector::select(10), RowSelector::select(5)]; + let a_expect = vec![RowSelector::select(15)]; + assert_eq!(RowSelection::from_iter(a).selectors, a_expect); + + let b = vec![RowSelector::select(10), RowSelector::skip(5)]; + let b_expect = vec![RowSelector::select(10), RowSelector::skip(5)]; + assert_eq!(RowSelection::from_iter(b).selectors, b_expect); + + let c = vec![RowSelector::skip(10), RowSelector::select(5)]; + let c_expect = vec![RowSelector::skip(10), RowSelector::select(5)]; + assert_eq!(RowSelection::from_iter(c).selectors, c_expect); + + let d = 
vec![RowSelector::skip(10), RowSelector::skip(5)]; + let d_expect = vec![RowSelector::skip(15)]; + assert_eq!(RowSelection::from_iter(d).selectors, d_expect); +} + +#[test] +fn test_from_one_and_empty() { + let a = vec![RowSelector::select(10)]; + let selection1 = RowSelection::from(a.clone()); + assert_eq!(selection1.selectors, a); + + let b = vec![]; + let selection1 = RowSelection::from(b.clone()); + assert_eq!(selection1.selectors, b) +} + +#[test] +#[should_panic(expected = "selection exceeds the number of selected rows")] +fn test_and_longer() { + let a = RowSelection::from(vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(3), + RowSelector::skip(33), + ]); + let b = RowSelection::from(vec![RowSelector::select(36)]); + a.and_then(&b); +} + +#[test] +#[should_panic(expected = "selection contains less than the number of selected rows")] +fn test_and_shorter() { + let a = RowSelection::from(vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(3), + RowSelector::skip(33), + ]); + let b = RowSelection::from(vec![RowSelector::select(3)]); + a.and_then(&b); +} + +#[test] +fn test_intersect_row_selection_and_combine() { + // a size equal b size + let a = vec![ + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(1), + ]; + let b = vec![ + RowSelector::select(8), + RowSelector::skip(1), + RowSelector::select(1), + ]; + + let res = intersect_row_selections(&a, &b); + assert_eq!( + res.selectors, + vec![ + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(1), + ], + ); + + // a size larger than b size + let a = vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(3), + RowSelector::skip(33), + ]; + let b = vec![RowSelector::select(36), RowSelector::skip(36)]; + let res = intersect_row_selections(&a, &b); + assert_eq!( + res.selectors, + vec![RowSelector::select(3), RowSelector::skip(69)] + ); + + // a size less than b size + let a = 
vec![RowSelector::select(3), RowSelector::skip(7)]; + let b = vec![ + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + ]; + let res = intersect_row_selections(&a, &b); + assert_eq!( + res.selectors, + vec![RowSelector::select(2), RowSelector::skip(8)] + ); + + let a = vec![RowSelector::select(3), RowSelector::skip(7)]; + let b = vec![ + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + ]; + let res = intersect_row_selections(&a, &b); + assert_eq!( + res.selectors, + vec![RowSelector::select(2), RowSelector::skip(8)] + ); +} + +#[test] +fn test_and_fuzz() { + let mut rand = rng(); + for _ in 0..100 { + let a_len = rand.random_range(10..100); + let a_bools: Vec<_> = (0..a_len).map(|_| rand.random_bool(0.2)).collect(); + let a = RowSelection::from_filters(&[BooleanArray::from(a_bools.clone())]); + + let b_len: usize = a_bools.iter().map(|x| *x as usize).sum(); + let b_bools: Vec<_> = (0..b_len).map(|_| rand.random_bool(0.8)).collect(); + let b = RowSelection::from_filters(&[BooleanArray::from(b_bools.clone())]); + + let mut expected_bools = vec![false; a_len]; + + let mut iter_b = b_bools.iter(); + for (idx, b) in a_bools.iter().enumerate() { + if *b && *iter_b.next().unwrap() { + expected_bools[idx] = true; + } + } + + let expected = RowSelection::from_filters(&[BooleanArray::from(expected_bools)]); + + let total_rows: usize = expected.selectors.iter().map(|s| s.row_count).sum(); + assert_eq!(a_len, total_rows); + + assert_eq!(a.and_then(&b), expected); + } +} + +#[test] +fn test_iter() { + // use the iter() API to show it does what is expected and + // avoid accidental deletion + let selectors = vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(4), + ]; + + let round_tripped = RowSelection::from(selectors.clone()) + .iter() + .cloned() + .collect::>(); + assert_eq!(selectors, 
round_tripped); +} + +#[test] +fn test_limit() { + // Limit to existing limit should no-op + let selection = RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(90)]); + let limited = selection.limit(10); + assert_eq!(RowSelection::from(vec![RowSelector::select(10)]), limited); + + let selection = RowSelection::from(vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + ]); + + let limited = selection.clone().limit(5); + let expected = vec![RowSelector::select(5)]; + assert_eq!(limited.selectors, expected); + + let limited = selection.clone().limit(15); + let expected = vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(5), + ]; + assert_eq!(limited.selectors, expected); + + let limited = selection.clone().limit(0); + let expected = vec![]; + assert_eq!(limited.selectors, expected); + + let limited = selection.clone().limit(30); + let expected = vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + ]; + assert_eq!(limited.selectors, expected); + + let limited = selection.limit(100); + let expected = vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + ]; + assert_eq!(limited.selectors, expected); +} + +#[test] +fn test_scan_ranges() { + let index = vec![ + PageLocation { + offset: 0, + compressed_page_size: 10, + first_row_index: 0, + }, + PageLocation { + offset: 10, + compressed_page_size: 10, + first_row_index: 10, + }, + PageLocation { + offset: 20, + compressed_page_size: 10, + first_row_index: 20, + }, + PageLocation { + offset: 30, + compressed_page_size: 10, + first_row_index: 30, + }, + PageLocation { + offset: 40, + compressed_page_size: 10, + first_row_index: 40, + }, + PageLocation { + offset: 50, + compressed_page_size: 10, + first_row_index: 50, + }, + 
PageLocation { + offset: 60, + compressed_page_size: 10, + first_row_index: 60, + }, + ]; + + let selection = RowSelection::from(vec![ + // Skip first page + RowSelector::skip(10), + // Multiple selects in same page + RowSelector::select(3), + RowSelector::skip(3), + RowSelector::select(4), + // Select to page boundary + RowSelector::skip(5), + RowSelector::select(5), + // Skip full page past page boundary + RowSelector::skip(12), + // Select across page boundaries + RowSelector::select(12), + // Skip final page + RowSelector::skip(12), + ]); + + let ranges = selection.scan_ranges(&index); + + // assert_eq!(mask, vec![false, true, true, false, true, true, false]); + assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60]); + + let selection = RowSelection::from(vec![ + // Skip first page + RowSelector::skip(10), + // Multiple selects in same page + RowSelector::select(3), + RowSelector::skip(3), + RowSelector::select(4), + // Select to page boundary + RowSelector::skip(5), + RowSelector::select(5), + // Skip full page past page boundary + RowSelector::skip(12), + // Select across page boundaries + RowSelector::select(12), + RowSelector::skip(1), + // Select across page boundaries including final page + RowSelector::select(8), + ]); + + let ranges = selection.scan_ranges(&index); + + // assert_eq!(mask, vec![false, true, true, false, true, true, true]); + assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60, 60..70]); + + let selection = RowSelection::from(vec![ + // Skip first page + RowSelector::skip(10), + // Multiple selects in same page + RowSelector::select(3), + RowSelector::skip(3), + RowSelector::select(4), + // Select to page boundary + RowSelector::skip(5), + RowSelector::select(5), + // Skip full page past page boundary + RowSelector::skip(12), + // Select to final page boundary + RowSelector::select(12), + RowSelector::skip(1), + // Skip across final page boundary + RowSelector::skip(8), + // Select from final page + RowSelector::select(4), + ]); + + 
let ranges = selection.scan_ranges(&index); + + // assert_eq!(mask, vec![false, true, true, false, true, true, true]); + assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60, 60..70]); + + let selection = RowSelection::from(vec![ + // Skip first page + RowSelector::skip(10), + // Multiple selects in same page + RowSelector::select(3), + RowSelector::skip(3), + RowSelector::select(4), + // Select to remaining in page and first row of next page + RowSelector::skip(5), + RowSelector::select(6), + // Skip remaining + RowSelector::skip(50), + ]); + + let ranges = selection.scan_ranges(&index); + + // assert_eq!(mask, vec![false, true, true, false, true, true, true]); + assert_eq!(ranges, vec![10..20, 20..30, 30..40]); +} + +#[test] +fn test_selected_page_row_ranges() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(4), + RowSelector::select(1), + ]); + let pages = vec![ + PageLocation { + offset: 0, + compressed_page_size: 10, + first_row_index: 0, + }, + PageLocation { + offset: 10, + compressed_page_size: 10, + first_row_index: 2, + }, + PageLocation { + offset: 20, + compressed_page_size: 10, + first_row_index: 4, + }, + ]; + + assert_eq!( + selection.selected_page_row_ranges(&pages, 6), + vec![0..2, 4..6] + ); +} + +#[test] +fn test_from_ranges() { + let ranges = [1..3, 4..6, 6..6, 8..8, 9..10]; + let selection = RowSelection::from_consecutive_ranges(ranges.into_iter(), 10); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(1), + RowSelector::select(2), + RowSelector::skip(1), + RowSelector::select(2), + RowSelector::skip(3), + RowSelector::select(1) + ] + ); + + let out_of_order_ranges = [1..3, 8..10, 4..7]; + let result = std::panic::catch_unwind(|| { + RowSelection::from_consecutive_ranges(out_of_order_ranges.into_iter(), 10) + }); + assert!(result.is_err()); +} + +#[test] +fn test_empty_selector() { + let selection = RowSelection::from(vec![ + RowSelector::skip(0), + RowSelector::select(2), + 
RowSelector::skip(0), + RowSelector::select(2), + ]); + assert_eq!(selection.selectors, vec![RowSelector::select(4)]); + + let selection = RowSelection::from(vec![ + RowSelector::select(0), + RowSelector::skip(2), + RowSelector::select(0), + RowSelector::skip(2), + ]); + assert_eq!(selection.selectors, vec![RowSelector::skip(4)]); +} + +#[test] +fn test_intersection() { + let selection = RowSelection::from(vec![RowSelector::select(1048576)]); + let result = selection.intersection(&selection); + assert_eq!(result, selection); + + let a = RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(20), + ]); + + let b = RowSelection::from(vec![ + RowSelector::skip(20), + RowSelector::select(20), + RowSelector::skip(10), + ]); + + let result = a.intersection(&b); + assert_eq!( + result.selectors, + vec![ + RowSelector::skip(30), + RowSelector::select(10), + RowSelector::skip(10) + ] + ); +} + +#[test] +fn test_union() { + let selection = RowSelection::from(vec![RowSelector::select(1048576)]); + let result = selection.union(&selection); + assert_eq!(result, selection); + + // NYNYY + let a = RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(20), + ]); + + // NNYYNYN + let b = RowSelection::from(vec![ + RowSelector::skip(20), + RowSelector::select(20), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + ]); + + let result = a.union(&b); + + // NYYYYYN + assert_eq!( + result.iter().collect::>(), + vec![ + &RowSelector::skip(10), + &RowSelector::select(50), + &RowSelector::skip(10), + ] + ); +} + +#[test] +fn test_row_count() { + let selection = RowSelection::from(vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35), + ]); + + assert_eq!(selection.row_count(), 12 + 35); + assert_eq!(selection.skipped_row_count(), 34 + 3); + + let selection = 
RowSelection::from(vec![RowSelector::select(12), RowSelector::select(35)]); + + assert_eq!(selection.row_count(), 12 + 35); + assert_eq!(selection.skipped_row_count(), 0); + + let selection = RowSelection::from(vec![RowSelector::skip(34), RowSelector::skip(3)]); + + assert_eq!(selection.row_count(), 0); + assert_eq!(selection.skipped_row_count(), 34 + 3); + + let selection = RowSelection::from(vec![]); + + assert_eq!(selection.row_count(), 0); + assert_eq!(selection.skipped_row_count(), 0); +} + +#[test] +fn test_trim() { + let selection = RowSelection::from(vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35), + ]); + + let expected = vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35), + ]; + + assert_eq!(selection.trim().selectors, expected); + + let selection = RowSelection::from(vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + ]); + + let expected = vec![RowSelector::skip(34), RowSelector::select(12)]; + + assert_eq!(selection.trim().selectors, expected); +} diff --git a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs index cf4b440c1b4b..192e94423c0f 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -17,11 +17,14 @@ //! Runtime post-filter cost decisions for push decoder row groups. //! -//! The cost model is intentionally adaptive rather than purely static. The first -//! eligible row group is evaluated with predicate pushdown so the reader can -//! observe the actual `RowSelection` shape produced by the predicate chain. -//! Later row groups may then switch to post-filter execution if the observed -//! shape suggests pushdown is doing extra work without pruning enough rows. +//! The cost model is intentionally adaptive rather than purely static. 
There +//! are two ways to enter post-filter execution: +//! +//! * a narrow static rule starts there for variable-width predicate columns, +//! where building fragmented pushdown selections is commonly expensive +//! * the first eligible row group runs predicate pushdown, records the actual +//! `RowSelection` shape, and lets later row groups use post-filter if that +//! shape suggests pushdown is doing extra work without pruning enough rows //! //! ```text //! Start @@ -47,7 +50,6 @@ use crate::arrow::arrow_reader::selection::{ use crate::arrow::schema::{ParquetField, ParquetFieldType}; use crate::basic::Type as PhysicalType; -#[allow(dead_code)] #[derive(Debug)] pub(super) enum RowGroupCostModelState { /// Collect row-selection shape from early row groups before choosing a mode. @@ -55,7 +57,7 @@ pub(super) enum RowGroupCostModelState { /// Predicate pushdown remains the execution mode for this reader. UsePushdown, /// Later row groups should decode once and evaluate predicates after decode. - UsePostFilter { reason: CostModelDecisionReason }, + UsePostFilter, } impl Default for RowGroupCostModelState { @@ -75,10 +77,8 @@ impl RowGroupReaderBuilder { // predicates after decode changes where short-circuiting can happen. // * virtual columns are not read from Parquet pages and need their // existing projection path. - matches!( - self.cost_model_state, - RowGroupCostModelState::UsePostFilter { .. } - ) && self.post_filter_cost_model_enabled + matches!(self.cost_model_state, RowGroupCostModelState::UsePostFilter) + && self.post_filter_cost_model_enabled && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. 
}) && budget.is_unbounded() && !self.has_virtual_columns() @@ -108,7 +108,7 @@ impl RowGroupReaderBuilder { self.build_post_filter_read_projection(filter) } - pub(super) fn should_start_with_post_filter_for_predicate_cost( + pub(super) fn should_start_with_post_filter_for_variable_width_predicate( &self, filter: &RowFilter, row_group_idx: usize, @@ -208,7 +208,7 @@ impl RowGroupReaderBuilder { self.metrics.record_cost_model_trigger(reason); if prefers_post_filter && self.post_filter_cost_model_supported(budget) { - self.cost_model_state = RowGroupCostModelState::UsePostFilter { reason }; + self.cost_model_state = RowGroupCostModelState::UsePostFilter; } else { self.cost_model_state = RowGroupCostModelState::UsePushdown; } diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index 26adcc2191f6..6aec47daf9d0 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -21,24 +21,20 @@ mod filter; mod selection_policy; use crate::arrow::ProjectionMask; -use crate::arrow::array_reader::{ArrayReaderBuilder, CacheOptions, RowGroupCache}; +use crate::arrow::array_reader::{ArrayReader, ArrayReaderBuilder, CacheOptions, RowGroupCache}; use crate::arrow::arrow_reader::metrics::{ArrowReaderMetrics, ArrowReaderPhase}; use crate::arrow::arrow_reader::selection::RowGroupExecutionMode; use crate::arrow::arrow_reader::{ ParquetRecordBatchReader, PredicateOptions, ReadPlanBuilder, RowFilter, RowSelection, RowSelectionPolicy, RowSelector, }; -use crate::arrow::in_memory_row_group::ColumnChunkData; +use crate::arrow::in_memory_row_group::{ColumnChunkData, InMemoryRowGroup}; use crate::arrow::push_decoder::reader_builder::cost_model::RowGroupCostModelState; use crate::arrow::push_decoder::reader_builder::data::DataRequestBuilder; use crate::arrow::push_decoder::reader_builder::filter::CacheInfo; use 
crate::arrow::push_decoder::reader_builder::selection_policy::{ ExpensiveOutputProfile, resolve_selection_policy_for_expensive_output, }; -#[cfg(test)] -use crate::arrow::push_decoder::reader_builder::selection_policy::{ - loaded_ranges_for_projection, resolve_selection_policy_for_projection, -}; use crate::arrow::schema::ParquetField; use crate::errors::ParquetError; use crate::file::metadata::ParquetMetaData; @@ -63,11 +59,38 @@ struct RowGroupInfo { enum CostModelTransition { ContinuePushdown, - StartPostSelection { selection: RowSelection }, - EnablePostFilter, + /// The current row group already evaluated predicates and produced a + /// selection, but Auto now prefers post-filter for this scan shape. Decode + /// the current row group's output once and apply the existing selection + /// after decode instead of evaluating predicates a second time. + StartPostSelection { + selection: RowSelection, + }, } /// This is the inner state machine for reading a single row group. +/// +/// The top-level flow is: +/// +/// ```text +/// Start +/// +-- no filter / no predicates ----------------------> StartData +/// +-- Auto chooses post-filter ------------------------> WaitingOnPostFilterData +/// +-- predicate pushdown ------------------------------> Filters +/// +/// Filters -> WaitingOnFilterData -> Filters | StartData +/// +/// StartData +/// +-- no rows after selection/limit -------------------> Finished +/// +-- output data needed ------------------------------> WaitingOnData +/// +/// WaitingOnData +/// +-- Auto switches current row group to post-selection > WaitingOnPostSelectionData +/// +-- output reader ready -----------------------------> Finished +/// ``` +/// +/// Each state arm delegates to a `transition_*` method so the dispatch table +/// remains readable before diving into the details for each phase. 
#[derive(Debug)] enum RowGroupDecoderState { Start { @@ -360,68 +383,6 @@ impl RowGroupReaderBuilder { self.post_filter_cost_model_enabled = false; } - fn ensure_post_filter_state(&mut self) -> Result<(), ParquetError> { - if self.post_filter.is_some() { - return Ok(()); - } - - let filter = self.filter.take().ok_or_else(|| { - ParquetError::General( - "post-filter cost model selected without a row filter".to_string(), - ) - })?; - self.post_filter = Some(Arc::new(Mutex::new(filter))); - Ok(()) - } - - fn resolve_cost_model_transition( - &mut self, - row_group_info: &RowGroupInfo, - cache_info: Option<&CacheInfo>, - ) -> Result { - if cache_info.is_none() - || !matches!( - self.cost_model_state, - RowGroupCostModelState::Observing { .. } - ) - || !self.post_filter_cost_model_supported(row_group_info.budget) - { - return Ok(CostModelTransition::ContinuePushdown); - } - - let decision = row_group_info - .plan_builder - .resolve_selection_strategy_decision(); - let observed_selection = row_group_info.plan_builder.selection().cloned(); - - self.observe_cost_model_candidate( - decision, - row_group_info.row_count, - row_group_info.budget, - ); - - if matches!( - self.cost_model_state, - RowGroupCostModelState::UsePostFilter { .. } - ) { - if row_group_info.base_selection.is_none() { - let selection = observed_selection.unwrap_or_else(|| { - RowSelection::from(vec![RowSelector::select(row_group_info.row_count)]) - }); - return Ok(CostModelTransition::StartPostSelection { selection }); - } - - self.ensure_post_filter_state()?; - self.metrics - .record_cost_model_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); - return Ok(CostModelTransition::EnablePostFilter); - } - - self.metrics - .record_cost_model_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); - Ok(CostModelTransition::ContinuePushdown) - } - /// take the current state, leaving None in its place. 
/// /// Returns an error if there the state wasn't put back after the previous @@ -517,575 +478,143 @@ impl RowGroupReaderBuilder { &mut self, current_state: RowGroupDecoderState, ) -> Result { - let result = match current_state { - RowGroupDecoderState::Start { row_group_info } => { - debug_assert!( - !row_group_info.budget.is_exhausted(), - "RowGroupFrontier should not hand off row groups after the output limit is exhausted" - ); - - let column_chunks = None; // no prior column chunks - - if let Some(filter) = self.post_filter.as_ref().cloned() { - return self.start_post_filter(row_group_info, filter); - } - - let Some(filter) = self.filter.take() else { - // no filter, start trying to read data immediately - return Ok(NextState::again(RowGroupDecoderState::StartData { - row_group_info, - column_chunks, - cache_info: None, - })); - }; - // no predicates in filter, so start reading immediately - if filter.predicates.is_empty() { - return Ok(NextState::again(RowGroupDecoderState::StartData { - row_group_info, - column_chunks, - cache_info: None, - })); - }; - - if self.should_start_with_post_filter_for_predicate_cost( - &filter, - row_group_info.row_group_idx, - row_group_info.budget, - ) { - let filter = Arc::new(Mutex::new(filter)); - self.post_filter = Some(Arc::clone(&filter)); - return self.start_post_filter(row_group_info, filter); - } - - if self.should_use_post_filter_by_cost(row_group_info.budget) { - if self - .post_filter_read_projection(&filter, row_group_info.budget) - .is_some() - { - let filter = Arc::new(Mutex::new(filter)); - self.post_filter = Some(Arc::clone(&filter)); - return self.start_post_filter(row_group_info, filter); - } - - self.cost_model_state = RowGroupCostModelState::UsePushdown; - } - - // we have predicates to evaluate - let cache_projection = - self.compute_cache_projection(row_group_info.row_group_idx, &filter); - - let cache_info = CacheInfo::new( - cache_projection, - Arc::new(RwLock::new(RowGroupCache::new( - self.batch_size, 
- self.max_predicate_cache_size, - ))), - ); - - let filter_info = FilterInfo::new(filter, cache_info); - NextState::again(RowGroupDecoderState::Filters { - row_group_info, - filter_info, - column_chunks, - }) - } - // need to evaluate filters + match current_state { + RowGroupDecoderState::Start { row_group_info } => self.transition_start(row_group_info), RowGroupDecoderState::Filters { row_group_info, column_chunks, filter_info, - } => { - let RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - base_selection, - budget, - } = row_group_info; - - // If nothing is selected, we are done with this row group - if !plan_builder.selects_any() { - // ruled out entire row group - self.filter = Some(filter_info.into_filter()); - return Ok(NextState::result( - RowGroupDecoderState::Finished, - RowGroupBuildResult::Finished { - remaining_budget: budget, - }, - )); - } - - // Make a request for the data needed to evaluate the current predicate - let predicate = filter_info.current(); - - // need to fetch pages the column needs for decoding, figure - // that out based on the current selection and projection - let data_request = - self.metrics - .time_phase(ArrowReaderPhase::PredicateRangePlanning, || { - DataRequestBuilder::new( - row_group_idx, - row_count, - self.batch_size, - &self.metadata, - predicate.projection(), // use the predicate's projection - ) - .with_selection(plan_builder.selection()) - // Fetch predicate columns; expand selection only for cached predicate columns - .with_cache_projection(Some(filter_info.cache_projection())) - .with_column_chunks(column_chunks) - .build() - }); - - let row_group_info = RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - base_selection, - budget, - }; - - NextState::again(RowGroupDecoderState::WaitingOnFilterData { - row_group_info, - filter_info, - data_request, - }) - } + } => self.transition_filters(row_group_info, column_chunks, filter_info), RowGroupDecoderState::WaitingOnFilterData { row_group_info, 
data_request, - mut filter_info, - } => { - // figure out what ranges we still need - let needed_ranges = data_request.needed_ranges(&self.buffers); - if !needed_ranges.is_empty() { - // still need data - return Ok(NextState::result( - RowGroupDecoderState::WaitingOnFilterData { - row_group_info, - filter_info, - data_request, - }, - RowGroupBuildResult::NeedsData(needed_ranges), - )); - } - - // otherwise we have all the data we need to evaluate the predicate - let RowGroupInfo { - row_group_idx, - row_count, - mut plan_builder, - base_selection, - budget, - } = row_group_info; - - let predicate = filter_info.current(); - - let row_group = data_request.try_into_in_memory_row_group( - row_group_idx, - row_count, - &self.metadata, - predicate.projection(), - &mut self.buffers, - )?; - - let cache_options = filter_info.cache_builder().producer(); - - let array_reader = ArrayReaderBuilder::new(&row_group, &self.metrics) - .with_batch_size(self.batch_size) - .with_cache_options(Some(&cache_options)) - .with_parquet_metadata(&self.metadata) - .build_array_reader(self.fields.as_deref(), predicate.projection())?; - - plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); - plan_builder = resolve_selection_policy_for_expensive_output( - plan_builder, - predicate.projection(), - self.row_group_offset_index(row_group_idx), - row_count, - ExpensiveOutputProfile::from_row_group( - self.metadata.row_group(row_group_idx), - predicate.projection(), - row_count, - ), - ); - - // When this is the final predicate in the chain and an output - // limit is set, tell the filter evaluation to stop once enough - // matching rows have been accumulated. - let predicate_limit = filter_info - .is_last() - .then(|| budget.selected_row_limit()) - .flatten(); - - // Evaluate the filter via `with_predicate_options`, opting into - // early termination when this is the final predicate and an - // output limit was set. 
- let mut predicate_options = - PredicateOptions::new(array_reader, filter_info.current_mut()) - .with_metrics(self.metrics.clone()); - if let Some(limit) = predicate_limit { - predicate_options = predicate_options.with_limit(limit, row_count); - } - plan_builder = plan_builder.with_predicate_options(predicate_options)?; - - let row_group_info = RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - base_selection, - budget, - }; - - // Take back the column chunks that were read - let column_chunks = Some(row_group.column_chunks); - - // advance to the next predicate, if any - match filter_info.advance() { - AdvanceResult::Continue(filter_info) => { - NextState::again(RowGroupDecoderState::Filters { - row_group_info, - column_chunks, - filter_info, - }) - } - // done with predicates, proceed to reading data - AdvanceResult::Done(filter, cache_info) => { - // remember we need to put back the filter - assert!(self.filter.is_none()); - self.filter = Some(filter); - NextState::again(RowGroupDecoderState::StartData { - row_group_info, - column_chunks, - cache_info: Some(cache_info), - }) - } - } - } + filter_info, + } => self.transition_waiting_on_filter_data(row_group_info, data_request, filter_info), RowGroupDecoderState::StartData { row_group_info, column_chunks, cache_info, - } => { - let RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - base_selection, - budget, - } = row_group_info; - - let BudgetedReadPlan { - mut plan_builder, - rows_before_budget, - rows_after_budget, - remaining_budget, - } = budget.apply_to_plan(plan_builder, row_count); - - if rows_before_budget == 0 { - // ruled out entire row group - return Ok(NextState::result( - RowGroupDecoderState::Finished, - RowGroupBuildResult::Finished { remaining_budget }, - )); - } - - if rows_after_budget == 0 { - // no rows left after applying limit/offset - return Ok(NextState::result( - RowGroupDecoderState::Finished, - RowGroupBuildResult::Finished { remaining_budget }, - )); - } - - let 
data_request = - self.metrics - .time_phase(ArrowReaderPhase::OutputRangePlanning, || { - DataRequestBuilder::new( - row_group_idx, - row_count, - self.batch_size, - &self.metadata, - &self.projection, - ) - .with_selection(plan_builder.selection()) - .with_column_chunks(column_chunks) - // Final projection fetch shouldn't expand selection for cache - // so don't call with_cache_projection here - .build() - }); - - plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); - plan_builder = - self.metrics - .time_phase(ArrowReaderPhase::OutputSelectionResolve, || { - resolve_selection_policy_for_expensive_output( - plan_builder, - &self.projection, - self.row_group_offset_index(row_group_idx), - row_count, - ExpensiveOutputProfile::from_row_group( - self.metadata.row_group(row_group_idx), - &self.projection, - row_count, - ), - ) - }); - - let row_group_info = RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - base_selection, - budget: remaining_budget, - }; - - NextState::again(RowGroupDecoderState::WaitingOnData { - row_group_info, - data_request, - cache_info, - }) - } + } => self.transition_start_data(row_group_info, column_chunks, cache_info), RowGroupDecoderState::WaitingOnPostFilterData { row_group_info, data_request, read_projection, filter, - } => { - let needed_ranges = data_request.needed_ranges(&self.buffers); - if !needed_ranges.is_empty() { - return Ok(NextState::result( - RowGroupDecoderState::WaitingOnPostFilterData { - row_group_info, - data_request, - read_projection, - filter, - }, - RowGroupBuildResult::NeedsData(needed_ranges), - )); - } - - let RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - base_selection: _, - budget, - } = row_group_info; - - let row_group = data_request.try_into_in_memory_row_group( - row_group_idx, - row_count, - &self.metadata, - &read_projection, - &mut self.buffers, - )?; - - let plan = plan_builder.build_with_metrics(&self.metrics); - let array_reader = 
ArrayReaderBuilder::new(&row_group, &self.metrics) - .with_batch_size(self.batch_size) - .with_parquet_metadata(&self.metadata) - .build_array_reader(self.fields.as_deref(), &read_projection)?; - - let reader = ParquetRecordBatchReader::new_post_filter( - array_reader, - plan, - filter, - self.metadata.file_metadata().schema_descr(), - &read_projection, - &self.projection, - self.metrics.clone(), - )?; - - self.metrics - .record_cost_model_row_group(RowGroupExecutionMode::PostFilter); - - NextState::result( - RowGroupDecoderState::Finished, - RowGroupBuildResult::Data { - batch_reader: Box::new(reader), - remaining_budget: budget, - }, - ) - } + } => self.transition_waiting_on_post_filter_data( + row_group_info, + data_request, + read_projection, + filter, + ), RowGroupDecoderState::WaitingOnPostSelectionData { row_group_info, data_request, selection, cache_info, - } => { - let needed_ranges = data_request.needed_ranges(&self.buffers); - if !needed_ranges.is_empty() { - return Ok(NextState::result( - RowGroupDecoderState::WaitingOnPostSelectionData { - row_group_info, - data_request, - selection, - cache_info, - }, - RowGroupBuildResult::NeedsData(needed_ranges), - )); - } - - let RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - base_selection: _, - budget, - } = row_group_info; - - let row_group = data_request.try_into_in_memory_row_group( - row_group_idx, - row_count, - &self.metadata, - &self.projection, - &mut self.buffers, - )?; - - let plan = plan_builder.build_with_metrics(&self.metrics); - let array_reader_builder = ArrayReaderBuilder::new(&row_group, &self.metrics) - .with_batch_size(self.batch_size) - .with_parquet_metadata(&self.metadata); - let array_reader = if let Some(cache_info) = cache_info.as_ref() { - let cache_options: CacheOptions = cache_info.builder().consumer(); - array_reader_builder - .with_cache_options(Some(&cache_options)) - .build_array_reader(self.fields.as_deref(), &self.projection) - } else { - array_reader_builder - 
.build_array_reader(self.fields.as_deref(), &self.projection) - }?; - - let reader = ParquetRecordBatchReader::new_post_selection_filter( - array_reader, - plan, - selection, - self.metrics.clone(), - ); - - self.metrics - .record_cost_model_row_group(RowGroupExecutionMode::PostFilter); - - NextState::result( - RowGroupDecoderState::Finished, - RowGroupBuildResult::Data { - batch_reader: Box::new(reader), - remaining_budget: budget, - }, - ) - } - // Waiting on data to proceed with reading the output + } => self.transition_waiting_on_post_selection_data( + row_group_info, + data_request, + selection, + cache_info, + ), RowGroupDecoderState::WaitingOnData { row_group_info, data_request, cache_info, - } => { - match self.resolve_cost_model_transition(&row_group_info, cache_info.as_ref())? { - CostModelTransition::ContinuePushdown - | CostModelTransition::EnablePostFilter => {} - CostModelTransition::StartPostSelection { selection } => { - let column_chunks = data_request.into_dense_column_chunks(); - // The current row group already computed a pushdown selection. Apply that - // selection after decode instead of evaluating the predicates again. - // - // Sparse predicate chunks may not cover the base selection. Dense chunks - // are safe to reuse and preserve predicate-cache IO behavior. 
- return self.start_post_selection_filter( - row_group_info, - selection, - cache_info, - column_chunks, - ); - } - } + } => self.transition_waiting_on_data(row_group_info, data_request, cache_info), + RowGroupDecoderState::Finished => Err(ParquetError::General(String::from( + "Internal Error: try_build called without an active row group", + ))), + } + } - let needed_ranges = data_request.needed_ranges(&self.buffers); - if !needed_ranges.is_empty() { - // still need data - return Ok(NextState::result( - RowGroupDecoderState::WaitingOnData { - row_group_info, - data_request, - cache_info, - }, - RowGroupBuildResult::NeedsData(needed_ranges), - )); - } + fn transition_start( + &mut self, + row_group_info: RowGroupInfo, + ) -> Result { + debug_assert!( + !row_group_info.budget.is_exhausted(), + "RowGroupFrontier should not hand off row groups after the output limit is exhausted" + ); - // otherwise we have all the data we need to proceed - let RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - base_selection: _, - budget, - } = row_group_info; + let column_chunks = None; - let row_group = data_request.try_into_in_memory_row_group( - row_group_idx, - row_count, - &self.metadata, - &self.projection, - &mut self.buffers, - )?; - - let plan = plan_builder.build_with_metrics(&self.metrics); - - // if we have any cached results, connect them up - let array_reader_builder = ArrayReaderBuilder::new(&row_group, &self.metrics) - .with_batch_size(self.batch_size) - .with_parquet_metadata(&self.metadata); - let array_reader = if let Some(cache_info) = cache_info.as_ref() { - let cache_options: CacheOptions = cache_info.builder().consumer(); - array_reader_builder - .with_cache_options(Some(&cache_options)) - .build_array_reader(self.fields.as_deref(), &self.projection) - } else { - array_reader_builder - .build_array_reader(self.fields.as_deref(), &self.projection) - }?; - - let reader = ParquetRecordBatchReader::new_with_metrics( - array_reader, - plan, - 
self.metrics.clone(), - ); - NextState::result( - RowGroupDecoderState::Finished, - RowGroupBuildResult::Data { - batch_reader: Box::new(reader), - remaining_budget: budget, - }, - ) - } - RowGroupDecoderState::Finished => { - return Err(ParquetError::General(String::from( - "Internal Error: try_build called without an active row group", - ))); - } + if let Some(filter) = self.post_filter.as_ref().cloned() { + return self.start_post_filter(row_group_info, filter); + } + + let Some(filter) = self.filter.take() else { + return Ok(NextState::again(RowGroupDecoderState::StartData { + row_group_info, + column_chunks, + cache_info: None, + })); }; - Ok(result) + + if filter.predicates.is_empty() { + return Ok(NextState::again(RowGroupDecoderState::StartData { + row_group_info, + column_chunks, + cache_info: None, + })); + }; + + if self.should_start_with_post_filter_for_variable_width_predicate( + &filter, + row_group_info.row_group_idx, + row_group_info.budget, + ) { + let filter = Arc::new(Mutex::new(filter)); + self.post_filter = Some(Arc::clone(&filter)); + return self.start_post_filter(row_group_info, filter); + } + + if self.should_use_post_filter_by_cost(row_group_info.budget) { + if self + .post_filter_read_projection(&filter, row_group_info.budget) + .is_some() + { + let filter = Arc::new(Mutex::new(filter)); + self.post_filter = Some(Arc::clone(&filter)); + return self.start_post_filter(row_group_info, filter); + } + + self.cost_model_state = RowGroupCostModelState::UsePushdown; + } + + let cache_projection = self.compute_cache_projection(row_group_info.row_group_idx, &filter); + let cache_info = CacheInfo::new( + cache_projection, + Arc::new(RwLock::new(RowGroupCache::new( + self.batch_size, + self.max_predicate_cache_size, + ))), + ); + let filter_info = FilterInfo::new(filter, cache_info); + + Ok(NextState::again(RowGroupDecoderState::Filters { + row_group_info, + filter_info, + column_chunks, + })) } - fn start_post_filter( + fn transition_filters( &mut 
self, row_group_info: RowGroupInfo, - filter: Arc>, + column_chunks: Option>>>, + filter_info: FilterInfo, ) -> Result { let RowGroupInfo { row_group_idx, row_count, + plan_builder, base_selection, budget, - .. } = row_group_info; - let mut plan_builder = ReadPlanBuilder::new(self.batch_size) - .with_selection(base_selection) - .with_row_selection_policy(self.row_selection_policy); - if !plan_builder.selects_any() { + self.filter = Some(filter_info.into_filter()); return Ok(NextState::result( RowGroupDecoderState::Finished, RowGroupBuildResult::Finished { @@ -1094,20 +623,517 @@ impl RowGroupReaderBuilder { )); } - let read_projection = { - let filter = filter.lock().map_err(|_| { - ParquetError::General("post-filter predicate state was poisoned".to_string()) - })?; - self.post_filter_read_projection_for_filter(&filter, budget) - .ok_or_else(|| { - ParquetError::General( - "post-filter cost model selected an unsupported projection".to_string(), + let predicate = filter_info.current(); + let data_request = + self.metrics + .time_phase(ArrowReaderPhase::PredicateRangePlanning, || { + DataRequestBuilder::new( + row_group_idx, + row_count, + self.batch_size, + &self.metadata, + predicate.projection(), ) - })? 
+ .with_selection(plan_builder.selection()) + .with_cache_projection(Some(filter_info.cache_projection())) + .with_column_chunks(column_chunks) + .build() + }); + + let row_group_info = RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection, + budget, }; - let data_request = self - .metrics + Ok(NextState::again( + RowGroupDecoderState::WaitingOnFilterData { + row_group_info, + filter_info, + data_request, + }, + )) + } + + fn transition_waiting_on_filter_data( + &mut self, + row_group_info: RowGroupInfo, + data_request: DataRequest, + mut filter_info: FilterInfo, + ) -> Result { + let needed_ranges = data_request.needed_ranges(&self.buffers); + if !needed_ranges.is_empty() { + return Ok(NextState::result( + RowGroupDecoderState::WaitingOnFilterData { + row_group_info, + filter_info, + data_request, + }, + RowGroupBuildResult::NeedsData(needed_ranges), + )); + } + + let RowGroupInfo { + row_group_idx, + row_count, + mut plan_builder, + base_selection, + budget, + } = row_group_info; + + let predicate = filter_info.current(); + let row_group = data_request.try_into_in_memory_row_group( + row_group_idx, + row_count, + &self.metadata, + predicate.projection(), + &mut self.buffers, + )?; + + let cache_options = filter_info.cache_builder().producer(); + let array_reader = ArrayReaderBuilder::new(&row_group, &self.metrics) + .with_batch_size(self.batch_size) + .with_cache_options(Some(&cache_options)) + .with_parquet_metadata(&self.metadata) + .build_array_reader(self.fields.as_deref(), predicate.projection())?; + + plan_builder = self.resolve_output_selection_policy( + plan_builder, + predicate.projection(), + row_group_idx, + row_count, + ); + + let predicate_limit = filter_info + .is_last() + .then(|| budget.selected_row_limit()) + .flatten(); + let mut predicate_options = PredicateOptions::new(array_reader, filter_info.current_mut()) + .with_metrics(self.metrics.clone()); + if let Some(limit) = predicate_limit { + predicate_options = 
predicate_options.with_limit(limit, row_count); + } + plan_builder = plan_builder.with_predicate_options(predicate_options)?; + + let row_group_info = RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection, + budget, + }; + let column_chunks = Some(row_group.column_chunks); + + Ok(match filter_info.advance() { + AdvanceResult::Continue(filter_info) => { + NextState::again(RowGroupDecoderState::Filters { + row_group_info, + column_chunks, + filter_info, + }) + } + AdvanceResult::Done(filter, cache_info) => { + assert!(self.filter.is_none()); + self.filter = Some(filter); + NextState::again(RowGroupDecoderState::StartData { + row_group_info, + column_chunks, + cache_info: Some(cache_info), + }) + } + }) + } + + fn transition_start_data( + &mut self, + row_group_info: RowGroupInfo, + column_chunks: Option>>>, + cache_info: Option, + ) -> Result { + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection, + budget, + } = row_group_info; + + let BudgetedReadPlan { + mut plan_builder, + rows_before_budget, + rows_after_budget, + remaining_budget, + } = budget.apply_to_plan(plan_builder, row_count); + + if rows_before_budget == 0 || rows_after_budget == 0 { + return Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Finished { remaining_budget }, + )); + } + + let data_request = self + .metrics + .time_phase(ArrowReaderPhase::OutputRangePlanning, || { + DataRequestBuilder::new( + row_group_idx, + row_count, + self.batch_size, + &self.metadata, + &self.projection, + ) + .with_selection(plan_builder.selection()) + .with_column_chunks(column_chunks) + // Final projection fetch shouldn't expand selection for cache + // so don't call with_cache_projection here. 
+ .build() + }); + + plan_builder = self + .metrics + .time_phase(ArrowReaderPhase::OutputSelectionResolve, || { + self.resolve_output_selection_policy( + plan_builder, + &self.projection, + row_group_idx, + row_count, + ) + }); + + let row_group_info = RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection, + budget: remaining_budget, + }; + + Ok(NextState::again(RowGroupDecoderState::WaitingOnData { + row_group_info, + data_request, + cache_info, + })) + } + + fn transition_waiting_on_post_filter_data( + &mut self, + row_group_info: RowGroupInfo, + data_request: DataRequest, + read_projection: ProjectionMask, + filter: Arc>, + ) -> Result { + let needed_ranges = data_request.needed_ranges(&self.buffers); + if !needed_ranges.is_empty() { + return Ok(NextState::result( + RowGroupDecoderState::WaitingOnPostFilterData { + row_group_info, + data_request, + read_projection, + filter, + }, + RowGroupBuildResult::NeedsData(needed_ranges), + )); + } + + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: _, + budget, + } = row_group_info; + + let row_group = data_request.try_into_in_memory_row_group( + row_group_idx, + row_count, + &self.metadata, + &read_projection, + &mut self.buffers, + )?; + let plan = plan_builder.build_with_metrics(&self.metrics); + let array_reader = ArrayReaderBuilder::new(&row_group, &self.metrics) + .with_batch_size(self.batch_size) + .with_parquet_metadata(&self.metadata) + .build_array_reader(self.fields.as_deref(), &read_projection)?; + let reader = ParquetRecordBatchReader::new_post_filter( + array_reader, + plan, + filter, + self.metadata.file_metadata().schema_descr(), + &read_projection, + &self.projection, + self.metrics.clone(), + )?; + + self.metrics + .record_cost_model_row_group(RowGroupExecutionMode::PostFilter); + Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Data { + batch_reader: Box::new(reader), + remaining_budget: budget, + }, + )) + } + + 
fn transition_waiting_on_post_selection_data( + &mut self, + row_group_info: RowGroupInfo, + data_request: DataRequest, + selection: RowSelection, + cache_info: Option, + ) -> Result { + let needed_ranges = data_request.needed_ranges(&self.buffers); + if !needed_ranges.is_empty() { + return Ok(NextState::result( + RowGroupDecoderState::WaitingOnPostSelectionData { + row_group_info, + data_request, + selection, + cache_info, + }, + RowGroupBuildResult::NeedsData(needed_ranges), + )); + } + + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: _, + budget, + } = row_group_info; + + let row_group = data_request.try_into_in_memory_row_group( + row_group_idx, + row_count, + &self.metadata, + &self.projection, + &mut self.buffers, + )?; + let plan = plan_builder.build_with_metrics(&self.metrics); + let array_reader = self.build_projection_reader(&row_group, cache_info.as_ref())?; + let reader = ParquetRecordBatchReader::new_post_selection_filter( + array_reader, + plan, + selection, + self.metrics.clone(), + ); + + self.metrics + .record_cost_model_row_group(RowGroupExecutionMode::PostFilter); + Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Data { + batch_reader: Box::new(reader), + remaining_budget: budget, + }, + )) + } + + fn transition_waiting_on_data( + &mut self, + row_group_info: RowGroupInfo, + data_request: DataRequest, + cache_info: Option, + ) -> Result { + match self.resolve_cost_model_transition(&row_group_info, cache_info.as_ref())? 
{ + CostModelTransition::ContinuePushdown => {} + CostModelTransition::StartPostSelection { selection } => { + let column_chunks = data_request.into_dense_column_chunks(); + return self.start_post_selection_filter( + row_group_info, + selection, + cache_info, + column_chunks, + ); + } + } + + let needed_ranges = data_request.needed_ranges(&self.buffers); + if !needed_ranges.is_empty() { + return Ok(NextState::result( + RowGroupDecoderState::WaitingOnData { + row_group_info, + data_request, + cache_info, + }, + RowGroupBuildResult::NeedsData(needed_ranges), + )); + } + + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: _, + budget, + } = row_group_info; + + let row_group = data_request.try_into_in_memory_row_group( + row_group_idx, + row_count, + &self.metadata, + &self.projection, + &mut self.buffers, + )?; + let plan = plan_builder.build_with_metrics(&self.metrics); + let array_reader = self.build_projection_reader(&row_group, cache_info.as_ref())?; + let reader = + ParquetRecordBatchReader::new_with_metrics(array_reader, plan, self.metrics.clone()); + + Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Data { + batch_reader: Box::new(reader), + remaining_budget: budget, + }, + )) + } + + fn resolve_cost_model_transition( + &mut self, + row_group_info: &RowGroupInfo, + cache_info: Option<&CacheInfo>, + ) -> Result { + if cache_info.is_none() + || !matches!( + self.cost_model_state, + RowGroupCostModelState::Observing { .. 
} + ) + || !self.post_filter_cost_model_supported(row_group_info.budget) + { + return Ok(CostModelTransition::ContinuePushdown); + } + + let decision = row_group_info + .plan_builder + .resolve_selection_strategy_decision(); + let observed_selection = row_group_info.plan_builder.selection().cloned(); + + self.observe_cost_model_candidate( + decision, + row_group_info.row_count, + row_group_info.budget, + ); + + if matches!(self.cost_model_state, RowGroupCostModelState::UsePostFilter) { + if row_group_info.base_selection.is_none() { + let selection = observed_selection.unwrap_or_else(|| { + RowSelection::from(vec![RowSelector::select(row_group_info.row_count)]) + }); + return Ok(CostModelTransition::StartPostSelection { selection }); + } + + self.ensure_post_filter_state()?; + self.metrics + .record_cost_model_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); + // This row group was already planned with a base selection, so keep + // its current pushdown path. The state above enables post-filter + // execution for later row groups. 
+ return Ok(CostModelTransition::ContinuePushdown); + } + + self.metrics + .record_cost_model_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); + Ok(CostModelTransition::ContinuePushdown) + } + + fn ensure_post_filter_state(&mut self) -> Result<(), ParquetError> { + if self.post_filter.is_some() { + return Ok(()); + } + + let filter = self.filter.take().ok_or_else(|| { + ParquetError::General( + "post-filter cost model selected without a row filter".to_string(), + ) + })?; + self.post_filter = Some(Arc::new(Mutex::new(filter))); + Ok(()) + } + + fn resolve_output_selection_policy( + &self, + plan_builder: ReadPlanBuilder, + projection: &ProjectionMask, + row_group_idx: usize, + row_count: usize, + ) -> ReadPlanBuilder { + resolve_selection_policy_for_expensive_output( + plan_builder.with_row_selection_policy(self.row_selection_policy), + projection, + self.row_group_offset_index(row_group_idx), + row_count, + ExpensiveOutputProfile::from_row_group( + self.metadata.row_group(row_group_idx), + projection, + row_count, + ), + ) + } + + fn build_projection_reader( + &self, + row_group: &InMemoryRowGroup<'_>, + cache_info: Option<&CacheInfo>, + ) -> Result, ParquetError> { + let array_reader_builder = ArrayReaderBuilder::new(row_group, &self.metrics) + .with_batch_size(self.batch_size) + .with_parquet_metadata(&self.metadata); + + if let Some(cache_info) = cache_info { + let cache_options: CacheOptions = cache_info.builder().consumer(); + array_reader_builder + .with_cache_options(Some(&cache_options)) + .build_array_reader(self.fields.as_deref(), &self.projection) + } else { + array_reader_builder.build_array_reader(self.fields.as_deref(), &self.projection) + } + } + + fn start_post_filter( + &mut self, + row_group_info: RowGroupInfo, + filter: Arc>, + ) -> Result { + let RowGroupInfo { + row_group_idx, + row_count, + base_selection, + budget, + .. 
+ } = row_group_info; + + let mut plan_builder = ReadPlanBuilder::new(self.batch_size) + .with_selection(base_selection) + .with_row_selection_policy(self.row_selection_policy); + + if !plan_builder.selects_any() { + return Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Finished { + remaining_budget: budget, + }, + )); + } + + let read_projection = { + let filter = filter.lock().map_err(|_| { + ParquetError::General("post-filter predicate state was poisoned".to_string()) + })?; + self.post_filter_read_projection_for_filter(&filter, budget) + .ok_or_else(|| { + ParquetError::General( + "post-filter cost model selected an unsupported projection".to_string(), + ) + })? + }; + + let data_request = self + .metrics .time_phase(ArrowReaderPhase::OutputRangePlanning, || { DataRequestBuilder::new( row_group_idx, @@ -1121,20 +1147,14 @@ impl RowGroupReaderBuilder { }); if plan_builder.selection().is_some() { - plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); plan_builder = self.metrics .time_phase(ArrowReaderPhase::OutputSelectionResolve, || { - resolve_selection_policy_for_expensive_output( + self.resolve_output_selection_policy( plan_builder, &read_projection, - self.row_group_offset_index(row_group_idx), + row_group_idx, row_count, - ExpensiveOutputProfile::from_row_group( - self.metadata.row_group(row_group_idx), - &read_projection, - row_count, - ), ) }); } @@ -1252,357 +1272,6 @@ impl RowGroupReaderBuilder { #[cfg(test)] mod tests { use super::*; - use crate::arrow::arrow_reader::selection::LoadedRowRanges; - use crate::arrow::arrow_reader::{RowSelection, RowSelectionCursor, RowSelector}; - use crate::file::page_index::offset_index::PageLocation; - - #[test] - fn test_resolve_selection_policy_preserves_mask_choice() { - let selection = RowSelection::from(vec![ - RowSelector::select(1), - RowSelector::skip(99), - RowSelector::select(1), - ]); - let plan_builder = ReadPlanBuilder::new(1024) - 
.with_selection(Some(selection)) - .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); - - assert_eq!( - resolve_selection_policy_for_projection( - plan_builder, - &ProjectionMask::all(), - None, - 101 - ) - .row_selection_policy(), - &RowSelectionPolicy::Mask - ); - } - - #[test] - fn test_resolve_selection_policy_preserves_selector_choice() { - let selection = RowSelection::from(vec![RowSelector::select(128)]); - let plan_builder = ReadPlanBuilder::new(1024) - .with_selection(Some(selection)) - .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1 }); - - assert_eq!( - resolve_selection_policy_for_projection( - plan_builder, - &ProjectionMask::all(), - None, - 128 - ) - .row_selection_policy(), - &RowSelectionPolicy::Selectors - ); - } - - #[test] - fn test_resolve_selection_policy_respects_explicit_policy() { - let selection = RowSelection::from(vec![RowSelector::select(1), RowSelector::skip(1)]); - let mask_builder = ReadPlanBuilder::new(1024) - .with_selection(Some(selection.clone())) - .with_row_selection_policy(RowSelectionPolicy::Mask); - let selector_builder = ReadPlanBuilder::new(1024) - .with_selection(Some(selection)) - .with_row_selection_policy(RowSelectionPolicy::Selectors); - - assert_eq!( - resolve_selection_policy_for_projection(mask_builder, &ProjectionMask::all(), None, 2) - .row_selection_policy(), - &RowSelectionPolicy::Mask - ); - assert_eq!( - resolve_selection_policy_for_projection( - selector_builder, - &ProjectionMask::all(), - None, - 2 - ) - .row_selection_policy(), - &RowSelectionPolicy::Selectors - ); - } - - #[test] - fn test_auto_sparse_loaded_ranges_force_selectors() { - let selection = RowSelection::from(vec![ - RowSelector::select(1), - RowSelector::skip(4), - RowSelector::select(1), - ]); - let plan_builder = ReadPlanBuilder::new(1024) - .with_selection(Some(selection)) - .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); - let offset_index = sparse_test_offset_index(); - 
- let plan_builder = resolve_selection_policy_for_projection( - plan_builder, - &ProjectionMask::all(), - Some(&offset_index), - 6, - ); - - assert_eq!( - plan_builder.row_selection_policy(), - &RowSelectionPolicy::Selectors - ); - } - - #[test] - fn test_auto_dense_loaded_ranges_preserve_mask() { - let selection = RowSelection::from(vec![ - RowSelector::select(1), - RowSelector::skip(1), - RowSelector::select(1), - RowSelector::skip(1), - RowSelector::select(1), - RowSelector::skip(1), - ]); - let plan_builder = ReadPlanBuilder::new(1024) - .with_selection(Some(selection)) - .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); - let offset_index = sparse_test_offset_index(); - - let plan_builder = resolve_selection_policy_for_projection( - plan_builder, - &ProjectionMask::all(), - Some(&offset_index), - 6, - ); - - assert_eq!( - plan_builder.row_selection_policy(), - &RowSelectionPolicy::Mask - ); - } - - #[test] - fn test_explicit_mask_keeps_sparse_loaded_ranges() { - let selection = RowSelection::from(vec![ - RowSelector::select(1), - RowSelector::skip(4), - RowSelector::select(1), - ]); - let plan_builder = ReadPlanBuilder::new(1024) - .with_selection(Some(selection)) - .with_row_selection_policy(RowSelectionPolicy::Mask); - let offset_index = sparse_test_offset_index(); - - let plan_builder = resolve_selection_policy_for_projection( - plan_builder, - &ProjectionMask::all(), - Some(&offset_index), - 6, - ); - - assert_eq!( - plan_builder.row_selection_policy(), - &RowSelectionPolicy::Mask - ); - - let mut plan = plan_builder.build(); - let RowSelectionCursor::Mask(cursor) = plan.row_selection_cursor_mut() else { - panic!("expected mask cursor"); - }; - assert!(cursor.is_sparse()); - } - - #[test] - fn test_loaded_ranges_intersects_many_ranges_across_projected_columns() { - let selection = RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(1), - RowSelector::skip(39), - RowSelector::select(1), - RowSelector::skip(39), - 
RowSelector::select(1), - RowSelector::skip(9), - ]); - let offset_index = vec![ - offset_index_column(&[0, 20, 40, 60, 80]), - offset_index_column(&[0, 15, 35, 55, 75]), - offset_index_column(&[0, 10, 30, 50, 70, 90]), - ]; - - let loaded = loaded_ranges_for_projection( - Some(&selection), - &ProjectionMask::all(), - Some(&offset_index), - 100, - ); - - assert_eq!( - loaded, - Some(LoadedRowRanges::new(vec![10..15, 50..55, 90..100], 100)) - ); - } - - #[test] - fn test_auto_expensive_fragmented_output_prefers_selectors() { - let selection = q38_like_fragmented_selection(); - let plan_builder = ReadPlanBuilder::new(1024) - .with_selection(Some(selection)) - .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); - let profile = ExpensiveOutputProfile { - variable_width_columns: 1, - uncompressed_bytes_per_row: 64.0, - }; - - let plan_builder = resolve_selection_policy_for_expensive_output( - plan_builder, - &ProjectionMask::all(), - None, - 7_800, - profile, - ); - - assert_eq!( - plan_builder.row_selection_policy(), - &RowSelectionPolicy::Selectors - ); - } - - #[test] - fn test_auto_expensive_fragmented_output_prefers_selectors_without_selector_count_gate() { - let selection = RowSelection::from(vec![ - RowSelector::select(1), - RowSelector::skip(12), - RowSelector::select(1), - RowSelector::skip(12), - RowSelector::select(1), - RowSelector::skip(12), - RowSelector::select(1), - RowSelector::skip(12), - ]); - let plan_builder = ReadPlanBuilder::new(1024) - .with_selection(Some(selection)) - .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); - let profile = ExpensiveOutputProfile { - variable_width_columns: 1, - uncompressed_bytes_per_row: 64.0, - }; - - let plan_builder = resolve_selection_policy_for_expensive_output( - plan_builder, - &ProjectionMask::all(), - None, - 52, - profile, - ); - - assert_eq!( - plan_builder.row_selection_policy(), - &RowSelectionPolicy::Selectors - ); - } - - #[test] - fn 
test_auto_cheap_fragmented_output_keeps_mask() { - let selection = q38_like_fragmented_selection(); - let plan_builder = ReadPlanBuilder::new(1024) - .with_selection(Some(selection)) - .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); - let profile = ExpensiveOutputProfile { - variable_width_columns: 1, - uncompressed_bytes_per_row: 8.0, - }; - - let plan_builder = resolve_selection_policy_for_expensive_output( - plan_builder, - &ProjectionMask::all(), - None, - 7_800, - profile, - ); - - assert_eq!( - plan_builder.row_selection_policy(), - &RowSelectionPolicy::Mask - ); - } - - #[test] - fn test_auto_moderate_selectivity_expensive_output_keeps_mask() { - let selection = q26_like_fragmented_selection(); - let plan_builder = ReadPlanBuilder::new(1024) - .with_selection(Some(selection)) - .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); - let profile = ExpensiveOutputProfile { - variable_width_columns: 1, - uncompressed_bytes_per_row: 64.0, - }; - - let plan_builder = resolve_selection_policy_for_expensive_output( - plan_builder, - &ProjectionMask::all(), - None, - 7_200, - profile, - ); - - assert_eq!( - plan_builder.row_selection_policy(), - &RowSelectionPolicy::Mask - ); - } - - fn q38_like_fragmented_selection() -> RowSelection { - let mut selectors = Vec::new(); - for _ in 0..600 { - selectors.push(RowSelector::select(1)); - selectors.push(RowSelector::skip(12)); - } - RowSelection::from(selectors) - } - - fn q26_like_fragmented_selection() -> RowSelection { - let mut selectors = Vec::new(); - for _ in 0..600 { - selectors.push(RowSelector::select(2)); - selectors.push(RowSelector::skip(10)); - } - RowSelection::from(selectors) - } - - fn sparse_test_offset_index() -> Vec { - vec![OffsetIndexMetaData { - page_locations: vec![ - PageLocation { - offset: 0, - compressed_page_size: 10, - first_row_index: 0, - }, - PageLocation { - offset: 10, - compressed_page_size: 10, - first_row_index: 2, - }, - PageLocation { - 
offset: 20, - compressed_page_size: 10, - first_row_index: 4, - }, - ], - unencoded_byte_array_data_bytes: None, - }] - } - - fn offset_index_column(first_rows: &[i64]) -> OffsetIndexMetaData { - OffsetIndexMetaData { - page_locations: first_rows - .iter() - .enumerate() - .map(|(idx, first_row_index)| PageLocation { - offset: (idx * 10) as i64, - compressed_page_size: 10, - first_row_index: *first_row_index, - }) - .collect(), - unencoded_byte_array_data_bytes: None, - } - } #[test] // Verify that the size of RowGroupDecoderState does not grow too large diff --git a/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs b/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs index a17b6d320741..f3092d3e9fd1 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs @@ -238,3 +238,362 @@ fn coalesce_adjacent_ranges(ranges: Vec>) -> Vec> { } out } + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::ProjectionMask; + use crate::arrow::arrow_reader::selection::LoadedRowRanges; + use crate::arrow::arrow_reader::{ + ReadPlanBuilder, RowSelection, RowSelectionCursor, RowSelectionPolicy, RowSelector, + }; + use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; + + #[test] + fn test_resolve_selection_policy_preserves_mask_choice() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(99), + RowSelector::select(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + + assert_eq!( + resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + None, + 101 + ) + .row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + #[test] + fn test_resolve_selection_policy_preserves_selector_choice() { + let selection = 
RowSelection::from(vec![RowSelector::select(128)]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1 }); + + assert_eq!( + resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + None, + 128 + ) + .row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_resolve_selection_policy_respects_explicit_policy() { + let selection = RowSelection::from(vec![RowSelector::select(1), RowSelector::skip(1)]); + let mask_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection.clone())) + .with_row_selection_policy(RowSelectionPolicy::Mask); + let selector_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Selectors); + + assert_eq!( + resolve_selection_policy_for_projection(mask_builder, &ProjectionMask::all(), None, 2) + .row_selection_policy(), + &RowSelectionPolicy::Mask + ); + assert_eq!( + resolve_selection_policy_for_projection( + selector_builder, + &ProjectionMask::all(), + None, + 2 + ) + .row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_sparse_loaded_ranges_force_selectors() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(4), + RowSelector::select(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let offset_index = sparse_test_offset_index(); + + let plan_builder = resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + Some(&offset_index), + 6, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_dense_loaded_ranges_preserve_mask() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + 
RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let offset_index = sparse_test_offset_index(); + + let plan_builder = resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + Some(&offset_index), + 6, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + #[test] + fn test_explicit_mask_keeps_sparse_loaded_ranges() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(4), + RowSelector::select(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Mask); + let offset_index = sparse_test_offset_index(); + + let plan_builder = resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + Some(&offset_index), + 6, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + + let mut plan = plan_builder.build(); + let RowSelectionCursor::Mask(cursor) = plan.row_selection_cursor_mut() else { + panic!("expected mask cursor"); + }; + assert!(cursor.is_sparse()); + } + + #[test] + fn test_loaded_ranges_intersects_many_ranges_across_projected_columns() { + let selection = RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(1), + RowSelector::skip(39), + RowSelector::select(1), + RowSelector::skip(39), + RowSelector::select(1), + RowSelector::skip(9), + ]); + let offset_index = vec![ + offset_index_column(&[0, 20, 40, 60, 80]), + offset_index_column(&[0, 15, 35, 55, 75]), + offset_index_column(&[0, 10, 30, 50, 70, 90]), + ]; + + let loaded = loaded_ranges_for_projection( + Some(&selection), + &ProjectionMask::all(), + Some(&offset_index), + 100, + ); + + assert_eq!( + 
loaded, + Some(LoadedRowRanges::new(vec![10..15, 50..55, 90..100], 100)) + ); + } + + #[test] + fn test_auto_expensive_fragmented_output_prefers_selectors() { + let selection = q38_like_fragmented_selection(); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 64.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 7_800, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_expensive_fragmented_output_prefers_selectors_without_selector_count_gate() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(12), + RowSelector::select(1), + RowSelector::skip(12), + RowSelector::select(1), + RowSelector::skip(12), + RowSelector::select(1), + RowSelector::skip(12), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 64.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 52, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_cheap_fragmented_output_keeps_mask() { + let selection = q38_like_fragmented_selection(); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 8.0, + }; + + let plan_builder = 
resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 7_800, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + #[test] + fn test_auto_moderate_selectivity_expensive_output_keeps_mask() { + let selection = q26_like_fragmented_selection(); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 64.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 7_200, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + fn q38_like_fragmented_selection() -> RowSelection { + let mut selectors = Vec::new(); + for _ in 0..600 { + selectors.push(RowSelector::select(1)); + selectors.push(RowSelector::skip(12)); + } + RowSelection::from(selectors) + } + + fn q26_like_fragmented_selection() -> RowSelection { + let mut selectors = Vec::new(); + for _ in 0..600 { + selectors.push(RowSelector::select(2)); + selectors.push(RowSelector::skip(10)); + } + RowSelection::from(selectors) + } + + fn sparse_test_offset_index() -> Vec { + vec![OffsetIndexMetaData { + page_locations: vec![ + PageLocation { + offset: 0, + compressed_page_size: 10, + first_row_index: 0, + }, + PageLocation { + offset: 10, + compressed_page_size: 10, + first_row_index: 2, + }, + PageLocation { + offset: 20, + compressed_page_size: 10, + first_row_index: 4, + }, + ], + unencoded_byte_array_data_bytes: None, + }] + } + + fn offset_index_column(first_rows: &[i64]) -> OffsetIndexMetaData { + OffsetIndexMetaData { + page_locations: first_rows + .iter() + .enumerate() + .map(|(idx, first_row_index)| PageLocation { + offset: (idx * 10) as i64, + compressed_page_size: 10, + 
first_row_index: *first_row_index, + }) + .collect(), + unencoded_byte_array_data_bytes: None, + } + } +} From 7ea81321c01fa00e22a3a6d913525b4a068b64a4 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Tue, 19 May 2026 09:41:22 +0800 Subject: [PATCH 16/32] fix(parquet): clean rustdoc link --- parquet/src/arrow/arrow_reader/selection/strategy.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/selection/strategy.rs b/parquet/src/arrow/arrow_reader/selection/strategy.rs index 795b5d0fd07c..c3db635c125d 100644 --- a/parquet/src/arrow/arrow_reader/selection/strategy.rs +++ b/parquet/src/arrow/arrow_reader/selection/strategy.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Concepts used to choose how a [`RowSelection`](super::RowSelection) is executed. +//! Concepts used to choose how a [`RowSelection`] is executed. //! //! The row-filter reader makes two related but separate decisions: //! 
From 5b9576b9b2c218ca23da4371f36795ab757c30ae Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Wed, 20 May 2026 08:50:34 +0800 Subject: [PATCH 17/32] refactor(parquet): clarify row filter cost model --- parquet/benches/arrow_reader_row_filter.rs | 124 +++++++++++++++++- parquet/src/arrow/arrow_reader/metrics.rs | 11 ++ parquet/src/arrow/arrow_reader/mod.rs | 4 + .../arrow/arrow_reader/selection/strategy.rs | 9 +- parquet/src/arrow/push_decoder/mod.rs | 91 +++++++++++-- .../push_decoder/reader_builder/cost_model.rs | 100 ++++++++++---- .../arrow/push_decoder/reader_builder/mod.rs | 2 +- 7 files changed, 302 insertions(+), 39 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 9cdd46122c2a..0e1e3018f9e3 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -79,6 +79,7 @@ use std::ops::Range; use std::sync::Arc; const COLUMN_NAMES: [&str; 4] = ["int64", "float64", "utf8View", "ts"]; +const UTF8_VIEW_MISSING_VALUE: &str = "__arrow_rs_missing__"; /// Generates a random string. Has a 50% chance to generate a short string (3–11 characters) /// or a long string (13–20 characters). 
@@ -220,6 +221,7 @@ enum ProjectionCase { AllColumns, ExcludeFilterColumn, FilterColumnsOnly, + Float64Only, Utf8Only, } @@ -229,6 +231,7 @@ impl std::fmt::Display for ProjectionCase { ProjectionCase::AllColumns => write!(f, "all_columns"), ProjectionCase::ExcludeFilterColumn => write!(f, "exclude_filter_column"), ProjectionCase::FilterColumnsOnly => write!(f, "filter_columns_only"), + ProjectionCase::Float64Only => write!(f, "float64_only"), ProjectionCase::Utf8Only => write!(f, "utf8_only"), } } @@ -381,6 +384,9 @@ enum FilterType { /// [ClickBench]: https://github.com/ClickHouse/ClickBench /// [Q21-Q27]: https://github.com/apache/datafusion/blob/b7177234e65cbbb2dcc04c252f6acd80bb026362/benchmarks/queries/clickbench/queries.sql#L22-L28 Utf8ViewNonEmpty, + /// Sparse variable-width predicate shaped like TPC-DS Q83 dynamic + /// `i_item_id` filters, where the predicate column is also projected. + Utf8ViewMissing, /// Scalar-only part of ClickBench Q37: /// /// ```sql @@ -396,6 +402,10 @@ enum FilterType { /// This synthetic predicate keeps that reader-level shape: cheap scalar /// filter columns protect an expensive `Utf8View` output column. ClickBenchQ37ScalarPrefix, + /// Scalar range predicate shaped like TPC-DS Q9 `ss_quantity BETWEEN ...` + /// subqueries. The selected rows are random and moderately selective, and + /// benchmark projections cover both count-only and numeric aggregate cases. 
+ TpcdsQ9QuantityRange, } impl std::fmt::Display for FilterType { @@ -409,7 +419,9 @@ impl std::fmt::Display for FilterType { FilterType::UnselectiveClustered => "ts < 9000", FilterType::Composite => "float64 > 99.0 AND ts >= 9000", FilterType::Utf8ViewNonEmpty => "utf8View <> ''", + FilterType::Utf8ViewMissing => "utf8View == ''", FilterType::ClickBenchQ37ScalarPrefix => "int64 == 62 AND ts < 9000", + FilterType::TpcdsQ9QuantityRange => "int64 > 0 AND int64 < 21", }; write!(f, "{s}") } @@ -464,6 +476,11 @@ impl FilterType { let scalar = StringViewArray::new_scalar(""); neq(array, &scalar) } + FilterType::Utf8ViewMissing => { + let array = batch.column(batch.schema().index_of("utf8View")?); + let scalar = StringViewArray::new_scalar(UTF8_VIEW_MISSING_VALUE); + eq(array, &scalar) + } // ClickBenchQ37ScalarPrefix: a cheap fragmented scalar predicate // evaluated before decoding a variable-width output column. FilterType::ClickBenchQ37ScalarPrefix => { @@ -473,6 +490,12 @@ impl FilterType { let date_like_range = lt(ts, &TimestampMillisecondArray::new_scalar(9000))?; and(&counter_match, &date_like_range) } + FilterType::TpcdsQ9QuantityRange => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let lower = gt(int64, &Int64Array::new_scalar(0))?; + let upper = lt(int64, &Int64Array::new_scalar(21))?; + and(&lower, &upper) + } } } @@ -486,8 +509,9 @@ impl FilterType { FilterType::UnselectiveUnclustered => &[1], FilterType::UnselectiveClustered => &[3], FilterType::Composite => &[1, 3], // Use float64 column and ts column as representative for composite - FilterType::Utf8ViewNonEmpty => &[2], + FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewMissing => &[2], FilterType::ClickBenchQ37ScalarPrefix => &[0, 3], + FilterType::TpcdsQ9QuantityRange => &[0], } } } @@ -843,10 +867,22 @@ fn benchmark_async_cost_model_focus(c: &mut Criterion) { ), AsyncFocusCase::new( "profile_q19_no_defer", - parquet_file, + parquet_file.clone(), FilterType::PointLookup, 
ProjectionCase::FilterColumnsOnly, ), + AsyncFocusCase::new( + "profile_sparse_projected_fact_scan", + parquet_file.clone(), + FilterType::PointLookup, + ProjectionCase::AllColumns, + ), + AsyncFocusCase::new( + "profile_q83_sparse_utf8_projected", + parquet_file.clone(), + FilterType::Utf8ViewMissing, + ProjectionCase::AllColumns, + ), AsyncFocusCase::new( "profile_small_scalar_no_defer", small_parquet_file.clone(), @@ -859,6 +895,18 @@ fn benchmark_async_cost_model_focus(c: &mut Criterion) { FilterType::ClickBenchQ37ScalarPrefix, ProjectionCase::Utf8Only, ), + AsyncFocusCase::new( + "profile_q9_quantity_count", + parquet_file.clone(), + FilterType::TpcdsQ9QuantityRange, + ProjectionCase::FilterColumnsOnly, + ), + AsyncFocusCase::new( + "profile_q9_quantity_avg", + parquet_file, + FilterType::TpcdsQ9QuantityRange, + ProjectionCase::Float64Only, + ), ]; let strategies = [ AsyncStrategy::FullPostFilter, @@ -879,6 +927,48 @@ fn benchmark_async_cost_model_focus(c: &mut Criterion) { } } +/// Isolate projected scans that do not construct a [`RowFilter`]. +/// +/// This tracks the reader-level shape seen in TPC-DS Q83 return-table scans: +/// a narrow primitive projection where row-level pushdown metrics are zero. +/// It deliberately lives outside the cost-model matrix because there is no +/// filter strategy to choose. 
+fn benchmark_projection_scan_focus(c: &mut Criterion) { + let parquet_file = Bytes::from(write_parquet_file()); + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let mut group = c.benchmark_group("arrow_reader_projection_scan_focus"); + + for (case_name, projection) in [("profile_q83_return_scan_primitives", vec![0, 1, 3])] { + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = metadata.file_metadata().schema_descr(); + let projection_mask = ProjectionMask::roots(schema_descr, projection); + + let bench_id = BenchmarkId::new(case_name, "async"); + let rt_captured = rt.handle().clone(); + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let projection_mask = projection_mask.clone(); + rt_captured.block_on(benchmark_async_reader_projected(reader, projection_mask)); + }); + }); + + let bench_id = BenchmarkId::new(case_name, "sync"); + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let projection_mask = projection_mask.clone(); + benchmark_sync_reader_projected(reader, projection_mask); + }); + }); + } +} + struct AsyncFocusCase { case_name: &'static str, parquet_file: Bytes, @@ -1004,6 +1094,7 @@ fn output_projection_for(filter_type: FilterType, projection_case: &ProjectionCa }) .collect(), ProjectionCase::FilterColumnsOnly => filter_columns.to_vec(), + ProjectionCase::Float64Only => vec![1], ProjectionCase::Utf8Only => vec![2], } } @@ -1097,6 +1188,20 @@ async fn benchmark_async_reader_post_filter( } } +async fn benchmark_async_reader_projected(reader: InMemoryReader, projection_mask: ProjectionMask) { + let mut stream = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .with_batch_size(8192) + .with_projection(projection_mask) + .build() + .unwrap(); + while let Some(b) = stream.next().await { + let batch = b.unwrap(); + 
std::hint::black_box(batch.num_rows()); + } +} + /// Like [`benchmark_async_reader`] but also threads `with_limit(limit)` into /// the stream builder. Used by the `LIMIT` benchmark below. async fn benchmark_async_reader_with_limit( @@ -1182,6 +1287,20 @@ fn benchmark_sync_reader_post_filter( } } +fn benchmark_sync_reader_projected(reader: InMemoryReader, projection_mask: ProjectionMask) { + let stream = ParquetRecordBatchReaderBuilder::try_new(reader.into_inner()) + .unwrap() + .with_batch_size(8192) + .with_projection(projection_mask) + .build() + .unwrap(); + + for b in stream { + let batch = b.unwrap(); + std::hint::black_box(batch.num_rows()); + } +} + /// Adapter to read asynchronously from in memory bytes and always loads the /// metadata with page indexes. #[derive(Debug, Clone)] @@ -1304,6 +1423,7 @@ criterion_group!( benchmark_sync_strategy_matrix, benchmark_async_strategy_matrix, benchmark_async_cost_model_focus, + benchmark_projection_scan_focus, benchmark_filters_with_limit, ); criterion_main!(benches); diff --git a/parquet/src/arrow/arrow_reader/metrics.rs b/parquet/src/arrow/arrow_reader/metrics.rs index ccc57f7b0f9d..cb17b9a77c14 100644 --- a/parquet/src/arrow/arrow_reader/metrics.rs +++ b/parquet/src/arrow/arrow_reader/metrics.rs @@ -275,6 +275,11 @@ impl ArrowReaderMetrics { self.load(|inner| &inner.cost_model_high_selectivity_no_pruning_count) } + /// Cost model: number of projected-predicate moderate-selectivity triggers + pub fn cost_model_projected_predicate_moderate_selectivity_count(&self) -> Option { + self.load(|inner| &inner.cost_model_projected_predicate_moderate_selectivity_count) + } + /// Cost model: number of fragmented moderate-selectivity triggers pub fn cost_model_fragmented_moderate_selectivity_count(&self) -> Option { self.load(|inner| &inner.cost_model_fragmented_moderate_selectivity_count) @@ -392,6 +397,9 @@ impl ArrowReaderMetrics { CostModelDecisionReason::HighSelectivityNoPruning => { 
&inner.cost_model_high_selectivity_no_pruning_count } + CostModelDecisionReason::ProjectedPredicateModerateSelectivity => { + &inner.cost_model_projected_predicate_moderate_selectivity_count + } CostModelDecisionReason::FragmentedModerateSelectivity => { &inner.cost_model_fragmented_moderate_selectivity_count } @@ -510,6 +518,8 @@ pub struct ArrowReaderMetricsInner { cost_model_pushdown_still_preferred_count: AtomicUsize, /// Number of high-selectivity no-pruning cost-model triggers cost_model_high_selectivity_no_pruning_count: AtomicUsize, + /// Number of projected-predicate moderate-selectivity cost-model triggers + cost_model_projected_predicate_moderate_selectivity_count: AtomicUsize, /// Number of fragmented moderate-selectivity cost-model triggers cost_model_fragmented_moderate_selectivity_count: AtomicUsize, /// Number of fragmented high-selectivity cost-model triggers @@ -546,6 +556,7 @@ impl ArrowReaderMetricsInner { cost_model_observation_incomplete_count: AtomicUsize::new(0), cost_model_pushdown_still_preferred_count: AtomicUsize::new(0), cost_model_high_selectivity_no_pruning_count: AtomicUsize::new(0), + cost_model_projected_predicate_moderate_selectivity_count: AtomicUsize::new(0), cost_model_fragmented_moderate_selectivity_count: AtomicUsize::new(0), cost_model_fragmented_high_selectivity_count: AtomicUsize::new(0), phase_profile_enabled, diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index c02789be6fa0..95801d723bcd 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1397,6 +1397,10 @@ impl ParquetRecordBatchReader { return Ok(buffered_batches.pop_front()); } + if self.post_filter.is_none() && self.post_selection_filter.is_none() { + return self.next_inner_decoded(); + } + loop { let Some(batch) = self.next_inner_decoded()? 
else { return Ok(None); diff --git a/parquet/src/arrow/arrow_reader/selection/strategy.rs b/parquet/src/arrow/arrow_reader/selection/strategy.rs index c3db635c125d..ccd9a81b9cb5 100644 --- a/parquet/src/arrow/arrow_reader/selection/strategy.rs +++ b/parquet/src/arrow/arrow_reader/selection/strategy.rs @@ -162,6 +162,10 @@ impl RowSelectionShape { pub(crate) enum CostModelDecisionReason { /// Predicate pushdown kept almost everything and did not produce useful pruning. HighSelectivityNoPruning, + /// Predicate columns are already part of the output projection, and the + /// observed selected-row ratio is high enough that sequential post-filtering + /// is likely cheaper than many selected output reads. + ProjectedPredicateModerateSelectivity, /// Fragmented runs with moderate selectivity often pay many small skip/read costs. FragmentedModerateSelectivity, /// Fragmented runs with high selectivity usually decode most rows plus pay pushdown overhead. @@ -191,7 +195,7 @@ pub(crate) struct CostModelObservation { impl CostModelObservation { pub(crate) const OBSERVATION_ROW_GROUPS: usize = 1; - const FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO: f64 = 0.08; + pub(crate) const MODERATE_SELECTIVITY_MIN_RATIO: f64 = 0.08; pub(crate) fn trigger_reason(self) -> CostModelDecisionReason { if self.observed_row_groups < Self::OBSERVATION_ROW_GROUPS { @@ -210,7 +214,7 @@ impl CostModelObservation { } let selected_ratio = shape.selected_ratio(); - if (Self::FRAGMENTED_MODERATE_SELECTIVITY_MIN_RATIO..0.50).contains(&selected_ratio) { + if (Self::MODERATE_SELECTIVITY_MIN_RATIO..0.50).contains(&selected_ratio) { return CostModelDecisionReason::FragmentedModerateSelectivity; } if selected_ratio < 0.50 { @@ -224,6 +228,7 @@ impl CostModelObservation { matches!( self.trigger_reason(), CostModelDecisionReason::HighSelectivityNoPruning + | CostModelDecisionReason::ProjectedPredicateModerateSelectivity | CostModelDecisionReason::FragmentedModerateSelectivity | 
CostModelDecisionReason::FragmentedHighSelectivity ) diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 7c7f2747b75a..a3b5b85e0d9f 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1361,19 +1361,20 @@ mod test { } #[test] - fn test_decoder_auto_cost_model_current_row_uses_predicate_cache() { + fn test_decoder_auto_cost_model_switches_for_projected_predicate_after_observation() { let data = &COST_MODEL_TEST_FILE_DATA; let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); - let metrics = ArrowReaderMetrics::enabled(); + let metrics = ArrowReaderMetrics::enabled_with_phase_profile(); + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); let row_filter_a = ArrowPredicateFn::new( ProjectionMask::columns(&schema_descr, ["a"]), move |batch: RecordBatch| { - let scalar_neg_one = Int64Array::new_scalar(-1); - let column = batch.column(0).as_primitive::(); - gt(column, &scalar_neg_one) + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + Ok(first_rows_per_hundred_filter(&batch, 20)) }, ); @@ -1386,12 +1387,66 @@ mod test { .build() .unwrap(); - let batch = next_batch_with_data(&mut decoder, data).unwrap(); - assert_eq!(batch, TEST_BATCH.slice(0, 100).project(&[0, 2]).unwrap()); + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_c_first_rows_per_hundred(row_group_idx * 100, 100, 20) + ); + } + assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); - assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + 
assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.cost_model_projected_predicate_moderate_selectivity_count(), + Some(1) + ); assert_eq!(metrics.records_read_from_cache(), Some(100)); + + let report = metrics.phase_profile_report().unwrap(); + assert_eq!( + phase_profile_count(&report, "post_selection_apply_filter"), + 1 + ); + assert_eq!(phase_profile_count(&report, "post_filter_apply_filter"), 3); + } + + #[test] + fn test_decoder_auto_cost_model_keeps_pushdown_for_sparse_projected_predicate() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 5)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_c_first_rows_per_hundred(row_group_idx * 100, 100, 5) + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_pushdown_still_preferred_count(), Some(1)); } #[test] @@ -2256,6 +2311,26 @@ mod test { filter_record_batch(&projected, &filter).unwrap() } + fn first_rows_per_hundred_filter(batch: &RecordBatch, rows_per_hundred: i64) -> BooleanArray { + let 
column = batch.column(0).as_primitive::(); + BooleanArray::from( + (0..batch.num_rows()) + .map(|idx| column.value(idx) % 100 < rows_per_hundred) + .collect::>(), + ) + } + + fn expected_a_c_first_rows_per_hundred( + offset: usize, + len: usize, + rows_per_hundred: i64, + ) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = first_rows_per_hundred_filter(&batch, rows_per_hundred); + let projected = batch.project(&[0, 2]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + fn expected_c_every_other(offset: usize, len: usize) -> RecordBatch { let batch = TEST_BATCH.slice(offset, len); let filter = BooleanArray::from((0..len).map(|idx| idx % 2 == 0).collect::>()); diff --git a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs index 192e94423c0f..3aedaa96ea88 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -20,11 +20,15 @@ //! The cost model is intentionally adaptive rather than purely static. There //! are two ways to enter post-filter execution: //! -//! * a narrow static rule starts there for variable-width predicate columns, -//! where building fragmented pushdown selections is commonly expensive +//! * a narrow static rule starts there for variable-width predicate columns +//! that are not already part of the output projection, where building +//! fragmented pushdown selections is commonly expensive //! * the first eligible row group runs predicate pushdown, records the actual -//! `RowSelection` shape, and lets later row groups use post-filter if that -//! shape suggests pushdown is doing extra work without pruning enough rows +//! `RowSelection` shape, and lets later row groups use post-filter if the +//! shape suggests pushdown is doing extra work without pruning enough rows. +//! When predicate columns are already part of the output projection, the +//! 
observed selected-row ratio can also choose post-filter without requiring +//! fragmented selected runs. //! //! ```text //! Start @@ -108,7 +112,7 @@ impl RowGroupReaderBuilder { self.build_post_filter_read_projection(filter) } - pub(super) fn should_start_with_post_filter_for_variable_width_predicate( + pub(super) fn should_start_with_post_filter_for_unprojected_variable_width_predicate( &self, filter: &RowFilter, row_group_idx: usize, @@ -122,7 +126,11 @@ impl RowGroupReaderBuilder { return false; }; - self.projection_has_variable_width_leaf(row_group_idx, &predicate_projection) + let predicate_already_projected = + self.projection_includes_all(&self.projection, &predicate_projection); + + !predicate_already_projected + && self.projection_has_variable_width_leaf(row_group_idx, &predicate_projection) } fn build_post_filter_read_projection(&self, filter: &RowFilter) -> Option { @@ -166,6 +174,12 @@ impl RowGroupReaderBuilder { }) } + fn projection_includes_all(&self, projection: &ProjectionMask, other: &ProjectionMask) -> bool { + let schema = self.metadata.file_metadata().schema_descr(); + (0..schema.num_columns()) + .all(|leaf_idx| !other.leaf_included(leaf_idx) || projection.leaf_included(leaf_idx)) + } + pub(super) fn observe_cost_model_candidate( &mut self, decision: RowSelectionStrategyDecision, @@ -176,35 +190,43 @@ impl RowGroupReaderBuilder { return; } - let RowGroupCostModelState::Observing { observation } = &mut self.cost_model_state else { - return; - }; - - let mut shape = decision.shape; - if shape.total_rows() == 0 { - // `None` selection means the predicate kept the whole row group. - // Represent it as one selected run so the cost model can - // treat "no pruning" as an observed high-selectivity case. 
- shape = RowSelectionShape { - selected_rows: row_count, - skipped_rows: 0, - selector_count: 1, - selected_run_count: 1, - skipped_run_count: 0, + let observation = { + let RowGroupCostModelState::Observing { observation } = &mut self.cost_model_state + else { + return; }; - } - observation.observed_row_groups += 1; - observation.shape.add_assign(shape); + let mut shape = decision.shape; + if shape.total_rows() == 0 { + // `None` selection means the predicate kept the whole row group. + // Represent it as one selected run so the cost model can + // treat "no pruning" as an observed high-selectivity case. + shape = RowSelectionShape { + selected_rows: row_count, + skipped_rows: 0, + selector_count: 1, + selected_run_count: 1, + skipped_run_count: 0, + }; + } + + observation.observed_row_groups += 1; + observation.shape.add_assign(shape); + *observation + }; self.metrics.record_cost_model_observed_row_group(); - let reason = observation.trigger_reason(); + let reason = self.cost_model_reason_with_projection_context(observation); if matches!(reason, CostModelDecisionReason::ObservationIncomplete) { self.metrics.record_cost_model_trigger(reason); return; } - let prefers_post_filter = observation.prefers_post_filter(); + let prefers_post_filter = observation.prefers_post_filter() + || matches!( + reason, + CostModelDecisionReason::ProjectedPredicateModerateSelectivity + ); self.metrics.record_cost_model_trigger(reason); if prefers_post_filter && self.post_filter_cost_model_supported(budget) { @@ -214,6 +236,32 @@ impl RowGroupReaderBuilder { } } + fn cost_model_reason_with_projection_context( + &self, + observation: CostModelObservation, + ) -> CostModelDecisionReason { + let reason = observation.trigger_reason(); + if !matches!(reason, CostModelDecisionReason::PushdownStillPreferred) { + return reason; + } + + let Some(filter) = self.filter.as_ref() else { + return reason; + }; + let Some(predicate_projection) = filter.union_projection() else { + return reason; + }; 
+ + let selected_ratio = observation.shape.selected_ratio(); + if self.projection_includes_all(&self.projection, &predicate_projection) + && selected_ratio >= CostModelObservation::MODERATE_SELECTIVITY_MIN_RATIO + { + CostModelDecisionReason::ProjectedPredicateModerateSelectivity + } else { + reason + } + } + pub(super) fn post_filter_cost_model_supported(&self, budget: RowBudget) -> bool { let Some(filter) = self.filter.as_ref() else { return false; diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index 6aec47daf9d0..2edc9b777311 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -559,7 +559,7 @@ impl RowGroupReaderBuilder { })); }; - if self.should_start_with_post_filter_for_variable_width_predicate( + if self.should_start_with_post_filter_for_unprojected_variable_width_predicate( &filter, row_group_info.row_group_idx, row_group_info.budget, From bfee76e9f417e6ca9210a3ea617dd174a2e6fd18 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Wed, 20 May 2026 08:58:39 +0800 Subject: [PATCH 18/32] fix(parquet): keep cost model test feature neutral --- parquet/src/arrow/push_decoder/mod.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index a3b5b85e0d9f..518e1895b2f0 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1366,7 +1366,7 @@ mod test { let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); - let metrics = ArrowReaderMetrics::enabled_with_phase_profile(); + let metrics = ArrowReaderMetrics::enabled(); let predicate_rows = Arc::new(AtomicUsize::new(0)); let predicate_rows_for_filter = Arc::clone(&predicate_rows); @@ -1404,13 +1404,6 @@ mod test { 
Some(1) ); assert_eq!(metrics.records_read_from_cache(), Some(100)); - - let report = metrics.phase_profile_report().unwrap(); - assert_eq!( - phase_profile_count(&report, "post_selection_apply_filter"), - 1 - ); - assert_eq!(phase_profile_count(&report, "post_filter_apply_filter"), 3); } #[test] From bd48c95ac0b5fa200fc35a97ff4164ad369beb8d Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Wed, 20 May 2026 09:05:51 +0800 Subject: [PATCH 19/32] fix(parquet): satisfy row filter bench clippy --- parquet/benches/arrow_reader_row_filter.rs | 46 +++++++++++----------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 0e1e3018f9e3..9b91689d0145 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -942,31 +942,31 @@ fn benchmark_projection_scan_focus(c: &mut Criterion) { let mut group = c.benchmark_group("arrow_reader_projection_scan_focus"); - for (case_name, projection) in [("profile_q83_return_scan_primitives", vec![0, 1, 3])] { - let reader = InMemoryReader::try_new(&parquet_file).unwrap(); - let metadata = Arc::clone(reader.metadata()); - let schema_descr = metadata.file_metadata().schema_descr(); - let projection_mask = ProjectionMask::roots(schema_descr, projection); - - let bench_id = BenchmarkId::new(case_name, "async"); - let rt_captured = rt.handle().clone(); - group.bench_function(bench_id, |b| { - b.iter(|| { - let reader = reader.clone(); - let projection_mask = projection_mask.clone(); - rt_captured.block_on(benchmark_async_reader_projected(reader, projection_mask)); - }); + let case_name = "profile_q83_return_scan_primitives"; + let projection = vec![0, 1, 3]; + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = metadata.file_metadata().schema_descr(); + let projection_mask = ProjectionMask::roots(schema_descr, 
projection); + + let bench_id = BenchmarkId::new(case_name, "async"); + let rt_captured = rt.handle().clone(); + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let projection_mask = projection_mask.clone(); + rt_captured.block_on(benchmark_async_reader_projected(reader, projection_mask)); }); - - let bench_id = BenchmarkId::new(case_name, "sync"); - group.bench_function(bench_id, |b| { - b.iter(|| { - let reader = reader.clone(); - let projection_mask = projection_mask.clone(); - benchmark_sync_reader_projected(reader, projection_mask); - }); + }); + + let bench_id = BenchmarkId::new(case_name, "sync"); + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let projection_mask = projection_mask.clone(); + benchmark_sync_reader_projected(reader, projection_mask); }); - } + }); } struct AsyncFocusCase { From 7c5fde8f96dad246e61173f9fae2f59dff7b5fed Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Wed, 20 May 2026 09:21:26 +0800 Subject: [PATCH 20/32] fix(parquet): keep sparse projected filters on pushdown --- parquet/src/arrow/arrow_reader/selection/strategy.rs | 1 + parquet/src/arrow/push_decoder/reader_builder/cost_model.rs | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/selection/strategy.rs b/parquet/src/arrow/arrow_reader/selection/strategy.rs index ccd9a81b9cb5..032f9568f165 100644 --- a/parquet/src/arrow/arrow_reader/selection/strategy.rs +++ b/parquet/src/arrow/arrow_reader/selection/strategy.rs @@ -196,6 +196,7 @@ pub(crate) struct CostModelObservation { impl CostModelObservation { pub(crate) const OBSERVATION_ROW_GROUPS: usize = 1; pub(crate) const MODERATE_SELECTIVITY_MIN_RATIO: f64 = 0.08; + pub(crate) const PROJECTED_PREDICATE_MIN_RATIO: f64 = 0.15; pub(crate) fn trigger_reason(self) -> CostModelDecisionReason { if self.observed_row_groups < Self::OBSERVATION_ROW_GROUPS { diff --git 
a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs index 3aedaa96ea88..70f53d948bc1 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -253,8 +253,11 @@ impl RowGroupReaderBuilder { }; let selected_ratio = observation.shape.selected_ratio(); + // Projected predicates can reuse decoded predicate values, but sparse + // filters can still win with page pruning. Use a higher bar than the + // fragmented-run threshold before switching this case to post-filter. if self.projection_includes_all(&self.projection, &predicate_projection) - && selected_ratio >= CostModelObservation::MODERATE_SELECTIVITY_MIN_RATIO + && selected_ratio >= CostModelObservation::PROJECTED_PREDICATE_MIN_RATIO { CostModelDecisionReason::ProjectedPredicateModerateSelectivity } else { From 75f0d9f0ae57d67e75454201a03cf6b3eacb63b2 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Wed, 20 May 2026 11:50:45 +0800 Subject: [PATCH 21/32] Limit projected predicate cost model switch --- .../arrow/arrow_reader/selection/strategy.rs | 1 + parquet/src/arrow/push_decoder/mod.rs | 36 +++++++++++++++++++ .../push_decoder/reader_builder/cost_model.rs | 8 +++-- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/selection/strategy.rs b/parquet/src/arrow/arrow_reader/selection/strategy.rs index 032f9568f165..08b49382c7bf 100644 --- a/parquet/src/arrow/arrow_reader/selection/strategy.rs +++ b/parquet/src/arrow/arrow_reader/selection/strategy.rs @@ -197,6 +197,7 @@ impl CostModelObservation { pub(crate) const OBSERVATION_ROW_GROUPS: usize = 1; pub(crate) const MODERATE_SELECTIVITY_MIN_RATIO: f64 = 0.08; pub(crate) const PROJECTED_PREDICATE_MIN_RATIO: f64 = 0.15; + pub(crate) const PROJECTED_PREDICATE_MAX_RATIO: f64 = 0.50; pub(crate) fn trigger_reason(self) -> CostModelDecisionReason { if self.observed_row_groups < 
Self::OBSERVATION_ROW_GROUPS { diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 518e1895b2f0..0f209ee07776 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1442,6 +1442,42 @@ mod test { assert_eq!(metrics.cost_model_pushdown_still_preferred_count(), Some(1)); } + #[test] + fn test_decoder_auto_cost_model_keeps_pushdown_for_high_selectivity_projected_predicate() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 90)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_c_first_rows_per_hundred(row_group_idx * 100, 100, 90) + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_pushdown_still_preferred_count(), Some(1)); + } + #[test] fn test_decoder_auto_cost_model_with_row_selection_does_not_evaluate_current_row_group_twice() { let data = &COST_MODEL_TEST_FILE_DATA; diff --git a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs 
index 70f53d948bc1..2f69b4417a71 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -254,10 +254,12 @@ impl RowGroupReaderBuilder { let selected_ratio = observation.shape.selected_ratio(); // Projected predicates can reuse decoded predicate values, but sparse - // filters can still win with page pruning. Use a higher bar than the - // fragmented-run threshold before switching this case to post-filter. + // or clustered filters can still win with page pruning. Keep this + // shortcut to moderate selectivity before switching to post-filter. if self.projection_includes_all(&self.projection, &predicate_projection) - && selected_ratio >= CostModelObservation::PROJECTED_PREDICATE_MIN_RATIO + && (CostModelObservation::PROJECTED_PREDICATE_MIN_RATIO + ..CostModelObservation::PROJECTED_PREDICATE_MAX_RATIO) + .contains(&selected_ratio) { CostModelDecisionReason::ProjectedPredicateModerateSelectivity } else { From eab1642618e378c37c3d5bf7fc2d142c7887b917 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Thu, 21 May 2026 15:02:55 +0800 Subject: [PATCH 22/32] Support post-filter for whole nested projections --- parquet/benches/arrow_reader_row_filter.rs | 194 +++++++++++++++++- parquet/src/arrow/arrow_reader/post_filter.rs | 65 ++++-- parquet/src/arrow/mod.rs | 47 +++++ parquet/src/arrow/push_decoder/mod.rs | 137 ++++++++++++- .../push_decoder/reader_builder/cost_model.rs | 28 ++- 5 files changed, 437 insertions(+), 34 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 9b91689d0145..029ccd3ae67e 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -52,7 +52,9 @@ //! - unsel_clustered: for Unselective Clustered – in each 10K-row block, rows with an offset >= 1000 are "unsel_clustered". //! 
-use arrow::array::{ArrayRef, BooleanArray, Float64Array, Int64Array, TimestampMillisecondArray}; +use arrow::array::{ + ArrayRef, BooleanArray, Float64Array, Int64Array, StructArray, TimestampMillisecondArray, +}; use arrow::compute::and; use arrow::compute::kernels::cmp::{eq, gt, lt, neq}; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; @@ -200,6 +202,10 @@ fn write_parquet_file() -> Vec { /// returning the buffer. fn write_parquet_file_with_rows(total_rows: usize, row_group_size: usize) -> Vec { let batch = create_record_batch(total_rows); + write_record_batch_to_parquet(&batch, row_group_size) +} + +fn write_record_batch_to_parquet(batch: &RecordBatch, row_group_size: usize) -> Vec { let schema = batch.schema(); let props = WriterProperties::builder() .set_compression(Compression::SNAPPY) @@ -214,6 +220,37 @@ fn write_parquet_file_with_rows(total_rows: usize, row_group_size: usize) -> Vec buffer } +fn create_nested_record_batch(size: usize) -> RecordBatch { + let tag = Arc::new(StringViewArray::from_iter_values( + (0..size).map(|idx| format!("tag_{}", idx % 7)), + )) as ArrayRef; + let payload = StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int64, false)), + Arc::new(Int64Array::from_iter_values( + (0..size).map(|idx| idx as i64 + 1_000), + )) as ArrayRef, + ), + ( + Arc::new(Field::new("label", DataType::Utf8View, false)), + Arc::new(StringViewArray::from_iter_values( + (0..size).map(|idx| format!("payload_{idx}")), + )) as ArrayRef, + ), + ]); + let payload = Arc::new(payload) as ArrayRef; + let value = Arc::new(Int64Array::from_iter_values( + (0..size).map(|idx| idx as i64 + 10_000), + )) as ArrayRef; + + RecordBatch::try_from_iter(vec![("tag", tag), ("payload", payload), ("value", value)]).unwrap() +} + +fn write_nested_parquet_file_with_rows(total_rows: usize, row_group_size: usize) -> Vec { + let batch = create_nested_record_batch(total_rows); + write_record_batch_to_parquet(&batch, row_group_size) +} + /// ProjectionCase 
defines the projection mode for the benchmark: /// either projecting all columns or excluding the column that is used for filtering. #[derive(Clone, Copy)] @@ -1122,6 +1159,39 @@ fn row_filter_for(filter_type: FilterType, pred_mask: ProjectionMask) -> RowFilt RowFilter::new(vec![Box::new(filter)]) } +#[derive(Clone, Copy)] +enum NestedFilterType { + AlwaysTrueTag, + TagNotZero, +} + +impl std::fmt::Display for NestedFilterType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::AlwaysTrueTag => write!(f, "always_true_tag"), + Self::TagNotZero => write!(f, "tag_not_zero"), + } + } +} + +impl NestedFilterType { + fn filter_batch(self, batch: &RecordBatch) -> arrow::error::Result { + match self { + Self::AlwaysTrueTag => Ok(BooleanArray::from(vec![true; batch.num_rows()])), + Self::TagNotZero => { + let tag = batch.column(batch.schema().index_of("tag")?); + let scalar = StringViewArray::new_scalar("tag_0"); + neq(tag, &scalar) + } + } + } +} + +fn nested_row_filter_for(filter_type: NestedFilterType, pred_mask: ProjectionMask) -> RowFilter { + let filter = ArrowPredicateFn::new(pred_mask, move |batch| filter_type.filter_batch(&batch)); + RowFilter::new(vec![Box::new(filter)]) +} + /// Use async API async fn benchmark_async_reader( reader: InMemoryReader, @@ -1188,6 +1258,33 @@ async fn benchmark_async_reader_post_filter( } } +async fn benchmark_async_reader_post_filter_nested( + reader: InMemoryReader, + read_projection: ProjectionMask, + output_column_names: &[&str], + filter_type: NestedFilterType, +) { + let mut stream = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .with_batch_size(8192) + .with_projection(read_projection) + .build() + .unwrap(); + + while let Some(b) = stream.next().await { + let batch = b.unwrap(); + let filter = filter_type.filter_batch(&batch).unwrap(); + let filtered = arrow_select::filter::filter_record_batch(&batch, &filter).unwrap(); + let output_projection = 
output_column_names + .iter() + .map(|name| filtered.schema().index_of(name).unwrap()) + .collect::>(); + let output = filtered.project(&output_projection).unwrap(); + std::hint::black_box(output.num_rows()); + } +} + async fn benchmark_async_reader_projected(reader: InMemoryReader, projection_mask: ProjectionMask) { let mut stream = ParquetRecordBatchStreamBuilder::new(reader) .await @@ -1417,6 +1514,100 @@ fn benchmark_filters_with_limit(c: &mut Criterion) { } } +/// Focused nested-output case for post-filter cost modeling. +/// +/// The predicate column is an unprojected variable-width scalar column, and the +/// output is a whole nested `Struct` root. This isolates the reader case enabled +/// by root-aware post-filter projection without requiring recursive nested-child +/// projection. +fn benchmark_async_nested_post_filter_focus(c: &mut Criterion) { + let parquet_file = Bytes::from(write_nested_parquet_file_with_rows( + TOTAL_ROWS, + ROW_GROUP_SIZE, + )); + let strategies = [ + AsyncStrategy::FullPostFilter, + AsyncStrategy::PushdownAutoCostModel, + AsyncStrategy::PushdownMask, + AsyncStrategy::PushdownSelectors, + ]; + + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let mut group = c.benchmark_group("arrow_reader_row_filter_async_nested_post_filter_focus"); + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = metadata.file_metadata().schema_descr(); + let output_projection = ProjectionMask::columns(schema_descr, ["payload"]); + let read_projection = ProjectionMask::columns(schema_descr, ["tag", "payload"]); + let pred_mask = ProjectionMask::columns(schema_descr, ["tag"]); + let filter_cases = [ + NestedFilterType::AlwaysTrueTag, + NestedFilterType::TagNotZero, + ]; + + for filter_case in filter_cases { + for strategy in strategies { + let bench_id = BenchmarkId::new( + format!("whole_struct_output/{filter_case}"), + 
strategy.to_string(), + ); + let rt_captured = rt.handle().clone(); + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let pred_mask = pred_mask.clone(); + let output_projection = output_projection.clone(); + let read_projection = read_projection.clone(); + rt_captured.block_on(async { + match strategy { + AsyncStrategy::FullPostFilter => { + benchmark_async_reader_post_filter_nested( + reader, + read_projection, + &["payload"], + filter_case, + ) + .await + } + AsyncStrategy::PushdownAutoCostModel => { + benchmark_async_reader_with_policy( + reader, + output_projection, + nested_row_filter_for(filter_case, pred_mask), + RowSelectionPolicy::default(), + ) + .await + } + AsyncStrategy::PushdownSelectors => { + benchmark_async_reader_with_policy( + reader, + output_projection, + nested_row_filter_for(filter_case, pred_mask), + RowSelectionPolicy::Selectors, + ) + .await + } + AsyncStrategy::PushdownMask => { + benchmark_async_reader_with_policy( + reader, + output_projection, + nested_row_filter_for(filter_case, pred_mask), + RowSelectionPolicy::Mask, + ) + .await + } + } + }) + }); + }); + } + } +} + criterion_group!( benches, benchmark_filters_and_projections, @@ -1425,5 +1616,6 @@ criterion_group!( benchmark_async_cost_model_focus, benchmark_projection_scan_focus, benchmark_filters_with_limit, + benchmark_async_nested_post_filter_focus, ); criterion_main!(benches); diff --git a/parquet/src/arrow/arrow_reader/post_filter.rs b/parquet/src/arrow/arrow_reader/post_filter.rs index d7d02774e05c..096436fcb349 100644 --- a/parquet/src/arrow/arrow_reader/post_filter.rs +++ b/parquet/src/arrow/arrow_reader/post_filter.rs @@ -242,36 +242,59 @@ fn projection_indices( read_projection: &ProjectionMask, target_projection: &ProjectionMask, ) -> Result> { - // Convert parquet leaf positions to RecordBatch column positions after the - // larger read projection has been decoded. 
For example: + validate_post_filter_projection(parquet_schema, read_projection, target_projection)?; + + // Convert parquet projection masks to top-level RecordBatch column + // positions after the larger read projection has been decoded. For example: // // ```text - // parquet leaves: a b c d - // read projection: a c d => batch columns [a, c, d] - // target: c => target index [1] + // parquet leaves: a b.aa b.bb c + // read projection: a b.aa b.bb => batch columns [a, b] + // target: b.aa b.bb => target index [1] // ``` - let mut indices = Vec::new(); - let mut read_idx = 0; + let read_roots = read_projection.included_root_column_indices(parquet_schema); + target_projection + .included_root_column_indices(parquet_schema) + .into_iter() + .map(|target_root| { + read_roots + .iter() + .position(|read_root| *read_root == target_root) + .ok_or_else(|| { + general_err!( + "post-filter target root column {target_root} not present in read projection" + ) + }) + }) + .collect() +} + +fn validate_post_filter_projection( + parquet_schema: &SchemaDescriptor, + read_projection: &ProjectionMask, + target_projection: &ProjectionMask, +) -> Result<()> { + // Post-filter only projects already-decoded batches by top-level Arrow + // field index. It can keep or drop a whole nested root, but it cannot + // recursively project nested children such as `b.aa` without `b.bb`. 
+ if !read_projection.selects_whole_root_columns(parquet_schema) { + return Err(general_err!( + "post-filter cost model does not support partial nested read projections" + )); + } + if !target_projection.selects_whole_root_columns(parquet_schema) { + return Err(general_err!( + "post-filter cost model does not support partial nested target projections" + )); + } for leaf_idx in 0..parquet_schema.num_columns() { - if read_projection.leaf_included(leaf_idx) { - let root = parquet_schema.get_column_root(leaf_idx); - if !root.is_primitive() { - return Err(general_err!( - "post-filter cost model does not support nested read column {}", - root.name() - )); - } - if target_projection.leaf_included(leaf_idx) { - indices.push(read_idx); - } - read_idx += 1; - } else if target_projection.leaf_included(leaf_idx) { + if target_projection.leaf_included(leaf_idx) && !read_projection.leaf_included(leaf_idx) { return Err(general_err!( "post-filter target projection includes leaf column {leaf_idx} not present in read projection" )); } } - Ok(indices) + Ok(()) } diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 52152988166f..2c9d58fd96b9 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -381,6 +381,53 @@ impl ProjectionMask { self.mask.as_ref().map(|m| m[leaf_idx]).unwrap_or(true) } + /// Returns top-level root column indices that have at least one included + /// leaf, preserving their physical order in the parquet schema. 
+ pub(crate) fn included_root_column_indices(&self, schema: &SchemaDescriptor) -> Vec { + let num_roots = schema.root_schema().get_fields().len(); + let mut seen = vec![false; num_roots]; + let mut roots = Vec::new(); + + for leaf_idx in 0..schema.num_columns() { + if !self.leaf_included(leaf_idx) { + continue; + } + let root_idx = schema.get_column_root_idx(leaf_idx); + if !seen[root_idx] { + seen[root_idx] = true; + roots.push(root_idx); + } + } + + roots + } + + /// Returns true if each top-level root column is either fully selected or + /// fully skipped. + /// + /// This is useful for code paths that project decoded [`RecordBatch`] + /// values by top-level Arrow field index. A full `struct` root can be moved + /// as one batch column, but selecting only `struct.child` would require + /// recursively trimming the nested array. + pub(crate) fn selects_whole_root_columns(&self, schema: &SchemaDescriptor) -> bool { + let num_roots = schema.root_schema().get_fields().len(); + let mut root_leaf_counts = vec![0usize; num_roots]; + let mut included_leaf_counts = vec![0usize; num_roots]; + + for leaf_idx in 0..schema.num_columns() { + let root_idx = schema.get_column_root_idx(leaf_idx); + root_leaf_counts[root_idx] += 1; + if self.leaf_included(leaf_idx) { + included_leaf_counts[root_idx] += 1; + } + } + + included_leaf_counts + .into_iter() + .zip(root_leaf_counts) + .all(|(included, total)| included == 0 || included == total) + } + /// Union two projection masks /// /// Example: diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 0f209ee07776..33f655540287 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -646,9 +646,12 @@ mod test { use arrow_array::builder::{ArrayBuilder, StringViewBuilder}; use arrow_array::cast::AsArray; use arrow_array::types::Int64Type; - use arrow_array::{ArrayRef, BooleanArray, Int64Array, RecordBatch, StringViewArray}; + use arrow_array::{ + 
ArrayRef, BooleanArray, Int64Array, RecordBatch, StringViewArray, StructArray, + }; #[cfg(feature = "async")] - use arrow_schema::{DataType, Field, Schema}; + use arrow_schema::Schema; + use arrow_schema::{DataType, Field}; use arrow_select::concat::concat_batches; use arrow_select::filter::filter_record_batch; use bytes::Bytes; @@ -1244,6 +1247,97 @@ mod test { assert_eq!(phase_profile_count(&report, "output_selection_resolve"), 0); } + #[test] + fn test_decoder_post_filter_supports_whole_nested_output_projection() { + let data = &NESTED_COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); + let row_filter_tag = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["tag"]), + move |batch: RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + Ok(BooleanArray::from(vec![true; batch.num_rows()])) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["payload"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_tag)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + NESTED_TEST_BATCH + .slice(row_group_idx * 100, 100) + .project(&[1]) + .unwrap() + ); + } + + assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + 
assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_post_filter_keeps_partial_nested_predicate_on_pushdown() { + let data = &NESTED_COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_payload_label = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["payload.label"]), + move |batch: RecordBatch| { + let payload = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(payload.num_columns(), 1); + Ok(BooleanArray::from(vec![true; batch.num_rows()])) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["payload"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_payload_label)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + NESTED_TEST_BATCH + .slice(row_group_idx * 100, 100) + .project(&[1]) + .unwrap() + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + #[test] fn test_decoder_auto_cost_model_post_filter_applies_fragmented_filter() { let data = &COST_MODEL_TEST_FILE_DATA; @@ -2096,8 +2190,45 @@ mod test { static COST_MODEL_TEST_FILE_DATA: LazyLock = LazyLock::new(|| write_test_file(100, 50)); + static NESTED_TEST_BATCH: LazyLock = LazyLock::new(|| { + let tag: ArrayRef = Arc::new(StringViewArray::from_iter_values( + (0..400).map(|idx| format!("tag_{}", idx % 7)), + )); + let payload = StructArray::from(vec![ + ( + 
Arc::new(Field::new("id", DataType::Int64, false)), + Arc::new(Int64Array::from_iter_values(1_000..1_400)) as ArrayRef, + ), + ( + Arc::new(Field::new("label", DataType::Utf8View, false)), + Arc::new(StringViewArray::from_iter_values( + (0..400).map(|idx| format!("payload_{idx}")), + )) as ArrayRef, + ), + ]); + let payload: ArrayRef = Arc::new(payload); + let value: ArrayRef = Arc::new(Int64Array::from_iter_values(10_000..10_400)); + + RecordBatch::try_from_iter(vec![("tag", tag), ("payload", payload), ("value", value)]) + .unwrap() + }); + + static NESTED_COST_MODEL_TEST_FILE_DATA: LazyLock = + LazyLock::new(|| write_batch_test_file(&NESTED_TEST_BATCH, 100, 50)); + fn write_test_file(max_row_group_row_count: usize, data_page_row_count_limit: usize) -> Bytes { - let input_batch = &TEST_BATCH; + write_batch_test_file( + &TEST_BATCH, + max_row_group_row_count, + data_page_row_count_limit, + ) + } + + fn write_batch_test_file( + input_batch: &RecordBatch, + max_row_group_row_count: usize, + data_page_row_count_limit: usize, + ) -> Bytes { let mut output = Vec::new(); let writer_options = WriterProperties::builder() diff --git a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs index 2f69b4417a71..06ddcf398c00 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -144,22 +144,23 @@ impl RowGroupReaderBuilder { let mut read_projection = self.projection.clone(); read_projection.union(&filter.union_projection()?); - if self.post_filter_supports_projection(&read_projection) { + if self.post_filter_supports_batch_projection(&read_projection) { Some(read_projection) } else { None } } - fn post_filter_supports_projection(&self, projection: &ProjectionMask) -> bool { - // The post-filter reader currently projects record batches by parquet - // leaf column position. 
Nested roots can span multiple leaves and need - // the existing array-reader projection machinery, so allow the - // post-filter cost path for primitive roots only. + fn post_filter_supports_batch_projection(&self, projection: &ProjectionMask) -> bool { + // Post-filter projects decoded record batches by top-level Arrow field + // index. A nested root is safe when it is selected as a whole root: + // the decoded batch then contains exactly one top-level field for that + // root and can be projected without recursively trimming children. + // + // Partial nested projections, such as `struct.a` without `struct.b`, + // still need recursive array projection and remain on the pushdown path. let schema = self.metadata.file_metadata().schema_descr(); - (0..schema.num_columns()).all(|leaf_idx| { - !projection.leaf_included(leaf_idx) || schema.get_column_root(leaf_idx).is_primitive() - }) + projection.selects_whole_root_columns(schema) } fn projection_has_variable_width_leaf( @@ -279,6 +280,15 @@ impl RowGroupReaderBuilder { && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) && budget.is_unbounded() && !self.has_virtual_columns() + && self.post_filter_supports_batch_projection(&self.projection) + // The combined read projection may be whole-root even when an + // individual predicate asks for one nested child that is completed + // by the output projection. Check every batch projection that + // `PostFilterState` will materialize, not only their union. 
+ && filter + .predicates() + .iter() + .all(|predicate| self.post_filter_supports_batch_projection(predicate.projection())) && self.build_post_filter_read_projection(filter).is_some() } From 269a84e96d1c73a8bef905eceb687e797b4b97d8 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Thu, 21 May 2026 15:37:49 +0800 Subject: [PATCH 23/32] Refine post-filter root projection planning --- parquet/src/arrow/arrow_reader/post_filter.rs | 41 +++++---- parquet/src/arrow/mod.rs | 87 ++++++++++++++----- 2 files changed, 90 insertions(+), 38 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/post_filter.rs b/parquet/src/arrow/arrow_reader/post_filter.rs index 096436fcb349..59c81dc56ffb 100644 --- a/parquet/src/arrow/arrow_reader/post_filter.rs +++ b/parquet/src/arrow/arrow_reader/post_filter.rs @@ -38,9 +38,9 @@ //! This is profitable for shapes where row-level pushdown has high overhead //! and little pruning, especially fragmented high-selectivity selections. -use crate::arrow::ProjectionMask; use crate::arrow::arrow_reader::metrics::{ArrowReaderMetrics, ArrowReaderPhase}; use crate::arrow::arrow_reader::{RowFilter, RowSelection}; +use crate::arrow::{ProjectionMask, RootColumnSelection}; use crate::errors::{ParquetError, Result}; use crate::schema::types::SchemaDescriptor; use arrow_array::{BooleanArray, RecordBatch}; @@ -242,7 +242,15 @@ fn projection_indices( read_projection: &ProjectionMask, target_projection: &ProjectionMask, ) -> Result> { - validate_post_filter_projection(parquet_schema, read_projection, target_projection)?; + let read_roots = read_projection.root_column_selection(parquet_schema); + let target_roots = target_projection.root_column_selection(parquet_schema); + validate_post_filter_projection( + parquet_schema, + read_projection, + target_projection, + &read_roots, + &target_roots, + )?; // Convert parquet projection masks to top-level RecordBatch column // positions after the larger read projection has been decoded. 
For example: @@ -252,19 +260,20 @@ fn projection_indices( // read projection: a b.aa b.bb => batch columns [a, b] // target: b.aa b.bb => target index [1] // ``` - let read_roots = read_projection.included_root_column_indices(parquet_schema); - target_projection - .included_root_column_indices(parquet_schema) + let mut read_root_to_batch_idx = vec![None; parquet_schema.root_schema().get_fields().len()]; + for (batch_idx, root_idx) in read_roots.included_indices.iter().copied().enumerate() { + read_root_to_batch_idx[root_idx] = Some(batch_idx); + } + + target_roots + .included_indices .into_iter() .map(|target_root| { - read_roots - .iter() - .position(|read_root| *read_root == target_root) - .ok_or_else(|| { - general_err!( - "post-filter target root column {target_root} not present in read projection" - ) - }) + read_root_to_batch_idx[target_root].ok_or_else(|| { + general_err!( + "post-filter target root column {target_root} not present in read projection" + ) + }) }) .collect() } @@ -273,16 +282,18 @@ fn validate_post_filter_projection( parquet_schema: &SchemaDescriptor, read_projection: &ProjectionMask, target_projection: &ProjectionMask, + read_roots: &RootColumnSelection, + target_roots: &RootColumnSelection, ) -> Result<()> { // Post-filter only projects already-decoded batches by top-level Arrow // field index. It can keep or drop a whole nested root, but it cannot // recursively project nested children such as `b.aa` without `b.bb`. 
- if !read_projection.selects_whole_root_columns(parquet_schema) { + if !read_roots.selects_whole_roots { return Err(general_err!( "post-filter cost model does not support partial nested read projections" )); } - if !target_projection.selects_whole_root_columns(parquet_schema) { + if !target_roots.selects_whole_roots { return Err(general_err!( "post-filter cost model does not support partial nested target projections" )); diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 2c9d58fd96b9..b940cbc6cdc6 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -275,6 +275,14 @@ pub struct ProjectionMask { mask: Option>, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct RootColumnSelection { + /// Top-level root column indices with at least one selected leaf. + pub(crate) included_indices: Vec, + /// True when every top-level root is either entirely selected or skipped. + pub(crate) selects_whole_roots: bool, +} + impl ProjectionMask { /// Create a [`ProjectionMask`] which selects all columns pub fn all() -> Self { @@ -381,27 +389,6 @@ impl ProjectionMask { self.mask.as_ref().map(|m| m[leaf_idx]).unwrap_or(true) } - /// Returns top-level root column indices that have at least one included - /// leaf, preserving their physical order in the parquet schema. - pub(crate) fn included_root_column_indices(&self, schema: &SchemaDescriptor) -> Vec { - let num_roots = schema.root_schema().get_fields().len(); - let mut seen = vec![false; num_roots]; - let mut roots = Vec::new(); - - for leaf_idx in 0..schema.num_columns() { - if !self.leaf_included(leaf_idx) { - continue; - } - let root_idx = schema.get_column_root_idx(leaf_idx); - if !seen[root_idx] { - seen[root_idx] = true; - roots.push(root_idx); - } - } - - roots - } - /// Returns true if each top-level root column is either fully selected or /// fully skipped. 
/// @@ -410,22 +397,42 @@ impl ProjectionMask { /// as one batch column, but selecting only `struct.child` would require /// recursively trimming the nested array. pub(crate) fn selects_whole_root_columns(&self, schema: &SchemaDescriptor) -> bool { + self.root_column_selection(schema).selects_whole_roots + } + + /// Summarizes this leaf mask at top-level parquet root-column granularity. + /// + /// This intentionally combines the included-root list and whole-root check + /// in one leaf scan. Post-filter planning needs both values when converting + /// parquet projection masks to decoded [`RecordBatch`] column indices. + pub(crate) fn root_column_selection(&self, schema: &SchemaDescriptor) -> RootColumnSelection { let num_roots = schema.root_schema().get_fields().len(); let mut root_leaf_counts = vec![0usize; num_roots]; let mut included_leaf_counts = vec![0usize; num_roots]; + let mut included_root_seen = vec![false; num_roots]; + let mut included_indices = Vec::new(); for leaf_idx in 0..schema.num_columns() { let root_idx = schema.get_column_root_idx(leaf_idx); root_leaf_counts[root_idx] += 1; if self.leaf_included(leaf_idx) { included_leaf_counts[root_idx] += 1; + if !included_root_seen[root_idx] { + included_root_seen[root_idx] = true; + included_indices.push(root_idx); + } } } - included_leaf_counts + let selects_whole_roots = included_leaf_counts .into_iter() .zip(root_leaf_counts) - .all(|(included, total)| included == 0 || included == total) + .all(|(included, total)| included == 0 || included == total); + + RootColumnSelection { + included_indices, + selects_whole_roots, + } } /// Union two projection masks @@ -842,6 +849,40 @@ mod test { assert_eq!(mask1.mask, None); } + #[test] + fn test_projection_mask_root_column_selection() { + let schema = parse_schema( + " + message test_schema { + OPTIONAL BYTE_ARRAY tag (UTF8); + OPTIONAL group payload { + REQUIRED INT64 id; + REQUIRED BYTE_ARRAY label (UTF8); + } + REQUIRED INT64 value; + } + ", + ); + + let 
selection = ProjectionMask::all().root_column_selection(&schema); + assert_eq!(selection.included_indices, [0, 1, 2]); + assert!(selection.selects_whole_roots); + + let selection = ProjectionMask::none(schema.num_columns()).root_column_selection(&schema); + assert!(selection.included_indices.is_empty()); + assert!(selection.selects_whole_roots); + + let selection = + ProjectionMask::columns(&schema, ["payload"]).root_column_selection(&schema); + assert_eq!(selection.included_indices, [1]); + assert!(selection.selects_whole_roots); + + let selection = ProjectionMask::columns(&schema, ["tag", "payload.label"]) + .root_column_selection(&schema); + assert_eq!(selection.included_indices, [0, 1]); + assert!(!selection.selects_whole_roots); + } + #[test] fn test_projection_mask_intersect() { let mut mask1 = ProjectionMask { From 891aa5e7cc6a0e6c7c0a65a8e62c3b57278eefea Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Fri, 22 May 2026 08:59:47 +0800 Subject: [PATCH 24/32] perf(parquet): start post-filter for cheap projected reads --- parquet/benches/arrow_reader_row_filter.rs | 6 ++ parquet/src/arrow/push_decoder/mod.rs | 86 +++++++++++++++++++ .../push_decoder/reader_builder/cost_model.rs | 67 ++++++++++++++- .../arrow/push_decoder/reader_builder/mod.rs | 2 +- 4 files changed, 157 insertions(+), 4 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 029ccd3ae67e..f28c33382f87 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -908,6 +908,12 @@ fn benchmark_async_cost_model_focus(c: &mut Criterion) { FilterType::PointLookup, ProjectionCase::FilterColumnsOnly, ), + AsyncFocusCase::new( + "profile_sparse_fixed_deferred_output", + parquet_file.clone(), + FilterType::PointLookup, + ProjectionCase::Float64Only, + ), AsyncFocusCase::new( "profile_sparse_projected_fact_scan", parquet_file.clone(), diff --git a/parquet/src/arrow/push_decoder/mod.rs 
b/parquet/src/arrow/push_decoder/mod.rs index 33f655540287..ba281a1b89af 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1500,6 +1500,78 @@ mod test { assert_eq!(metrics.records_read_from_cache(), Some(100)); } + #[test] + fn test_decoder_auto_cost_model_starts_post_filter_for_fixed_width_read_projection() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(not_multiple_of_three_filter(&batch)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_not_multiple_of_three(row_group_idx * 100, 100) + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_observes_fixed_width_deferred_output() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + 
ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(not_multiple_of_three_filter(&batch)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["b"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_b_not_multiple_of_three(row_group_idx * 100, 100) + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + #[test] fn test_decoder_auto_cost_model_keeps_pushdown_for_sparse_projected_predicate() { let data = &COST_MODEL_TEST_FILE_DATA; @@ -2455,6 +2527,20 @@ mod test { filter_record_batch(&projected, &filter).unwrap() } + fn expected_a_not_multiple_of_three(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = not_multiple_of_three_filter(&batch); + let projected = batch.project(&[0]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn expected_b_not_multiple_of_three(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = not_multiple_of_three_filter(&batch); + let projected = batch.project(&[1]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + fn multiple_of_ten_filter(batch: &RecordBatch) -> BooleanArray { let column = batch.column(0).as_primitive::(); BooleanArray::from( diff --git a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs index 06ddcf398c00..f25678b62ce6 
100644 --- a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -73,6 +73,8 @@ impl Default for RowGroupCostModelState { } impl RowGroupReaderBuilder { + const CHEAP_FIXED_WIDTH_READ_BYTES_PER_ROW: f64 = 24.0; + pub(super) fn should_use_post_filter_by_cost(&self, budget: RowBudget) -> bool { // Keep the runtime switch narrow: // @@ -112,7 +114,7 @@ impl RowGroupReaderBuilder { self.build_post_filter_read_projection(filter) } - pub(super) fn should_start_with_post_filter_for_unprojected_variable_width_predicate( + pub(super) fn should_start_with_post_filter( &self, filter: &RowFilter, row_group_idx: usize, @@ -125,12 +127,71 @@ impl RowGroupReaderBuilder { let Some(predicate_projection) = filter.union_projection() else { return false; }; - let predicate_already_projected = self.projection_includes_all(&self.projection, &predicate_projection); + self.should_start_with_post_filter_for_unprojected_variable_width_predicate( + &predicate_projection, + predicate_already_projected, + row_group_idx, + ) || self.should_start_with_post_filter_for_cheap_fixed_width_read( + filter, + predicate_already_projected, + row_group_idx, + ) + } + + fn should_start_with_post_filter_for_unprojected_variable_width_predicate( + &self, + predicate_projection: &ProjectionMask, + predicate_already_projected: bool, + row_group_idx: usize, + ) -> bool { !predicate_already_projected - && self.projection_has_variable_width_leaf(row_group_idx, &predicate_projection) + && self.projection_has_variable_width_leaf(row_group_idx, predicate_projection) + } + + fn should_start_with_post_filter_for_cheap_fixed_width_read( + &self, + filter: &RowFilter, + predicate_already_projected: bool, + row_group_idx: usize, + ) -> bool { + // If predicate columns are already in the output projection, pushdown + // cannot save a deferred output read for those columns. 
For cheap + // fixed-width reads, starting directly with post-filter avoids building + // a row selection just to decode the same values again. + // + // Do not apply this to deferred output columns: sparse predicates can + // still win by reading only a handful of output values. + if !predicate_already_projected { + return false; + } + + let Some(read_projection) = self.build_post_filter_read_projection(filter) else { + return false; + }; + + let row_group = self.metadata.row_group(row_group_idx); + if row_group.num_rows() == 0 { + return false; + } + + let mut projected_uncompressed_bytes = 0u64; + for leaf_idx in 0..row_group.num_columns() { + if !read_projection.leaf_included(leaf_idx) { + continue; + } + + let column = row_group.column(leaf_idx); + if column.column_type() == PhysicalType::BYTE_ARRAY { + return false; + } + projected_uncompressed_bytes += column.uncompressed_size().max(0) as u64; + } + + projected_uncompressed_bytes as f64 / row_group.num_rows() as f64 + <= Self::CHEAP_FIXED_WIDTH_READ_BYTES_PER_ROW } fn build_post_filter_read_projection(&self, filter: &RowFilter) -> Option<ProjectionMask> { diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index 2edc9b777311..148cf57e45bf 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -559,7 +559,7 @@ impl RowGroupReaderBuilder { })); }; - if self.should_start_with_post_filter_for_unprojected_variable_width_predicate( + if self.should_start_with_post_filter( &filter, row_group_info.row_group_idx, row_group_info.budget, From 7db5c9e9ba8a587c2dd68fa6e5eadce3e8e32990 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Fri, 22 May 2026 09:23:13 +0800 Subject: [PATCH 25/32] test(parquet): update row filter expectations for post-filter auto --- parquet/tests/arrow_reader/io/async_reader.rs | 33 +++++++++---------- parquet/tests/arrow_reader/predicate_cache.rs | 14 ++++++-- 2 files
changed, 27 insertions(+), 20 deletions(-) diff --git a/parquet/tests/arrow_reader/io/async_reader.rs b/parquet/tests/arrow_reader/io/async_reader.rs index db06dda8ee89..b9701cd3ff02 100644 --- a/parquet/tests/arrow_reader/io/async_reader.rs +++ b/parquet/tests/arrow_reader/io/async_reader.rs @@ -178,8 +178,9 @@ async fn test_read_single_row_filter() { .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) .with_row_filter(filter_b_575_625(&schema_descr)); - // Expect to see I/O for column b in both row groups to evaluate filter, - // then a single pages for the "a" column in each row group + // Auto starts with post-filter for this cheap fixed-width projected + // predicate, so each row group reads the projected "a" and "b" columns + // together. insta::assert_debug_snapshot!(run( &test_file, builder).await, @r#" @@ -188,15 +189,11 @@ async fn test_read_single_row_filter() { "Event: Builder Configured", "Event: Reader Built", "Read Multi:", + " Row Group 0, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", " Row Group 0, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", "Read Multi:", - " Row Group 0, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", - " Row Group 0, column 'a': DataPage(1) (126 bytes , 1 requests) [data]", - "Read Multi:", + " Row Group 1, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", " Row Group 1, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", - "Read Multi:", - " Row Group 1, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", - " Row Group 1, column 'a': DataPage(0) (113 bytes , 1 requests) [data]", ] "#); } @@ -215,10 +212,9 @@ async fn test_read_single_row_filter_no_page_index() { .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) .with_row_filter(filter_b_575_625(&schema_descr)); - // 
Since we don't have the page index, expect to see: - // 1. I/O for all pages of column b to evaluate the filter - // 2. IO for all pages of column a as the reader doesn't know where the page - // boundaries are so needs to scan them. + // Auto starts with post-filter for this cheap fixed-width projected + // predicate, so without page indexes each row group reads all pages for + // both projected columns together. insta::assert_debug_snapshot!(run( &test_file, builder).await, @r#" @@ -227,13 +223,11 @@ async fn test_read_single_row_filter_no_page_index() { "Event: Builder Configured", "Event: Reader Built", "Read Multi:", - " Row Group 0, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", - "Read Multi:", " Row Group 0, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", - "Read Multi:", - " Row Group 1, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 0, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", "Read Multi:", " Row Group 1, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 1, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", ] "#); } @@ -295,8 +289,9 @@ async fn test_read_single_row_filter_all() { .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) .with_row_filter(filter_b_false(&schema_descr)); - // Expect to see reads for column "b" to evaluate the filter, but no reads - // for column "a" as no rows pass the filter + // Auto starts with post-filter for this cheap fixed-width projected + // predicate, so it reads both projected columns even though the filter + // later rejects all rows. 
insta::assert_debug_snapshot!(run( &test_file, builder).await, @r#" @@ -305,8 +300,10 @@ async fn test_read_single_row_filter_all() { "Event: Builder Configured", "Event: Reader Built", "Read Multi:", + " Row Group 0, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", " Row Group 0, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", "Read Multi:", + " Row Group 1, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", " Row Group 1, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", ] "#); diff --git a/parquet/tests/arrow_reader/predicate_cache.rs b/parquet/tests/arrow_reader/predicate_cache.rs index 85dba68c9c69..07781286001b 100644 --- a/parquet/tests/arrow_reader/predicate_cache.rs +++ b/parquet/tests/arrow_reader/predicate_cache.rs @@ -29,7 +29,9 @@ use arrow_schema::{DataType, Field}; use bytes::Bytes; use futures::StreamExt; use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics; -use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, RowFilter}; +use parquet::arrow::arrow_reader::{ + ArrowPredicateFn, ArrowReaderOptions, RowFilter, RowSelectionPolicy, +}; use parquet::arrow::arrow_reader::{ArrowReaderBuilder, ParquetRecordBatchReaderBuilder}; use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask}; use parquet::file::properties::WriterProperties; @@ -49,7 +51,15 @@ async fn test_default_read() { #[tokio::test] async fn test_async_cache_with_filters() { let test = ParquetPredicateCacheTest::new().with_expected_records_read_from_cache(49); - let async_builder = test.async_builder().await.add_project_ab_and_filter_b(); + let async_builder = test + .async_builder() + .await + .add_project_ab_and_filter_b() + // The default Auto policy may choose post-filter execution for this + // cheap projected predicate, which avoids the 
predicate cache entirely. + // Use an explicit pushdown policy so this test continues to exercise + // predicate cache reads. + .with_row_selection_policy(RowSelectionPolicy::Selectors); test.run_async(async_builder).await; } From f8e9e478dbcc721456eec784a8ecef80a97bd866 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Fri, 22 May 2026 09:29:45 +0800 Subject: [PATCH 26/32] fix(parquet): satisfy clippy and rustdoc checks --- parquet/benches/arrow_reader_row_filter.rs | 2 +- parquet/src/arrow/mod.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index f28c33382f87..e765a4cc01a9 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -214,7 +214,7 @@ fn write_record_batch_to_parquet(batch: &RecordBatch, row_group_size: usize) -> let mut buffer = vec![]; { let mut writer = ArrowWriter::try_new(&mut buffer, schema.clone(), Some(props)).unwrap(); - writer.write(&batch).unwrap(); + writer.write(batch).unwrap(); writer.close().unwrap(); } buffer diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index b940cbc6cdc6..92d411d7dc5d 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -392,7 +392,7 @@ impl ProjectionMask { /// Returns true if each top-level root column is either fully selected or /// fully skipped. /// - /// This is useful for code paths that project decoded [`RecordBatch`] + /// This is useful for code paths that project decoded [`arrow_array::RecordBatch`] /// values by top-level Arrow field index. A full `struct` root can be moved /// as one batch column, but selecting only `struct.child` would require /// recursively trimming the nested array. @@ -404,7 +404,7 @@ impl ProjectionMask { /// /// This intentionally combines the included-root list and whole-root check /// in one leaf scan. 
Post-filter planning needs both values when converting - /// parquet projection masks to decoded [`RecordBatch`] column indices. + /// parquet projection masks to decoded [`arrow_array::RecordBatch`] column indices. pub(crate) fn root_column_selection(&self, schema: &SchemaDescriptor) -> RootColumnSelection { let num_roots = schema.root_schema().get_fields().len(); let mut root_leaf_counts = vec![0usize; num_roots]; From b024d07e7c748b8b2dc732ba3865e37fc309eb05 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Fri, 22 May 2026 14:59:53 +0800 Subject: [PATCH 27/32] fix(parquet): observe cacheable projected predicates --- parquet/benches/arrow_reader_row_filter.rs | 27 ++++- parquet/src/arrow/push_decoder/mod.rs | 106 +++++++++++++++++- .../push_decoder/reader_builder/cost_model.rs | 16 +++ 3 files changed, 144 insertions(+), 5 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index f28c33382f87..5ddef53d82fd 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -258,6 +258,7 @@ enum ProjectionCase { AllColumns, ExcludeFilterColumn, FilterColumnsOnly, + FixedColumns, Float64Only, Utf8Only, } @@ -268,6 +269,7 @@ impl std::fmt::Display for ProjectionCase { ProjectionCase::AllColumns => write!(f, "all_columns"), ProjectionCase::ExcludeFilterColumn => write!(f, "exclude_filter_column"), ProjectionCase::FilterColumnsOnly => write!(f, "filter_columns_only"), + ProjectionCase::FixedColumns => write!(f, "fixed_columns"), ProjectionCase::Float64Only => write!(f, "float64_only"), ProjectionCase::Utf8Only => write!(f, "utf8_only"), } @@ -443,6 +445,9 @@ enum FilterType { /// subqueries. The selected rows are random and moderately selective, and /// benchmark projections cover both count-only and numeric aggregate cases. 
TpcdsQ9QuantityRange, + /// Very sparse projected fixed-width scan shaped like TPC-DS fact-table + /// filters where the predicate column is also needed in the output projection. + TpcdsSparseProjectedFactScan, } impl std::fmt::Display for FilterType { @@ -459,6 +464,7 @@ impl std::fmt::Display for FilterType { FilterType::Utf8ViewMissing => "utf8View == ''", FilterType::ClickBenchQ37ScalarPrefix => "int64 == 62 AND ts < 9000", FilterType::TpcdsQ9QuantityRange => "int64 > 0 AND int64 < 21", + FilterType::TpcdsSparseProjectedFactScan => "ts % 1000 == 0", }; write!(f, "{s}") } @@ -533,6 +539,19 @@ impl FilterType { let upper = lt(int64, &Int64Array::new_scalar(21))?; and(&lower, &upper) } + FilterType::TpcdsSparseProjectedFactScan => { + let ts = batch + .column(batch.schema().index_of("ts")?) + .as_any() + .downcast_ref::() + .unwrap(); + Ok(BooleanArray::from( + ts.values() + .iter() + .map(|value| value % 1000 == 0) + .collect::>(), + )) + } } } @@ -549,6 +568,7 @@ impl FilterType { FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewMissing => &[2], FilterType::ClickBenchQ37ScalarPrefix => &[0, 3], FilterType::TpcdsQ9QuantityRange => &[0], + FilterType::TpcdsSparseProjectedFactScan => &[3], } } } @@ -915,10 +935,10 @@ fn benchmark_async_cost_model_focus(c: &mut Criterion) { ProjectionCase::Float64Only, ), AsyncFocusCase::new( - "profile_sparse_projected_fact_scan", + "profile_tpcds_sparse_projected_fact_scan", parquet_file.clone(), - FilterType::PointLookup, - ProjectionCase::AllColumns, + FilterType::TpcdsSparseProjectedFactScan, + ProjectionCase::FixedColumns, ), AsyncFocusCase::new( "profile_q83_sparse_utf8_projected", @@ -1137,6 +1157,7 @@ fn output_projection_for(filter_type: FilterType, projection_case: &ProjectionCa }) .collect(), ProjectionCase::FilterColumnsOnly => filter_columns.to_vec(), + ProjectionCase::FixedColumns => vec![0, 1, 3], ProjectionCase::Float64Only => vec![1], ProjectionCase::Utf8Only => vec![2], } diff --git 
a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index ba281a1b89af..49fe9957b1f6 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1501,7 +1501,7 @@ mod test { } #[test] - fn test_decoder_auto_cost_model_starts_post_filter_for_fixed_width_read_projection() { + fn test_decoder_auto_cost_model_uses_post_filter_after_observing_fixed_width_read_projection() { let data = &COST_MODEL_TEST_FILE_DATA; let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); @@ -1530,12 +1530,51 @@ mod test { ); } - assert_eq!(metrics.cost_model_observed_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); assert!(next_batch_with_data(&mut decoder, data).is_none()); } + #[test] + fn test_decoder_auto_cost_model_keeps_pushdown_for_sparse_fixed_width_read_projection() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 1)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_first_rows_per_hundred(row_group_idx * 100, 
100, 1) + ); + } + + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "sparse projected pushdown should consume the predicate cache" + ); + } + #[test] fn test_decoder_auto_cost_model_observes_fixed_width_deferred_output() { let data = &COST_MODEL_TEST_FILE_DATA; @@ -1606,6 +1645,58 @@ mod test { assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); assert_eq!(metrics.cost_model_pushdown_still_preferred_count(), Some(1)); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "sparse projected pushdown should consume the predicate cache" + ); + } + + #[test] + fn test_decoder_auto_cost_model_reuses_cache_for_very_sparse_projected_predicate_chain() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let sparse_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 1)), + ); + let cache_reusing_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(BooleanArray::from(vec![true; batch.num_rows()])), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![ + Box::new(sparse_filter_a), + Box::new(cache_reusing_filter_a), + ])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + 
assert_eq!( + batch, + expected_a_c_first_rows_per_hundred(row_group_idx * 100, 100, 1) + ); + } + + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "projected predicate chains should reuse cached predicate data" + ); } #[test] @@ -2534,6 +2625,17 @@ mod test { filter_record_batch(&projected, &filter).unwrap() } + fn expected_a_first_rows_per_hundred( + offset: usize, + len: usize, + rows_per_hundred: i64, + ) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = first_rows_per_hundred_filter(&batch, rows_per_hundred); + let projected = batch.project(&[0]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + fn expected_b_not_multiple_of_three(offset: usize, len: usize) -> RecordBatch { let batch = TEST_BATCH.slice(offset, len); let filter = not_multiple_of_three_filter(&batch); diff --git a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs index f25678b62ce6..768a8a612a93 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -168,6 +168,13 @@ impl RowGroupReaderBuilder { return false; } + // Cacheable predicate columns need one pushdown row group to reveal + // whether selection is sparse. Starting post-filter here bypasses the + // predicate cache before the adaptive model can observe that shape. 
+ if self.has_cacheable_projected_predicate(filter) { + return false; + } + let Some(read_projection) = self.build_post_filter_read_projection(filter) else { return false; }; @@ -194,6 +201,15 @@ impl RowGroupReaderBuilder { <= Self::CHEAP_FIXED_WIDTH_READ_BYTES_PER_ROW } + fn has_cacheable_projected_predicate(&self, filter: &RowFilter) -> bool { + let Some(cache_projection) = self.compute_cache_projection_inner(filter) else { + return false; + }; + + let schema = self.metadata.file_metadata().schema_descr(); + (0..schema.num_columns()).any(|leaf_idx| cache_projection.leaf_included(leaf_idx)) + } + fn build_post_filter_read_projection(&self, filter: &RowFilter) -> Option<ProjectionMask> { // Post-filter execution decodes each row once, so it needs both: // From 51d5abe5f415ec0b9d88e5de40cf26823e7e846e Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Fri, 22 May 2026 15:17:23 +0800 Subject: [PATCH 28/32] test(parquet): update async row filter snapshots --- parquet/tests/arrow_reader/io/async_reader.rs | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/parquet/tests/arrow_reader/io/async_reader.rs b/parquet/tests/arrow_reader/io/async_reader.rs index b9701cd3ff02..8fa993fd50bd 100644 --- a/parquet/tests/arrow_reader/io/async_reader.rs +++ b/parquet/tests/arrow_reader/io/async_reader.rs @@ -178,9 +178,9 @@ async fn test_read_single_row_filter() { .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) .with_row_filter(filter_b_575_625(&schema_descr)); - // Auto starts with post-filter for this cheap fixed-width projected - // predicate, so each row group reads the projected "a" and "b" columns - // together.
insta::assert_debug_snapshot!(run( &test_file, builder).await, @r#" @@ -189,11 +189,15 @@ async fn test_read_single_row_filter() { "Event: Builder Configured", "Event: Reader Built", "Read Multi:", - " Row Group 0, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", " Row Group 0, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", "Read Multi:", - " Row Group 1, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 0, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + " Row Group 0, column 'a': DataPage(1) (126 bytes , 1 requests) [data]", + "Read Multi:", " Row Group 1, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + "Read Multi:", + " Row Group 1, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + " Row Group 1, column 'a': DataPage(0) (113 bytes , 1 requests) [data]", ] "#); } @@ -212,9 +216,8 @@ async fn test_read_single_row_filter_no_page_index() { .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) .with_row_filter(filter_b_575_625(&schema_descr)); - // Auto starts with post-filter for this cheap fixed-width projected - // predicate, so without page indexes each row group reads all pages for - // both projected columns together. + // Without page indexes, auto still evaluates and caches the projected + // predicate first, then reads the remaining projected column separately. 
insta::assert_debug_snapshot!(run( &test_file, builder).await, @r#" @@ -223,11 +226,13 @@ async fn test_read_single_row_filter_no_page_index() { "Event: Builder Configured", "Event: Reader Built", "Read Multi:", - " Row Group 0, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", " Row Group 0, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", "Read Multi:", - " Row Group 1, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 0, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + "Read Multi:", " Row Group 1, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + "Read Multi:", + " Row Group 1, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", ] "#); } @@ -289,9 +294,8 @@ async fn test_read_single_row_filter_all() { .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) .with_row_filter(filter_b_false(&schema_descr)); - // Auto starts with post-filter for this cheap fixed-width projected - // predicate, so it reads both projected columns even though the filter - // later rejects all rows. + // Auto keeps pushdown for projected predicates, so the non-predicate + // column is not read when the predicate rejects every row. 
insta::assert_debug_snapshot!(run( &test_file, builder).await, @r#" @@ -300,10 +304,8 @@ async fn test_read_single_row_filter_all() { "Event: Builder Configured", "Event: Reader Built", "Read Multi:", - " Row Group 0, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", " Row Group 0, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", "Read Multi:", - " Row Group 1, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", " Row Group 1, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", ] "#); From ef1fa0b91a4539db861a97b8e94f5474808dde08 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Fri, 22 May 2026 16:15:22 +0800 Subject: [PATCH 29/32] Refactor row filter execution planning --- .../push_decoder/reader_builder/cost_model.rs | 110 +++++++++++------- .../arrow/push_decoder/reader_builder/mod.rs | 67 ++++++++--- 2 files changed, 117 insertions(+), 60 deletions(-) diff --git a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs index 768a8a612a93..ab5ef406fd84 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -72,10 +72,25 @@ impl Default for RowGroupCostModelState { } } +#[derive(Debug)] +struct PostFilterProjectionRoles { + /// Columns required to evaluate all predicates. + predicate_projection: ProjectionMask, + /// Columns decoded by post-filter execution. + read_projection: ProjectionMask, + /// True when predicate columns are already part of the caller output. 
+ predicate_already_projected: bool, +} + impl RowGroupReaderBuilder { const CHEAP_FIXED_WIDTH_READ_BYTES_PER_ROW: f64 = 24.0; pub(super) fn should_use_post_filter_by_cost(&self, budget: RowBudget) -> bool { + matches!(self.cost_model_state, RowGroupCostModelState::UsePostFilter) + && self.post_filter_context_supported(budget) + } + + fn post_filter_context_supported(&self, budget: RowBudget) -> bool { // Keep the runtime switch narrow: // // * `Auto` means the caller allowed the reader to choose. @@ -83,8 +98,7 @@ impl RowGroupReaderBuilder { // predicates after decode changes where short-circuiting can happen. // * virtual columns are not read from Parquet pages and need their // existing projection path. - matches!(self.cost_model_state, RowGroupCostModelState::UsePostFilter) - && self.post_filter_cost_model_enabled + self.post_filter_cost_model_enabled && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) && budget.is_unbounded() && !self.has_virtual_columns() @@ -99,7 +113,7 @@ impl RowGroupReaderBuilder { return None; } - self.build_post_filter_read_projection(filter) + Some(self.post_filter_projection_roles(filter)?.read_projection) } pub(super) fn post_filter_read_projection_for_filter( @@ -107,11 +121,11 @@ impl RowGroupReaderBuilder { filter: &RowFilter, budget: RowBudget, ) -> Option { - if !self.post_filter_supports_filter(filter, budget) { + if !self.post_filter_context_supported(budget) { return None; } - self.build_post_filter_read_projection(filter) + Some(self.post_filter_projection_roles(filter)?.read_projection) } pub(super) fn should_start_with_post_filter( @@ -120,41 +134,40 @@ impl RowGroupReaderBuilder { row_group_idx: usize, budget: RowBudget, ) -> bool { - if !self.post_filter_supports_filter(filter, budget) { + if !self.post_filter_context_supported(budget) { return false; } - let Some(predicate_projection) = filter.union_projection() else { + let Some(projections) = self.post_filter_projection_roles(filter) else { return 
false; }; - let predicate_already_projected = - self.projection_includes_all(&self.projection, &predicate_projection); self.should_start_with_post_filter_for_unprojected_variable_width_predicate( - &predicate_projection, - predicate_already_projected, + &projections, row_group_idx, ) || self.should_start_with_post_filter_for_cheap_fixed_width_read( filter, - predicate_already_projected, + &projections, row_group_idx, ) } fn should_start_with_post_filter_for_unprojected_variable_width_predicate( &self, - predicate_projection: &ProjectionMask, - predicate_already_projected: bool, + projections: &PostFilterProjectionRoles, row_group_idx: usize, ) -> bool { - !predicate_already_projected - && self.projection_has_variable_width_leaf(row_group_idx, predicate_projection) + !projections.predicate_already_projected + && self.projection_has_variable_width_leaf( + row_group_idx, + &projections.predicate_projection, + ) } fn should_start_with_post_filter_for_cheap_fixed_width_read( &self, filter: &RowFilter, - predicate_already_projected: bool, + projections: &PostFilterProjectionRoles, row_group_idx: usize, ) -> bool { // If predicate columns are already in the output projection, pushdown @@ -164,7 +177,7 @@ impl RowGroupReaderBuilder { // // Do not apply this to deferred output columns: sparse predicates can // still win by reading only a handful of output values. 
- if !predicate_already_projected { + if !projections.predicate_already_projected { return false; } @@ -175,10 +188,6 @@ impl RowGroupReaderBuilder { return false; } - let Some(read_projection) = self.build_post_filter_read_projection(filter) else { - return false; - }; - let row_group = self.metadata.row_group(row_group_idx); if row_group.num_rows() == 0 { return false; @@ -186,7 +195,7 @@ impl RowGroupReaderBuilder { let mut projected_uncompressed_bytes = 0u64; for leaf_idx in 0..row_group.num_columns() { - if !read_projection.leaf_included(leaf_idx) { + if !projections.read_projection.leaf_included(leaf_idx) { continue; } @@ -210,7 +219,10 @@ impl RowGroupReaderBuilder { (0..schema.num_columns()).any(|leaf_idx| cache_projection.leaf_included(leaf_idx)) } - fn build_post_filter_read_projection(&self, filter: &RowFilter) -> Option { + fn post_filter_projection_roles( + &self, + filter: &RowFilter, + ) -> Option { // Post-filter execution decodes each row once, so it needs both: // // * output columns, which will be returned to the caller @@ -218,14 +230,38 @@ impl RowGroupReaderBuilder { // // The final reader projects back to the original output projection // after predicate evaluation. + let predicate_projection = filter.union_projection()?; let mut read_projection = self.projection.clone(); - read_projection.union(&filter.union_projection()?); + read_projection.union(&predicate_projection); - if self.post_filter_supports_batch_projection(&read_projection) { - Some(read_projection) - } else { - None + if !self.post_filter_supports_batch_projection(&self.projection) { + return None; + } + + // The combined read projection may be whole-root even when an individual + // predicate asks for one nested child that is completed by the output + // projection. Check every batch projection that `PostFilterState` will + // materialize, not only their union. 
+ if !filter + .predicates() + .iter() + .all(|predicate| self.post_filter_supports_batch_projection(predicate.projection())) + { + return None; } + + if !self.post_filter_supports_batch_projection(&read_projection) { + return None; + } + + let predicate_already_projected = + self.projection_includes_all(&self.projection, &predicate_projection); + + Some(PostFilterProjectionRoles { + predicate_projection, + read_projection, + predicate_already_projected, + }) } fn post_filter_supports_batch_projection(&self, projection: &ProjectionMask) -> bool { @@ -353,20 +389,8 @@ impl RowGroupReaderBuilder { } fn post_filter_supports_filter(&self, filter: &RowFilter, budget: RowBudget) -> bool { - self.post_filter_cost_model_enabled - && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) - && budget.is_unbounded() - && !self.has_virtual_columns() - && self.post_filter_supports_batch_projection(&self.projection) - // The combined read projection may be whole-root even when an - // individual predicate asks for one nested child that is completed - // by the output projection. Check every batch projection that - // `PostFilterState` will materialize, not only their union. - && filter - .predicates() - .iter() - .all(|predicate| self.post_filter_supports_batch_projection(predicate.projection())) - && self.build_post_filter_read_projection(filter).is_some() + self.post_filter_context_supported(budget) + && self.post_filter_projection_roles(filter).is_some() } fn has_virtual_columns(&self) -> bool { diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index d496daad56c7..476f1ececc44 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -68,6 +68,16 @@ enum CostModelTransition { }, } +enum FilterExecutionPlan { + /// No predicate work remains for this row group; proceed to output planning. 
+ ReadOutput, + /// Decode the union of output and predicate columns once, then evaluate + /// predicates on decoded batches. + PostFilter { filter: Arc> }, + /// Decode predicate columns first, build a RowSelection, then read output. + Pushdown { filter_info: FilterInfo }, +} + /// This is the inner state machine for reading a single row group. /// /// The top-level flow is: @@ -621,22 +631,44 @@ impl RowGroupReaderBuilder { })); }; + match self.plan_filter_execution(&row_group_info, filter) { + FilterExecutionPlan::ReadOutput => { + Ok(NextState::again(RowGroupDecoderState::StartData { + row_group_info, + column_chunks, + cache_info: None, + })) + } + FilterExecutionPlan::PostFilter { filter } => { + self.start_post_filter(row_group_info, filter) + } + FilterExecutionPlan::Pushdown { filter_info } => { + Ok(NextState::again(RowGroupDecoderState::Filters { + row_group_info, + filter_info, + column_chunks, + })) + } + } + } + + fn plan_filter_execution( + &mut self, + row_group_info: &RowGroupInfo, + filter: RowFilter, + ) -> FilterExecutionPlan { if filter.predicates.is_empty() { - return Ok(NextState::again(RowGroupDecoderState::StartData { - row_group_info, - column_chunks, - cache_info: None, - })); - }; + return FilterExecutionPlan::ReadOutput; + } if self.should_start_with_post_filter( &filter, row_group_info.row_group_idx, row_group_info.budget, ) { - let filter = Arc::new(Mutex::new(filter)); - self.post_filter = Some(Arc::clone(&filter)); - return self.start_post_filter(row_group_info, filter); + return FilterExecutionPlan::PostFilter { + filter: self.install_post_filter(filter), + }; } if self.should_use_post_filter_by_cost(row_group_info.budget) { @@ -644,9 +676,9 @@ impl RowGroupReaderBuilder { .post_filter_read_projection(&filter, row_group_info.budget) .is_some() { - let filter = Arc::new(Mutex::new(filter)); - self.post_filter = Some(Arc::clone(&filter)); - return self.start_post_filter(row_group_info, filter); + return 
FilterExecutionPlan::PostFilter { + filter: self.install_post_filter(filter), + }; } self.cost_model_state = RowGroupCostModelState::UsePushdown; @@ -661,12 +693,13 @@ impl RowGroupReaderBuilder { ))), ); let filter_info = FilterInfo::new(filter, cache_info); + FilterExecutionPlan::Pushdown { filter_info } + } - Ok(NextState::again(RowGroupDecoderState::Filters { - row_group_info, - filter_info, - column_chunks, - })) + fn install_post_filter(&mut self, filter: RowFilter) -> Arc> { + let filter = Arc::new(Mutex::new(filter)); + self.post_filter = Some(Arc::clone(&filter)); + filter } fn transition_filters( From d6838a1e0e88ddbe014184a09a86eb4650b68bd7 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Mon, 25 May 2026 11:12:57 +0800 Subject: [PATCH 30/32] Add row filter regression benchmarks --- parquet/benches/arrow_reader_row_filter.rs | 310 +++++++++++++++++- parquet/src/arrow/push_decoder/mod.rs | 161 +++++++++ .../push_decoder/reader_builder/cost_model.rs | 5 + 3 files changed, 472 insertions(+), 4 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 6e4294affb16..59e6361cac3a 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -55,8 +55,8 @@ use arrow::array::{ ArrayRef, BooleanArray, Float64Array, Int64Array, StructArray, TimestampMillisecondArray, }; -use arrow::compute::and; use arrow::compute::kernels::cmp::{eq, gt, lt, neq}; +use arrow::compute::{and, or}; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use arrow::record_batch::RecordBatch; use arrow_array::StringViewArray; @@ -258,8 +258,11 @@ enum ProjectionCase { AllColumns, ExcludeFilterColumn, FilterColumnsOnly, + CountOnly, FixedColumns, + Float64AndTs, Float64Only, + Int64AndFloat64, Utf8Only, } @@ -269,8 +272,11 @@ impl std::fmt::Display for ProjectionCase { ProjectionCase::AllColumns => write!(f, "all_columns"), ProjectionCase::ExcludeFilterColumn => write!(f, 
"exclude_filter_column"), ProjectionCase::FilterColumnsOnly => write!(f, "filter_columns_only"), + ProjectionCase::CountOnly => write!(f, "count_only"), ProjectionCase::FixedColumns => write!(f, "fixed_columns"), + ProjectionCase::Float64AndTs => write!(f, "float64_and_ts"), ProjectionCase::Float64Only => write!(f, "float64_only"), + ProjectionCase::Int64AndFloat64 => write!(f, "int64_and_float64"), ProjectionCase::Utf8Only => write!(f, "utf8_only"), } } @@ -441,6 +447,34 @@ enum FilterType { /// This synthetic predicate keeps that reader-level shape: cheap scalar /// filter columns protect an expensive `Utf8View` output column. ClickBenchQ37ScalarPrefix, + /// Shape of ClickBench extended Q6 under DataFusion row-filter pushdown: + /// an early cheap fixed-width predicate can prune almost all rows before a + /// later unprojected variable-width predicate is decoded. + ClickBenchQ6MixedPredicates, + /// Shape of ClickBench Q41-like fixed-width filters: sparse fragmented + /// scalar predicates with a cheap fixed-width output projection. + ClickBenchQ41SparseFixedOutput, + /// Shape of ClickBench Q40: multiple cheap scalar predicates, very small + /// output, and one projected predicate column used later by grouping. + ClickBenchQ40ScalarGroupBy, + /// Shape of TPC-DS Q41: a complex OR predicate over dictionary/string-like + /// and scalar columns where predicate evaluation dominates reader time. + TpcdsQ41ComplexOr, + /// Shape of TPC-DS Q20 catalog_sales after dynamic filters: multiple + /// fixed-width predicates where predicate columns are also projected. + TpcdsQ20ProjectedDynamicFilters, + /// Shape of TPC-DS Q21 after dynamic-filter pruning: sparse fragmented + /// fixed-width predicates where the final projection still includes the + /// predicate columns. This protects against choosing selectors for columns + /// that were already decoded/cached by predicate evaluation. 
+ TpcdsQ21ProjectedFixedOutput, + /// Shape of TPC-DS Q2 fact scans: the dynamic filter applies to the date + /// key, the same date key is projected, and an additional fixed-width sales + /// value can still be deferred by predicate pushdown. + TpcdsQ2ProjectedPredicate10Pct, + TpcdsQ2ProjectedPredicate20Pct, + TpcdsQ2ProjectedPredicate30Pct, + TpcdsQ2ProjectedPredicate40Pct, /// Scalar range predicate shaped like TPC-DS Q9 `ss_quantity BETWEEN ...` /// subqueries. The selected rows are random and moderately selective, and /// benchmark projections cover both count-only and numeric aggregate cases. @@ -463,6 +497,32 @@ impl std::fmt::Display for FilterType { FilterType::Utf8ViewNonEmpty => "utf8View <> ''", FilterType::Utf8ViewMissing => "utf8View == ''", FilterType::ClickBenchQ37ScalarPrefix => "int64 == 62 AND ts < 9000", + FilterType::ClickBenchQ6MixedPredicates => "int64 == 9999 AND utf8View <> ''", + FilterType::ClickBenchQ41SparseFixedOutput => "int64 < 8 AND ts < 9000", + FilterType::ClickBenchQ40ScalarGroupBy => { + "int64 == 62 AND float64 > 10.0 AND ts < 9000" + } + FilterType::TpcdsQ41ComplexOr => { + "(utf8View <> '' AND int64 < 8) OR (ts < 100 AND float64 > 95.0)" + } + FilterType::TpcdsQ20ProjectedDynamicFilters => { + "int64 < 12 AND ts < 9000 projected dynamic filters" + } + FilterType::TpcdsQ21ProjectedFixedOutput => { + "int64 < 8 AND ts < 9000 projected predicates" + } + FilterType::TpcdsQ2ProjectedPredicate10Pct => { + "int64 < 10 projected predicate with fixed output" + } + FilterType::TpcdsQ2ProjectedPredicate20Pct => { + "int64 < 20 projected predicate with fixed output" + } + FilterType::TpcdsQ2ProjectedPredicate30Pct => { + "int64 < 30 projected predicate with fixed output" + } + FilterType::TpcdsQ2ProjectedPredicate40Pct => { + "int64 < 40 projected predicate with fixed output" + } FilterType::TpcdsQ9QuantityRange => "int64 > 0 AND int64 < 21", FilterType::TpcdsSparseProjectedFactScan => "ts % 1000 == 0", }; @@ -533,6 +593,66 @@ impl 
FilterType { let date_like_range = lt(ts, &TimestampMillisecondArray::new_scalar(9000))?; and(&counter_match, &date_like_range) } + FilterType::ClickBenchQ6MixedPredicates => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let utf8 = batch.column(batch.schema().index_of("utf8View")?); + let cheap_prefix = eq(int64, &Int64Array::new_scalar(9999))?; + let string_suffix = neq(utf8, &StringViewArray::new_scalar(""))?; + and(&cheap_prefix, &string_suffix) + } + FilterType::ClickBenchQ41SparseFixedOutput + | FilterType::TpcdsQ21ProjectedFixedOutput => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let ts = batch.column(batch.schema().index_of("ts")?); + let counter_like = lt(int64, &Int64Array::new_scalar(8))?; + let date_like = lt(ts, &TimestampMillisecondArray::new_scalar(9000))?; + and(&counter_like, &date_like) + } + FilterType::ClickBenchQ40ScalarGroupBy => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let float64 = batch.column(batch.schema().index_of("float64")?); + let ts = batch.column(batch.schema().index_of("ts")?); + let counter_match = eq(int64, &Int64Array::new_scalar(62))?; + let width_match = gt(float64, &Float64Array::new_scalar(10.0))?; + let date_like = lt(ts, &TimestampMillisecondArray::new_scalar(9000))?; + and(&and(&counter_match, &width_match)?, &date_like) + } + FilterType::TpcdsQ41ComplexOr => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let float64 = batch.column(batch.schema().index_of("float64")?); + let utf8 = batch.column(batch.schema().index_of("utf8View")?); + let ts = batch.column(batch.schema().index_of("ts")?); + let string_branch = and( + &neq(utf8, &StringViewArray::new_scalar(""))?, + <(int64, &Int64Array::new_scalar(8))?, + )?; + let scalar_branch = and( + <(ts, &TimestampMillisecondArray::new_scalar(100))?, + >(float64, &Float64Array::new_scalar(95.0))?, + )?; + or(&string_branch, &scalar_branch) + } + FilterType::TpcdsQ20ProjectedDynamicFilters => { 
+ let int64 = batch.column(batch.schema().index_of("int64")?); + let ts = batch.column(batch.schema().index_of("ts")?); + let item_like = lt(int64, &Int64Array::new_scalar(12))?; + let date_like = lt(ts, &TimestampMillisecondArray::new_scalar(9000))?; + and(&item_like, &date_like) + } + FilterType::TpcdsQ2ProjectedPredicate10Pct + | FilterType::TpcdsQ2ProjectedPredicate20Pct + | FilterType::TpcdsQ2ProjectedPredicate30Pct + | FilterType::TpcdsQ2ProjectedPredicate40Pct => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let threshold = match self { + FilterType::TpcdsQ2ProjectedPredicate10Pct => 10, + FilterType::TpcdsQ2ProjectedPredicate20Pct => 20, + FilterType::TpcdsQ2ProjectedPredicate30Pct => 30, + FilterType::TpcdsQ2ProjectedPredicate40Pct => 40, + _ => unreachable!(), + }; + lt(int64, &Int64Array::new_scalar(threshold)) + } FilterType::TpcdsQ9QuantityRange => { let int64 = batch.column(batch.schema().index_of("int64")?); let lower = gt(int64, &Int64Array::new_scalar(0))?; @@ -567,6 +687,16 @@ impl FilterType { FilterType::Composite => &[1, 3], // Use float64 column and ts column as representative for composite FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewMissing => &[2], FilterType::ClickBenchQ37ScalarPrefix => &[0, 3], + FilterType::ClickBenchQ6MixedPredicates => &[0, 2], + FilterType::ClickBenchQ40ScalarGroupBy => &[0, 1, 3], + FilterType::ClickBenchQ41SparseFixedOutput + | FilterType::TpcdsQ20ProjectedDynamicFilters + | FilterType::TpcdsQ21ProjectedFixedOutput => &[0, 3], + FilterType::TpcdsQ41ComplexOr => &[0, 1, 2, 3], + FilterType::TpcdsQ2ProjectedPredicate10Pct + | FilterType::TpcdsQ2ProjectedPredicate20Pct + | FilterType::TpcdsQ2ProjectedPredicate30Pct + | FilterType::TpcdsQ2ProjectedPredicate40Pct => &[0], FilterType::TpcdsQ9QuantityRange => &[0], FilterType::TpcdsSparseProjectedFactScan => &[3], } @@ -922,6 +1052,72 @@ fn benchmark_async_cost_model_focus(c: &mut Criterion) { FilterType::ClickBenchQ37ScalarPrefix, 
ProjectionCase::Utf8Only, ), + AsyncFocusCase::new( + "profile_q6_mixed_predicates", + parquet_file.clone(), + FilterType::ClickBenchQ6MixedPredicates, + ProjectionCase::Float64Only, + ), + AsyncFocusCase::new( + "profile_q40_scalar_group_by", + parquet_file.clone(), + FilterType::ClickBenchQ40ScalarGroupBy, + ProjectionCase::Float64AndTs, + ), + AsyncFocusCase::new( + "profile_q41_sparse_fixed_output", + parquet_file.clone(), + FilterType::ClickBenchQ41SparseFixedOutput, + ProjectionCase::Float64Only, + ), + AsyncFocusCase::new( + "profile_tpcds_q41_complex_or", + parquet_file.clone(), + FilterType::TpcdsQ41ComplexOr, + ProjectionCase::Float64Only, + ), + AsyncFocusCase::new( + "profile_tpcds_q20_projected_dynamic_filters", + parquet_file.clone(), + FilterType::TpcdsQ20ProjectedDynamicFilters, + ProjectionCase::FixedColumns, + ), + AsyncFocusCase::new( + "profile_q21_projected_predicate_fixed_output", + parquet_file.clone(), + FilterType::TpcdsQ21ProjectedFixedOutput, + ProjectionCase::FixedColumns, + ), + AsyncFocusCase::new( + "profile_q2_projected_predicate_10pct", + parquet_file.clone(), + FilterType::TpcdsQ2ProjectedPredicate10Pct, + ProjectionCase::Int64AndFloat64, + ), + AsyncFocusCase::new( + "profile_q2_projected_predicate_20pct", + parquet_file.clone(), + FilterType::TpcdsQ2ProjectedPredicate20Pct, + ProjectionCase::Int64AndFloat64, + ), + AsyncFocusCase::new( + "profile_q2_projected_predicate_30pct", + parquet_file.clone(), + FilterType::TpcdsQ2ProjectedPredicate30Pct, + ProjectionCase::Int64AndFloat64, + ), + AsyncFocusCase::new( + "profile_q2_projected_predicate_40pct", + parquet_file.clone(), + FilterType::TpcdsQ2ProjectedPredicate40Pct, + ProjectionCase::Int64AndFloat64, + ), + AsyncFocusCase::new( + "profile_q1_count_only", + parquet_file.clone(), + FilterType::ClickBenchQ41SparseFixedOutput, + ProjectionCase::CountOnly, + ), AsyncFocusCase::new( "profile_q19_no_defer", parquet_file.clone(), @@ -1080,6 +1276,11 @@ fn benchmark_async_focus_case( 
schema_descr, filter_type.filter_projection().iter().copied(), ); + let q6_int64_pred_mask = ProjectionMask::roots(schema_descr, [0]); + let q6_utf8_pred_mask = ProjectionMask::roots(schema_descr, [2]); + let q41_int64_pred_mask = ProjectionMask::roots(schema_descr, [0]); + let q41_ts_pred_mask = ProjectionMask::roots(schema_descr, [3]); + let q40_float64_pred_mask = ProjectionMask::roots(schema_descr, [1]); for strategy in strategies.iter().copied() { let bench_id = BenchmarkId::new( @@ -1092,6 +1293,11 @@ fn benchmark_async_focus_case( b.iter(|| { let reader = reader.clone(); let pred_mask = pred_mask.clone(); + let q6_int64_pred_mask = q6_int64_pred_mask.clone(); + let q6_utf8_pred_mask = q6_utf8_pred_mask.clone(); + let q41_int64_pred_mask = q41_int64_pred_mask.clone(); + let q41_ts_pred_mask = q41_ts_pred_mask.clone(); + let q40_float64_pred_mask = q40_float64_pred_mask.clone(); let projection_mask = projection_mask.clone(); let read_projection_mask = read_projection_mask.clone(); let output_column_names = output_column_names.clone(); @@ -1108,7 +1314,15 @@ fn benchmark_async_focus_case( .await } AsyncStrategy::PushdownAutoCostModel => { - let row_filter = row_filter_for(filter_type, pred_mask); + let row_filter = row_filter_for_focus_case( + filter_type, + pred_mask, + q6_int64_pred_mask, + q6_utf8_pred_mask, + q41_int64_pred_mask, + q41_ts_pred_mask, + q40_float64_pred_mask, + ); benchmark_async_reader_with_policy( reader, projection_mask, @@ -1118,7 +1332,15 @@ fn benchmark_async_focus_case( .await } AsyncStrategy::PushdownSelectors => { - let row_filter = row_filter_for(filter_type, pred_mask); + let row_filter = row_filter_for_focus_case( + filter_type, + pred_mask, + q6_int64_pred_mask, + q6_utf8_pred_mask, + q41_int64_pred_mask, + q41_ts_pred_mask, + q40_float64_pred_mask, + ); benchmark_async_reader_with_policy( reader, projection_mask, @@ -1128,7 +1350,15 @@ fn benchmark_async_focus_case( .await } AsyncStrategy::PushdownMask => { - let row_filter = 
row_filter_for(filter_type, pred_mask); + let row_filter = row_filter_for_focus_case( + filter_type, + pred_mask, + q6_int64_pred_mask, + q6_utf8_pred_mask, + q41_int64_pred_mask, + q41_ts_pred_mask, + q40_float64_pred_mask, + ); benchmark_async_reader_with_policy( reader, projection_mask, @@ -1157,8 +1387,11 @@ fn output_projection_for(filter_type: FilterType, projection_case: &ProjectionCa }) .collect(), ProjectionCase::FilterColumnsOnly => filter_columns.to_vec(), + ProjectionCase::CountOnly => vec![], ProjectionCase::FixedColumns => vec![0, 1, 3], + ProjectionCase::Float64AndTs => vec![1, 3], ProjectionCase::Float64Only => vec![1], + ProjectionCase::Int64AndFloat64 => vec![0, 1], ProjectionCase::Utf8Only => vec![2], } } @@ -1186,6 +1419,75 @@ fn row_filter_for(filter_type: FilterType, pred_mask: ProjectionMask) -> RowFilt RowFilter::new(vec![Box::new(filter)]) } +fn row_filter_for_focus_case( + filter_type: FilterType, + pred_mask: ProjectionMask, + q6_int64_pred_mask: ProjectionMask, + q6_utf8_pred_mask: ProjectionMask, + q41_int64_pred_mask: ProjectionMask, + q41_ts_pred_mask: ProjectionMask, + q40_float64_pred_mask: ProjectionMask, +) -> RowFilter { + match filter_type { + FilterType::ClickBenchQ6MixedPredicates => { + let int64_filter = + ArrowPredicateFn::new(q6_int64_pred_mask, move |batch: RecordBatch| { + let int64 = batch.column(batch.schema().index_of("int64")?); + eq(int64, &Int64Array::new_scalar(9999)) + }); + let utf8_filter = + ArrowPredicateFn::new(q6_utf8_pred_mask, move |batch: RecordBatch| { + let utf8 = batch.column(batch.schema().index_of("utf8View")?); + neq(utf8, &StringViewArray::new_scalar("")) + }); + + RowFilter::new(vec![Box::new(int64_filter), Box::new(utf8_filter)]) + } + FilterType::ClickBenchQ40ScalarGroupBy => { + let int64_filter = + ArrowPredicateFn::new(q41_int64_pred_mask, move |batch: RecordBatch| { + let int64 = batch.column(batch.schema().index_of("int64")?); + eq(int64, &Int64Array::new_scalar(62)) + }); + let 
float64_filter = + ArrowPredicateFn::new(q40_float64_pred_mask, move |batch: RecordBatch| { + let float64 = batch.column(batch.schema().index_of("float64")?); + gt(float64, &Float64Array::new_scalar(10.0)) + }); + let ts_filter = ArrowPredicateFn::new(q41_ts_pred_mask, move |batch: RecordBatch| { + let ts = batch.column(batch.schema().index_of("ts")?); + lt(ts, &TimestampMillisecondArray::new_scalar(9000)) + }); + + RowFilter::new(vec![ + Box::new(int64_filter), + Box::new(float64_filter), + Box::new(ts_filter), + ]) + } + FilterType::ClickBenchQ41SparseFixedOutput + | FilterType::TpcdsQ20ProjectedDynamicFilters + | FilterType::TpcdsQ21ProjectedFixedOutput => { + let int64_filter = + ArrowPredicateFn::new(q41_int64_pred_mask, move |batch: RecordBatch| { + let int64 = batch.column(batch.schema().index_of("int64")?); + let scalar = match filter_type { + FilterType::TpcdsQ20ProjectedDynamicFilters => 12, + _ => 8, + }; + lt(int64, &Int64Array::new_scalar(scalar)) + }); + let ts_filter = ArrowPredicateFn::new(q41_ts_pred_mask, move |batch: RecordBatch| { + let ts = batch.column(batch.schema().index_of("ts")?); + lt(ts, &TimestampMillisecondArray::new_scalar(9000)) + }); + + RowFilter::new(vec![Box::new(int64_filter), Box::new(ts_filter)]) + } + _ => row_filter_for(filter_type, pred_mask), + } +} + #[derive(Clone, Copy)] enum NestedFilterType { AlwaysTrueTag, diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 0904c816363b..4fb303e89330 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1721,6 +1721,87 @@ mod test { assert!(next_batch_with_data(&mut decoder, data).is_none()); } + #[test] + fn test_decoder_auto_cost_model_switches_for_moderate_fixed_width_deferred_output() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = 
builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(multiple_of_five_filter(&batch)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["b"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!(batch, expected_b_multiple_of_five(row_group_idx * 100, 100)); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.cost_model_fragmented_moderate_selectivity_count(), + Some(1) + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_switches_for_partially_projected_fixed_width_chain() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(multiple_of_ten_filter(&batch)), + ); + let projected_filter_b = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["b"]), + move |batch: RecordBatch| Ok(BooleanArray::from(vec![true; batch.num_rows()])), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["b"])) + 
.with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![ + Box::new(filter_a), + Box::new(projected_filter_b), + ])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!(batch, expected_b_multiple_of_ten(row_group_idx * 100, 100)); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.cost_model_fragmented_moderate_selectivity_count(), + Some(1) + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + #[test] fn test_decoder_auto_cost_model_switches_for_projected_predicate_after_observation() { let data = &COST_MODEL_TEST_FILE_DATA; @@ -1767,6 +1848,52 @@ mod test { assert_eq!(metrics.records_read_from_cache(), Some(100)); } + #[test] + fn test_decoder_auto_cost_model_switches_for_projected_predicate_with_deferred_fixed_output() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 20)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + 
expected_a_b_first_rows_per_hundred(row_group_idx * 100, 100, 20) + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.cost_model_projected_predicate_moderate_selectivity_count(), + Some(1) + ); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "projected predicate should still reuse cached predicate data" + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + #[test] fn test_decoder_auto_cost_model_uses_post_filter_after_observing_fixed_width_read_projection() { let data = &COST_MODEL_TEST_FILE_DATA; @@ -3193,6 +3320,13 @@ mod test { filter_record_batch(&projected, &filter).unwrap() } + fn expected_b_multiple_of_ten(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = multiple_of_ten_filter(&batch); + let projected = batch.project(&[1]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + fn multiple_of_ten_filter(batch: &RecordBatch) -> BooleanArray { let column = batch.column(0).as_primitive::(); BooleanArray::from( @@ -3202,6 +3336,22 @@ mod test { ) } + fn multiple_of_five_filter(batch: &RecordBatch) -> BooleanArray { + let column = batch.column(0).as_primitive::(); + BooleanArray::from( + (0..batch.num_rows()) + .map(|idx| column.value(idx) % 5 == 0) + .collect::>(), + ) + } + + fn expected_b_multiple_of_five(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = multiple_of_five_filter(&batch); + let projected = batch.project(&[1]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + fn expected_c_multiple_of_ten(offset: usize, len: usize) -> RecordBatch { let batch = TEST_BATCH.slice(offset, len); let filter = multiple_of_ten_filter(&batch); @@ -3229,6 +3379,17 @@ mod test { 
filter_record_batch(&projected, &filter).unwrap() } + fn expected_a_b_first_rows_per_hundred( + offset: usize, + len: usize, + rows_per_hundred: i64, + ) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = first_rows_per_hundred_filter(&batch, rows_per_hundred); + let projected = batch.project(&[0, 1]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + fn expected_c_every_other(offset: usize, len: usize) -> RecordBatch { let batch = TEST_BATCH.slice(offset, len); let filter = BooleanArray::from((0..len).map(|idx| idx % 2 == 0).collect::>()); diff --git a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs index ab5ef406fd84..38040bff3651 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -370,6 +370,11 @@ impl RowGroupReaderBuilder { // Projected predicates can reuse decoded predicate values, but sparse // or clustered filters can still win with page pruning. Keep this // shortcut to moderate selectivity before switching to post-filter. + // + // A TPC-DS Q2-shaped projected predicate plus one deferred fixed-width + // output column still favors post-filter once selectivity is moderate: + // the saved output decode is smaller than the row-selection and cache + // overhead. Sparse projected predicates stay below this range. 
if self.projection_includes_all(&self.projection, &predicate_projection) && (CostModelObservation::PROJECTED_PREDICATE_MIN_RATIO ..CostModelObservation::PROJECTED_PREDICATE_MAX_RATIO) From 30887835b7fa7d0a13698273224e0fe0be73ba99 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Tue, 26 May 2026 14:06:04 +0800 Subject: [PATCH 31/32] perf(parquet): gate projected predicate post-filter by deferred output cost --- parquet/benches/arrow_reader_row_filter.rs | 30 ++++++ parquet/src/arrow/push_decoder/mod.rs | 96 ++++++++++++++++++- .../push_decoder/reader_builder/cost_model.rs | 38 +++++++- .../arrow/push_decoder/reader_builder/mod.rs | 1 + 4 files changed, 159 insertions(+), 6 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 59e6361cac3a..077e3c9fd6b0 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -263,6 +263,8 @@ enum ProjectionCase { Float64AndTs, Float64Only, Int64AndFloat64, + Int64AndUtf8, + TsAndUtf8, Utf8Only, } @@ -277,6 +279,8 @@ impl std::fmt::Display for ProjectionCase { ProjectionCase::Float64AndTs => write!(f, "float64_and_ts"), ProjectionCase::Float64Only => write!(f, "float64_only"), ProjectionCase::Int64AndFloat64 => write!(f, "int64_and_float64"), + ProjectionCase::Int64AndUtf8 => write!(f, "int64_and_utf8"), + ProjectionCase::TsAndUtf8 => write!(f, "ts_and_utf8"), ProjectionCase::Utf8Only => write!(f, "utf8_only"), } } @@ -479,6 +483,10 @@ enum FilterType { /// subqueries. The selected rows are random and moderately selective, and /// benchmark projections cover both count-only and numeric aggregate cases. TpcdsQ9QuantityRange, + /// Exact shape for the projected-predicate moderate-selectivity gate: + /// a clustered 20% timestamp predicate where the predicate column is + /// projected and the deferred output is variable-width. 
+ ProjectedTs20PctClustered, /// Very sparse projected fixed-width scan shaped like TPC-DS fact-table /// filters where the predicate column is also needed in the output projection. TpcdsSparseProjectedFactScan, @@ -524,6 +532,9 @@ impl std::fmt::Display for FilterType { "int64 < 40 projected predicate with fixed output" } FilterType::TpcdsQ9QuantityRange => "int64 > 0 AND int64 < 21", + FilterType::ProjectedTs20PctClustered => { + "ts < 2000 projected predicate with utf8 output" + } FilterType::TpcdsSparseProjectedFactScan => "ts % 1000 == 0", }; write!(f, "{s}") @@ -659,6 +670,10 @@ impl FilterType { let upper = lt(int64, &Int64Array::new_scalar(21))?; and(&lower, &upper) } + FilterType::ProjectedTs20PctClustered => { + let ts = batch.column(batch.schema().index_of("ts")?); + lt(ts, &TimestampMillisecondArray::new_scalar(2000)) + } FilterType::TpcdsSparseProjectedFactScan => { let ts = batch .column(batch.schema().index_of("ts")?) @@ -698,6 +713,7 @@ impl FilterType { | FilterType::TpcdsQ2ProjectedPredicate30Pct | FilterType::TpcdsQ2ProjectedPredicate40Pct => &[0], FilterType::TpcdsQ9QuantityRange => &[0], + FilterType::ProjectedTs20PctClustered => &[3], FilterType::TpcdsSparseProjectedFactScan => &[3], } } @@ -1100,6 +1116,18 @@ fn benchmark_async_cost_model_focus(c: &mut Criterion) { FilterType::TpcdsQ2ProjectedPredicate20Pct, ProjectionCase::Int64AndFloat64, ), + AsyncFocusCase::new( + "profile_q2_projected_predicate_20pct_varwidth_output", + parquet_file.clone(), + FilterType::TpcdsQ2ProjectedPredicate20Pct, + ProjectionCase::Int64AndUtf8, + ), + AsyncFocusCase::new( + "profile_projected_ts_20pct_varwidth_output", + parquet_file.clone(), + FilterType::ProjectedTs20PctClustered, + ProjectionCase::TsAndUtf8, + ), AsyncFocusCase::new( "profile_q2_projected_predicate_30pct", parquet_file.clone(), @@ -1392,6 +1420,8 @@ fn output_projection_for(filter_type: FilterType, projection_case: &ProjectionCa ProjectionCase::Float64AndTs => vec![1, 3], 
ProjectionCase::Float64Only => vec![1], ProjectionCase::Int64AndFloat64 => vec![0, 1], + ProjectionCase::Int64AndUtf8 => vec![0, 2], + ProjectionCase::TsAndUtf8 => vec![2, 3], ProjectionCase::Utf8Only => vec![2], } } diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 4fb303e89330..ad1b2c6ee683 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1803,7 +1803,8 @@ mod test { } #[test] - fn test_decoder_auto_cost_model_switches_for_projected_predicate_after_observation() { + fn test_decoder_auto_cost_model_keeps_pushdown_for_projected_predicate_with_deferred_variable_width_output() + { let data = &COST_MODEL_TEST_FILE_DATA; let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); @@ -1839,13 +1840,68 @@ mod test { assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); - assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); - assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); assert_eq!( metrics.cost_model_projected_predicate_moderate_selectivity_count(), - Some(1) + Some(0) + ); + assert_eq!(metrics.cost_model_pushdown_still_preferred_count(), Some(1)); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "deferred variable-width output should keep projected predicate pushdown" + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_keeps_pushdown_for_projected_predicate_with_expensive_deferred_fixed_output( + ) { + let data = &WIDE_FIXED_COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = 
builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 20)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b", "c", "d", "e"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_wide_fixed_first_rows_per_hundred(row_group_idx * 100, 100, 20) + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert_eq!( + metrics.cost_model_projected_predicate_moderate_selectivity_count(), + Some(0) ); - assert_eq!(metrics.records_read_from_cache(), Some(100)); + assert_eq!(metrics.cost_model_pushdown_still_preferred_count(), Some(1)); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "expensive deferred fixed-width output should keep projected predicate pushdown" + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); } #[test] @@ -3011,6 +3067,26 @@ mod test { static COST_MODEL_TEST_FILE_DATA: LazyLock = LazyLock::new(|| write_test_file(100, 50)); + static WIDE_FIXED_TEST_BATCH: LazyLock = LazyLock::new(|| { + let a: ArrayRef = Arc::new(Int64Array::from_iter_values(0..400)); + let b: ArrayRef = Arc::new(Int64Array::from_iter_values(400..800)); + let c: ArrayRef = Arc::new(Int64Array::from_iter_values(800..1200)); + let d: ArrayRef = Arc::new(Int64Array::from_iter_values(1200..1600)); + let e: ArrayRef 
= Arc::new(Int64Array::from_iter_values(1600..2000)); + + RecordBatch::try_from_iter(vec![ + ("a", a), + ("b", b), + ("c", c), + ("d", d), + ("e", e), + ]) + .unwrap() + }); + + static WIDE_FIXED_COST_MODEL_TEST_FILE_DATA: LazyLock = + LazyLock::new(|| write_batch_test_file(&WIDE_FIXED_TEST_BATCH, 100, 50)); + static NESTED_TEST_BATCH: LazyLock = LazyLock::new(|| { let tag: ArrayRef = Arc::new(StringViewArray::from_iter_values( (0..400).map(|idx| format!("tag_{}", idx % 7)), @@ -3390,6 +3466,16 @@ mod test { filter_record_batch(&projected, &filter).unwrap() } + fn expected_wide_fixed_first_rows_per_hundred( + offset: usize, + len: usize, + rows_per_hundred: i64, + ) -> RecordBatch { + let batch = WIDE_FIXED_TEST_BATCH.slice(offset, len); + let filter = first_rows_per_hundred_filter(&batch, rows_per_hundred); + filter_record_batch(&batch, &filter).unwrap() + } + fn expected_c_every_other(offset: usize, len: usize) -> RecordBatch { let batch = TEST_BATCH.slice(offset, len); let filter = BooleanArray::from((0..len).map(|idx| idx % 2 == 0).collect::>()); diff --git a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs index 38040bff3651..ba5c80ddabac 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -297,6 +297,7 @@ impl RowGroupReaderBuilder { pub(super) fn observe_cost_model_candidate( &mut self, decision: RowSelectionStrategyDecision, + row_group_idx: usize, row_count: usize, budget: RowBudget, ) { @@ -330,7 +331,7 @@ impl RowGroupReaderBuilder { }; self.metrics.record_cost_model_observed_row_group(); - let reason = self.cost_model_reason_with_projection_context(observation); + let reason = self.cost_model_reason_with_projection_context(observation, row_group_idx); if matches!(reason, CostModelDecisionReason::ObservationIncomplete) { self.metrics.record_cost_model_trigger(reason); return; @@ -353,6 +354,7 
@@ impl RowGroupReaderBuilder { fn cost_model_reason_with_projection_context( &self, observation: CostModelObservation, + row_group_idx: usize, ) -> CostModelDecisionReason { let reason = observation.trigger_reason(); if !matches!(reason, CostModelDecisionReason::PushdownStillPreferred) { @@ -376,6 +378,8 @@ impl RowGroupReaderBuilder { // the saved output decode is smaller than the row-selection and cache // overhead. Sparse projected predicates stay below this range. if self.projection_includes_all(&self.projection, &predicate_projection) + && self + .projected_predicate_deferred_output_is_cheap(row_group_idx, &predicate_projection) && (CostModelObservation::PROJECTED_PREDICATE_MIN_RATIO ..CostModelObservation::PROJECTED_PREDICATE_MAX_RATIO) .contains(&selected_ratio) @@ -386,6 +390,38 @@ impl RowGroupReaderBuilder { } } + fn projected_predicate_deferred_output_is_cheap( + &self, + row_group_idx: usize, + predicate_projection: &ProjectionMask, + ) -> bool { + let row_group = self.metadata.row_group(row_group_idx); + if row_group.num_rows() == 0 { + return true; + } + + let mut deferred_uncompressed_bytes = 0u64; + let mut has_deferred_output = false; + for leaf_idx in 0..row_group.num_columns() { + if !self.projection.leaf_included(leaf_idx) + || predicate_projection.leaf_included(leaf_idx) + { + continue; + } + + has_deferred_output = true; + let column = row_group.column(leaf_idx); + if column.column_type() == PhysicalType::BYTE_ARRAY { + return false; + } + deferred_uncompressed_bytes += column.uncompressed_size().max(0) as u64; + } + + !has_deferred_output + || deferred_uncompressed_bytes as f64 / row_group.num_rows() as f64 + <= Self::CHEAP_FIXED_WIDTH_READ_BYTES_PER_ROW + } + pub(super) fn post_filter_cost_model_supported(&self, budget: RowBudget) -> bool { let Some(filter) = self.filter.as_ref() else { return false; diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index 
476f1ececc44..2d3c313ce4f9 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -1118,6 +1118,7 @@ impl RowGroupReaderBuilder { self.observe_cost_model_candidate( decision, + row_group_info.row_group_idx, row_group_info.row_count, row_group_info.budget, ); From 4d59bcd23297ea23761a439ce0aa63f1757cc167 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Wed, 27 May 2026 21:12:03 +0800 Subject: [PATCH 32/32] Format parquet tests after main merge --- parquet/src/arrow/push_decoder/mod.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index ad1b2c6ee683..8bdbe80744f8 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -1857,8 +1857,8 @@ mod test { } #[test] - fn test_decoder_auto_cost_model_keeps_pushdown_for_projected_predicate_with_expensive_deferred_fixed_output( - ) { + fn test_decoder_auto_cost_model_keeps_pushdown_for_projected_predicate_with_expensive_deferred_fixed_output() + { let data = &WIDE_FIXED_COST_MODEL_TEST_FILE_DATA; let builder = ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); @@ -1872,7 +1872,10 @@ mod test { let mut decoder = builder .with_batch_size(100) - .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b", "c", "d", "e"])) + .with_projection(ProjectionMask::columns( + &schema_descr, + ["a", "b", "c", "d", "e"], + )) .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) .with_metrics(metrics.clone()) @@ -3074,14 +3077,7 @@ mod test { let d: ArrayRef = Arc::new(Int64Array::from_iter_values(1200..1600)); let e: ArrayRef = Arc::new(Int64Array::from_iter_values(1600..2000)); - RecordBatch::try_from_iter(vec![ - ("a", a), - ("b", b), - ("c", c), - ("d", d), - ("e", e), - ]) - .unwrap() + 
RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c), ("d", d), ("e", e)]).unwrap() }); static WIDE_FIXED_COST_MODEL_TEST_FILE_DATA: LazyLock =