From fca1f63b0544e3ee38fd31fe0128a782ee94c248 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 9 Jun 2026 15:44:21 -0400 Subject: [PATCH 1/2] Document repetition / definition levels more --- .../array_reader/fixed_size_list_array.rs | 13 ++++ parquet/src/arrow/array_reader/list_array.rs | 17 ++++++ .../src/arrow/array_reader/list_view_array.rs | 5 +- parquet/src/arrow/array_reader/map_array.rs | 13 ++++ parquet/src/arrow/array_reader/mod.rs | 60 +++++++++++++++++++ .../src/arrow/array_reader/struct_array.rs | 13 ++++ 6 files changed, 120 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/array_reader/fixed_size_list_array.rs b/parquet/src/arrow/array_reader/fixed_size_list_array.rs index 518cd8625ed5..a0cbdfcbd0d2 100644 --- a/parquet/src/arrow/array_reader/fixed_size_list_array.rs +++ b/parquet/src/arrow/array_reader/fixed_size_list_array.rs @@ -27,6 +27,19 @@ use arrow_data::{ArrayData, transform::MutableArrayData}; use arrow_schema::DataType as ArrowType; /// Implementation of fixed-size list array reader. +/// +/// Reconstructs a `FixedSizeList` from a child reader's definition and +/// repetition levels. See [`ArrayReader`] for how the `def_level` (`D`) and +/// `rep_level` (`R`) below are interpreted. +/// +/// The definition-level states are the same as [`ListArrayReader`], except +/// that: +/// +/// 1. A *present* row (`d >= D`) contributes exactly `fixed_size` child values. +/// 2. Empty (`d == D - 1`) and null (`d <= D - 2`) rows contribute no +/// child values and are null-padded to `fixed_size` on output. 
+/// +/// [`ListArrayReader`]: crate::arrow::array_reader::ListArrayReader pub struct FixedSizeListArrayReader { item_reader: Box<dyn ArrayReader>, /// The number of child items in each row of the list array diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index e6c834096902..740a63f88b72 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -32,6 +32,23 @@ use std::marker::PhantomData; use std::sync::Arc; /// Implementation of list array reader. +/// +/// Reconstructs a `List`/`LargeList` from a child reader's definition and +/// repetition levels. See [`ArrayReader`] for how the `def_level` (`D`) and +/// `rep_level` (`R`) below are interpreted +/// +/// ```text +/// list at level D d >= D present, ≥1 value +/// d == D - 1 present but empty [] +/// d <= D - 2 null (nullable lists only) +/// ``` +/// +/// For example, with `D == 2` +/// * a column of `[10, null, 20]` +/// * null +/// * `[]` +/// +/// Produces child def levels `3, 2, 3`, then `0`, then `1`. pub struct ListArrayReader { item_reader: Box<dyn ArrayReader>, data_type: ArrowType, diff --git a/parquet/src/arrow/array_reader/list_view_array.rs b/parquet/src/arrow/array_reader/list_view_array.rs index 357ab9dc14ae..c9ad3ccce0dc 100644 --- a/parquet/src/arrow/array_reader/list_view_array.rs +++ b/parquet/src/arrow/array_reader/list_view_array.rs @@ -24,7 +24,10 @@ use std::any::Any; use std::sync::Arc; /// Implementation of list view array reader. -/// This wraps a ListArrayReader and converts the result to ListViewArray. +/// +/// This wraps a [`ListArrayReader`] and converts the result to a +/// `ListView`/`LargeListView`. Definition and repetition levels are interpreted +/// identically to [`ListArrayReader`]. 
pub struct ListViewArrayReader { inner: ListArrayReader, data_type: ArrowType, diff --git a/parquet/src/arrow/array_reader/map_array.rs b/parquet/src/arrow/array_reader/map_array.rs index 1639aca6293f..ae6cd8d005df 100644 --- a/parquet/src/arrow/array_reader/map_array.rs +++ b/parquet/src/arrow/array_reader/map_array.rs @@ -23,6 +23,19 @@ use std::any::Any; use std::sync::Arc; /// Implementation of a map array reader. +/// +/// A Parquet map is encoded as `List(Struct(key, value))`, so this reader is a +/// thin wrapper around a [`ListArrayReader`] whose item is a +/// [`StructArrayReader`]. +/// +/// See [`ArrayReader`] for how definition and repetition +/// levels are interpreted; the map's own `def_level`/`rep_level` drive the +/// outer list, while the key/value struct is one repetition level deeper: +/// +/// ```text +/// struct_rep_level = rep_level + 1 +/// struct_def_level = def_level + 1 (+2 when the map itself is nullable) +/// ``` pub struct MapArrayReader { data_type: ArrowType, reader: ListArrayReader, diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index 989c93480993..dcc4a89b023d 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -85,6 +85,66 @@ pub use struct_array::StructArrayReader; /// /// Data can either be read in batches using [`ArrayReader::next_batch`] or /// incrementally using [`ArrayReader::read_records`] and [`ArrayReader::skip_records`]. +/// +/// # Definition and repetition levels +/// +/// Parquet encodes nesting, nulls, and empty lists using *definition* and +/// *repetition* levels, based on the [Dremel paper]. 
Some example nested +/// readers are: +/// * [`ListArrayReader`] +/// * [`FixedSizeListArrayReader`] +/// * [`MapArrayReader`] +/// * [`StructArrayReader`] +/// +/// Each nested reader accesses the levels via [`ArrayReader::get_def_levels`] +/// and [`ArrayReader::get_rep_levels`] and uses them to reconstruct nulls, +/// empty lists, and list boundaries. +/// +/// Each nested reader is built with a definition level `D` and a repetition +/// level `R` taken from its [`ParquetField`] (see its `def_level` / `rep_level` +/// fields). Given a child's level pair `(d, r)`, the two levels are interpreted +/// as follows. +/// +/// **Definition level** — how "present" the value is at this level: +/// +/// ```text +/// ┌───────────────────────────┬────────────────────────────────────┐ +/// │ State │ def level (d) │ +/// ├───────────────────────────┼────────────────────────────────────┤ +/// │ present, with a value │ d >= D │ +/// │ present but empty (list) │ d == D - 1 │ +/// │ null │ d <= D - 2 (nullable only) │ +/// └───────────────────────────┴────────────────────────────────────┘ +/// ``` +/// +/// Note that not every reader uses all three states: +/// * a non-nullable list has no `null` row — only `d >= D` (has values) vs the +/// empty `d == D - 1`; +/// * a [`StructArrayReader`] has no `empty` row — only present `d >= D` vs null +/// `d < D`. 
+/// +/// **Repetition level** — where a value attaches relative to this reader's list: +/// +/// ```text +/// ┌──────────┬──────────────────────────────────────────────────────────┐ +/// │ r vs R │ meaning │ +/// ├──────────┼──────────────────────────────────────────────────────────┤ +/// │ r < R │ start of a new row at this level (outer/record boundary) │ +/// │ r == R │ another element appended to the current list row │ +/// │ r > R │ belongs to a more deeply nested child; already handled by │ +/// │ │ the child reader │ +/// └──────────┴──────────────────────────────────────────────────────────┘ +/// ``` +/// +/// # See Also +/// +/// See [`arrow_writer`] module for more details on how repetition and +/// definition levels are produced. +/// +/// [Dremel paper]: https://research.google/pubs/dremel-interactive-analysis-of-web-scale-datasets-2/ +/// [`arrow_writer`]: crate::arrow::arrow_writer +/// [`ParquetField`]: crate::arrow::schema::ParquetField +#[allow(rustdoc::private_intra_doc_links)] pub trait ArrayReader: Send { // TODO: this function is never used, and the trait is not public. Perhaps this should be // removed. diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index da92d410f32b..7d9c2df1ee9c 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -24,6 +24,19 @@ use std::any::Any; use std::sync::Arc; /// Implementation of struct array reader. +/// +/// Reconstructs a `Struct` from its children's definition and repetition +/// levels. See [`ArrayReader`] for how `struct_def_level` (`D`) and +/// `struct_rep_level` (`R`) below are interpreted. Structs only use two +/// definition states: +/// +/// ```text +/// d >= D struct row is valid +/// d < D struct row is null +/// ``` +/// +/// where `d` is the per-row minimum definition level across the children +/// (children share the same view of the parent, so the first child suffices). 
pub struct StructArrayReader { children: Vec<Box<dyn ArrayReader>>, data_type: ArrowType, From 44f13d1d600349e6d0a2d8c39cbf70facd0228c3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 9 Jun 2026 16:03:48 -0400 Subject: [PATCH 2/2] tweaks --- parquet/src/arrow/array_reader/list_array.rs | 16 ++++++++-------- parquet/src/arrow/array_reader/map_array.rs | 10 +++++----- parquet/src/arrow/array_reader/mod.rs | 10 +++++----- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index 740a63f88b72..1e6350fc6550 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -35,20 +35,20 @@ use std::sync::Arc; /// /// Reconstructs a `List`/`LargeList` from a child reader's definition and /// repetition levels. See [`ArrayReader`] for how the `def_level` (`D`) and -/// `rep_level` (`R`) below are interpreted +/// `rep_level` (`R`) below are interpreted. /// /// ```text -/// list at level D d >= D present, ≥1 value +/// list at level D d >= D present, ≥1 element /// d == D - 1 present but empty [] /// d <= D - 2 null (nullable lists only) /// ``` /// -/// For example, with `D == 2` -/// * a column of `[10, null, 20]` -/// * null -/// * `[]` -/// -/// Produces child def levels `3, 2, 3`, then `0`, then `1`. +/// For example, with `D == 2`, the rows: +/// * `[10, null, 20]` +/// * `null` +/// * `[]` +/// +/// produce child def levels `3, 2, 3`, then `0`, then `1`. 
pub struct ListArrayReader { item_reader: Box<dyn ArrayReader>, data_type: ArrowType, diff --git a/parquet/src/arrow/array_reader/map_array.rs b/parquet/src/arrow/array_reader/map_array.rs index ae6cd8d005df..348ea25e9d3a 100644 --- a/parquet/src/arrow/array_reader/map_array.rs +++ b/parquet/src/arrow/array_reader/map_array.rs @@ -26,11 +26,11 @@ use std::sync::Arc; /// /// A Parquet map is encoded as `List(Struct(key, value))`, so this reader is a /// thin wrapper around a [`ListArrayReader`] whose item is a -/// [`StructArrayReader`]. -/// -/// See [`ArrayReader`] for how definition and repetition -/// levels are interpreted; the map's own `def_level`/`rep_level` drive the -/// outer list, while the key/value struct is one repetition level deeper: +/// [`StructArrayReader`]. +/// +/// See [`ArrayReader`] for how definition and repetition levels are +/// interpreted; the map's own `def_level`/`rep_level` drive the outer list, +/// while the key/value struct is one repetition level deeper: /// /// ```text /// struct_rep_level = rep_level + 1 diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index dcc4a89b023d..19e6417bd57e 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -118,10 +118,10 @@ pub use struct_array::StructArrayReader; /// ``` /// /// Note that not every reader uses all three states: -/// * a non-nullable list has no `null` row — only `d >= D` (has values) vs the -/// empty `d == D - 1`; -/// * a [`StructArrayReader`] has no `empty` row — only present `d >= D` vs null -/// `d < D`. +/// * a non-nullable list has no `null` state — only `d >= D` (has values) vs +/// the empty `d == D - 1`; +/// * a [`StructArrayReader`] has no `empty` state — only present `d >= D` vs +/// null `d < D`. 
/// /// **Repetition level** — where a value attaches relative to this reader's list: /// @@ -138,7 +138,7 @@ pub use struct_array::StructArrayReader; /// /// # See Also /// -/// See [`arrow_writer`] module for more details on how repetition and +/// See the [`arrow_writer`] module for more details on how repetition and /// definition levels are produced. /// /// [Dremel paper]: https://research.google/pubs/dremel-interactive-analysis-of-web-scale-datasets-2/