From ae0741b8d6f203d881de12f9f5d8f0d326f5093c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 21 Mar 2026 07:43:26 -0600 Subject: [PATCH 01/18] feat: add standalone shuffle benchmark binary for profiling Add a `shuffle_bench` binary that benchmarks shuffle write and read performance independently from Spark, making it easy to profile with tools like `cargo flamegraph`, `perf`, or `instruments`. Supports reading Parquet files (e.g. TPC-H/TPC-DS) or generating synthetic data with configurable schema. Covers different scenarios including compression codecs, partition counts, partitioning schemes, and memory-constrained spilling. --- native/Cargo.lock | 88 +++- native/core/Cargo.toml | 5 + native/core/src/bin/shuffle_bench.rs | 725 +++++++++++++++++++++++++++ 3 files changed, 816 insertions(+), 2 deletions(-) create mode 100644 native/core/src/bin/shuffle_bench.rs diff --git a/native/Cargo.lock b/native/Cargo.lock index 5f99c614b3..f43b41dd9a 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -96,12 +96,56 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + [[package]] name = "anstyle" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = 
"anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.60.2", +] + [[package]] name = "anyhow" version = "1.0.102" @@ -1331,6 +1375,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] @@ -1339,8 +1384,22 @@ version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" dependencies = [ + "anstream", "anstyle", "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", ] [[package]] @@ -1358,6 +1417,12 @@ dependencies = [ "cc", ] +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + [[package]] name = "combine" version = "4.6.7" @@ -1834,6 +1899,7 @@ dependencies = [ "aws-config", "aws-credential-types", "bytes", + "clap", "crc32fast", "criterion", "datafusion", @@ -1885,7 +1951,7 @@ dependencies = [ [[package]] name = "datafusion-comet-common" -version = "0.14.0" +version = "0.15.0" dependencies = [ "arrow", "datafusion", @@ -1911,7 +1977,7 @@ dependencies = [ [[package]] name = 
"datafusion-comet-jni-bridge" -version = "0.14.0" +version = "0.15.0" dependencies = [ "arrow", "assertables", @@ -3609,6 +3675,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.13.0" @@ -4289,6 +4361,12 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "oorandom" version = "11.1.5" @@ -6339,6 +6417,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.22.0" diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index 3f305a631d..3df9e55719 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -72,6 +72,7 @@ url = { workspace = true } aws-config = { workspace = true } aws-credential-types = { workspace = true } parking_lot = "0.12.5" +clap = { version = "4", features = ["derive"] } datafusion-comet-objectstore-hdfs = { path = "../hdfs", optional = true, default-features = false, features = ["hdfs"] } reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2"] } object_store_opendal = {version = "0.55.0", optional = true} @@ -113,6 +114,10 @@ name = "comet" # "rlib" is for 
benchmarking with criterion. crate-type = ["cdylib", "rlib"] +[[bin]] +name = "shuffle_bench" +path = "src/bin/shuffle_bench.rs" + [[bench]] name = "parquet_read" harness = false diff --git a/native/core/src/bin/shuffle_bench.rs b/native/core/src/bin/shuffle_bench.rs new file mode 100644 index 0000000000..c1498161f7 --- /dev/null +++ b/native/core/src/bin/shuffle_bench.rs @@ -0,0 +1,725 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Standalone shuffle benchmark tool for profiling Comet shuffle write and read +//! outside of Spark. +//! +//! # Usage +//! +//! Read from Parquet files (e.g. TPC-H lineitem): +//! ```sh +//! cargo run --release --bin shuffle_bench -- \ +//! --input /data/tpch-sf100/lineitem/ \ +//! --partitions 200 \ +//! --codec zstd --zstd-level 1 \ +//! --hash-columns 0,3 \ +//! --read-back +//! ``` +//! +//! Generate synthetic data: +//! ```sh +//! cargo run --release --bin shuffle_bench -- \ +//! --generate --gen-rows 10000000 --gen-string-cols 4 --gen-int-cols 4 \ +//! --gen-decimal-cols 2 --gen-avg-string-len 32 \ +//! --partitions 200 --codec lz4 --read-back +//! ``` +//! +//! Profile with flamegraph: +//! ```sh +//! cargo flamegraph --release --bin shuffle_bench -- \ +//! 
--input /data/tpch-sf100/lineitem/ \ +//! --partitions 200 --codec zstd --zstd-level 1 +//! ``` + +use arrow::array::builder::{Date32Builder, Decimal128Builder, Int64Builder, StringBuilder}; +use arrow::array::RecordBatch; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use clap::Parser; +use comet::execution::shuffle::{ + read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleWriterExec, +}; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; +use datafusion::execution::config::SessionConfig; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_plan::common::collect; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::prelude::SessionContext; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use rand::RngExt; +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +#[derive(Parser, Debug)] +#[command( + name = "shuffle_bench", + about = "Standalone benchmark for Comet shuffle write and read performance" )] +struct Args { + /// Path to input Parquet file or directory of Parquet files + #[arg(long)] + input: Option<PathBuf>, + + /// Generate synthetic data instead of reading from Parquet + #[arg(long, default_value_t = false)] + generate: bool, + + /// Number of rows to generate (requires --generate) + #[arg(long, default_value_t = 1_000_000)] + gen_rows: usize, + + /// Number of Int64 columns to generate + #[arg(long, default_value_t = 4)] + gen_int_cols: usize, + + /// Number of Utf8 string columns to generate + #[arg(long, default_value_t = 2)] + gen_string_cols: usize, + + /// Number of Decimal128 columns to generate + #[arg(long, default_value_t = 2)] + gen_decimal_cols: usize, + + /// Number of Date32 columns to generate + #[arg(long, default_value_t = 1)] + gen_date_cols: usize, + + /// Average string length for generated string columns + 
#[arg(long, default_value_t = 24)] + gen_avg_string_len: usize, + + /// Batch size for reading Parquet or generating data + #[arg(long, default_value_t = 8192)] + batch_size: usize, + + /// Number of output shuffle partitions + #[arg(long, default_value_t = 200)] + partitions: usize, + + /// Partitioning scheme: hash, single, round-robin + #[arg(long, default_value = "hash")] + partitioning: String, + + /// Column indices to hash on (comma-separated, e.g. "0,3") + #[arg(long, default_value = "0")] + hash_columns: String, + + /// Compression codec: none, lz4, zstd, snappy + #[arg(long, default_value = "zstd")] + codec: String, + + /// Zstd compression level (1-22) + #[arg(long, default_value_t = 1)] + zstd_level: i32, + + /// Memory limit in bytes (triggers spilling when exceeded) + #[arg(long)] + memory_limit: Option<usize>, + + /// Also benchmark reading back the shuffle output + #[arg(long, default_value_t = false)] + read_back: bool, + + /// Number of iterations to run + #[arg(long, default_value_t = 1)] + iterations: usize, + + /// Number of warmup iterations before timing + #[arg(long, default_value_t = 0)] + warmup: usize, + + /// Output directory for shuffle data/index files + #[arg(long, default_value = "/tmp/comet_shuffle_bench")] + output_dir: PathBuf, + + /// Write buffer size in bytes + #[arg(long, default_value_t = 1048576)] + write_buffer_size: usize, +} + +fn main() { + let args = Args::parse(); + + // Validate args + if args.input.is_none() && !args.generate { + eprintln!("Error: must specify either --input or --generate"); + std::process::exit(1); + } + + // Create output directory + fs::create_dir_all(&args.output_dir).expect("Failed to create output directory"); + + let data_file = args.output_dir.join("data.out"); + let index_file = args.output_dir.join("index.out"); + + // Load data + let load_start = Instant::now(); + let batches = if let Some(ref input_path) = args.input { + load_parquet(input_path, args.batch_size) + } else { + generate_data(&args)
+ }; + let load_elapsed = load_start.elapsed(); + + let schema = batches[0].schema(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); + + println!("=== Shuffle Benchmark ==="); + println!( + "Data source: {}", + if args.input.is_some() { + "parquet" + } else { + "generated" + } + ); + println!( + "Schema: {} columns ({} fields)", + schema.fields().len(), + describe_schema(&schema) + ); + println!("Total rows: {}", format_number(total_rows)); + println!("Total size: {}", format_bytes(total_bytes)); + println!("Batches: {}", batches.len()); + println!( + "Rows/batch: ~{}", + if batches.is_empty() { + 0 + } else { + total_rows / batches.len() + } + ); + println!("Load time: {:.3}s", load_elapsed.as_secs_f64()); + println!(); + + let codec = parse_codec(&args.codec, args.zstd_level); + let hash_col_indices = parse_hash_columns(&args.hash_columns); + + println!("Partitioning: {}", args.partitioning); + println!("Partitions: {}", args.partitions); + println!("Codec: {:?}", codec); + println!("Hash columns: {:?}", hash_col_indices); + if let Some(mem_limit) = args.memory_limit { + println!("Memory limit: {}", format_bytes(mem_limit)); + } + println!( + "Iterations: {} (warmup: {})", + args.iterations, args.warmup + ); + println!(); + + // Run warmup + timed iterations + let total_iters = args.warmup + args.iterations; + let mut write_times = Vec::with_capacity(args.iterations); + let mut read_times = Vec::with_capacity(args.iterations); + let mut data_file_sizes = Vec::with_capacity(args.iterations); + + for i in 0..total_iters { + let is_warmup = i < args.warmup; + let label = if is_warmup { + format!("warmup {}/{}", i + 1, args.warmup) + } else { + format!("iter {}/{}", i - args.warmup + 1, args.iterations) + }; + + // Write phase + let write_elapsed = run_shuffle_write( + &batches, + &schema, + &codec, + &hash_col_indices, + &args, + data_file.to_str().unwrap(), + 
index_file.to_str().unwrap(), + ); + let data_size = fs::metadata(&data_file).map(|m| m.len()).unwrap_or(0); + + if !is_warmup { + write_times.push(write_elapsed); + data_file_sizes.push(data_size); + } + + print!(" [{label}] write: {:.3}s", write_elapsed); + print!(" output: {}", format_bytes(data_size as usize)); + + // Read phase + if args.read_back { + let read_elapsed = run_shuffle_read( + data_file.to_str().unwrap(), + index_file.to_str().unwrap(), + args.partitions, + ); + if !is_warmup { + read_times.push(read_elapsed); + } + print!(" read: {:.3}s", read_elapsed); + } + println!(); + } + + // Print summary + if args.iterations > 0 { + println!(); + println!("=== Results ==="); + + let avg_write = write_times.iter().sum::<f64>() / write_times.len() as f64; + let avg_data_size = data_file_sizes.iter().sum::<u64>() / data_file_sizes.len() as u64; + let write_throughput_rows = total_rows as f64 / avg_write; + let write_throughput_bytes = total_bytes as f64 / avg_write; + let compression_ratio = if avg_data_size > 0 { + total_bytes as f64 / avg_data_size as f64 + } else { + 0.0 + }; + + println!("Write:"); + println!(" avg time: {:.3}s", avg_write); + if write_times.len() > 1 { + let min = write_times.iter().cloned().fold(f64::INFINITY, f64::min); + let max = write_times + .iter() + .cloned() + .fold(f64::NEG_INFINITY, f64::max); + println!(" min/max: {:.3}s / {:.3}s", min, max); + } + println!( + " throughput: {}/s ({} rows/s)", + format_bytes(write_throughput_bytes as usize), + format_number(write_throughput_rows as usize) + ); + println!( + " output size: {}", + format_bytes(avg_data_size as usize) + ); + println!(" compression: {:.2}x", compression_ratio); + + if !read_times.is_empty() { + let avg_read = read_times.iter().sum::<f64>() / read_times.len() as f64; + let read_throughput_bytes = avg_data_size as f64 / avg_read; + + println!("Read:"); + println!(" avg time: {:.3}s", avg_read); + if read_times.len() > 1 { + let min = read_times.iter().cloned().fold(f64::INFINITY, 
f64::min); + let max = read_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + println!(" min/max: {:.3}s / {:.3}s", min, max); + } + println!( + " throughput: {}/s (from compressed)", + format_bytes(read_throughput_bytes as usize) + ); + } + } + + // Cleanup + let _ = fs::remove_file(&data_file); + let _ = fs::remove_file(&index_file); +} + +fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec<RecordBatch> { + let mut batches = Vec::new(); + + let paths = if path.is_dir() { + let mut files: Vec<PathBuf> = fs::read_dir(path) + .expect("Failed to read input directory") + .filter_map(|entry| { + let entry = entry.ok()?; + let p = entry.path(); + if p.extension().and_then(|e| e.to_str()) == Some("parquet") { + Some(p) + } else { + None + } + }) + .collect(); + files.sort(); + if files.is_empty() { + panic!("No .parquet files found in {}", path.display()); + } + files + } else { + vec![path.clone()] + }; + + for file_path in &paths { + let file = fs::File::open(file_path) + .unwrap_or_else(|e| panic!("Failed to open {}: {}", file_path.display(), e)); + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap_or_else(|e| { + panic!( + "Failed to read Parquet metadata from {}: {}", + file_path.display(), + e + ) + }); + let reader = builder + .with_batch_size(batch_size) + .build() + .unwrap_or_else(|e| { + panic!( + "Failed to build Parquet reader for {}: {}", + file_path.display(), + e + ) + }); + for batch_result in reader { + let batch = batch_result.unwrap_or_else(|e| { + panic!("Failed to read batch from {}: {}", file_path.display(), e) + }); + if batch.num_rows() > 0 { + batches.push(batch); + } + } + } + + if batches.is_empty() { + panic!("No data read from input"); + } + + println!( + "Loaded {} batches from {} file(s)", + batches.len(), + paths.len() + ); + batches +} + +fn generate_data(args: &Args) -> Vec<RecordBatch> { + let mut fields = Vec::new(); + let mut col_idx = 0; + + // Int64 columns + for _ in 0..args.gen_int_cols { + fields.push(Field::new( + 
format!("int_col_{col_idx}"), + DataType::Int64, + true, + )); + col_idx += 1; + } + // String columns + for _ in 0..args.gen_string_cols { + fields.push(Field::new( + format!("str_col_{col_idx}"), + DataType::Utf8, + true, + )); + col_idx += 1; + } + // Decimal columns + for _ in 0..args.gen_decimal_cols { + fields.push(Field::new( + format!("dec_col_{col_idx}"), + DataType::Decimal128(18, 2), + true, + )); + col_idx += 1; + } + // Date columns + for _ in 0..args.gen_date_cols { + fields.push(Field::new( + format!("date_col_{col_idx}"), + DataType::Date32, + true, + )); + col_idx += 1; + } + + let schema = Arc::new(Schema::new(fields)); + let mut batches = Vec::new(); + let mut rng = rand::rng(); + let mut remaining = args.gen_rows; + + while remaining > 0 { + let batch_rows = remaining.min(args.batch_size); + remaining -= batch_rows; + + let mut columns: Vec<Arc<dyn arrow::array::Array>> = Vec::new(); + + // Int64 columns + for _ in 0..args.gen_int_cols { + let mut builder = Int64Builder::with_capacity(batch_rows); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + builder.append_value(rng.random_range(0..1_000_000i64)); + } + } + columns.push(Arc::new(builder.finish())); + } + // String columns + for _ in 0..args.gen_string_cols { + let mut builder = + StringBuilder::with_capacity(batch_rows, batch_rows * args.gen_avg_string_len); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + let len = rng.random_range(1..args.gen_avg_string_len * 2); + let s: String = (0..len) + .map(|_| rng.random_range(b'a'..=b'z') as char) + .collect(); + builder.append_value(&s); + } + } + columns.push(Arc::new(builder.finish())); + } + // Decimal columns + for _ in 0..args.gen_decimal_cols { + let mut builder = Decimal128Builder::with_capacity(batch_rows) + .with_precision_and_scale(18, 2) + .unwrap(); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + 
builder.append_value(rng.random_range(0..100_000_000i128)); + } + } + columns.push(Arc::new(builder.finish())); + } + // Date columns + for _ in 0..args.gen_date_cols { + let mut builder = Date32Builder::with_capacity(batch_rows); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + builder.append_value(rng.random_range(0..20000i32)); + } + } + columns.push(Arc::new(builder.finish())); + } + + let batch = RecordBatch::try_new(Arc::clone(&schema), columns).unwrap(); + batches.push(batch); + } + + println!( + "Generated {} batches ({} rows)", + batches.len(), + args.gen_rows + ); + batches +} + +fn run_shuffle_write( + batches: &[RecordBatch], + schema: &SchemaRef, + codec: &CompressionCodec, + hash_col_indices: &[usize], + args: &Args, + data_file: &str, + index_file: &str, +) -> f64 { + let partitioning = build_partitioning( + &args.partitioning, + args.partitions, + hash_col_indices, + schema, + ); + + let partitions = &[batches.to_vec()]; + let exec = ShuffleWriterExec::try_new( + Arc::new(DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(partitions, Arc::clone(schema), None).unwrap(), + ))), + partitioning, + codec.clone(), + data_file.to_string(), + index_file.to_string(), + false, + args.write_buffer_size, + ) + .expect("Failed to create ShuffleWriterExec"); + + let config = SessionConfig::new().with_batch_size(args.batch_size); + let mut runtime_builder = RuntimeEnvBuilder::new(); + if let Some(mem_limit) = args.memory_limit { + runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); + } + let runtime_env = Arc::new(runtime_builder.build().unwrap()); + let ctx = SessionContext::new_with_config_rt(config, runtime_env); + let task_ctx = ctx.task_ctx(); + + let start = Instant::now(); + let stream = exec.execute(0, task_ctx).unwrap(); + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(collect(stream)).unwrap(); + start.elapsed().as_secs_f64() +} + +fn 
run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> f64 { + let start = Instant::now(); + + // Read index file to get partition offsets + let index_bytes = fs::read(index_file).expect("Failed to read index file"); + let num_offsets = index_bytes.len() / 8; + let offsets: Vec<i64> = (0..num_offsets) + .map(|i| { + let bytes: [u8; 8] = index_bytes[i * 8..(i + 1) * 8].try_into().unwrap(); + i64::from_le_bytes(bytes) + }) + .collect(); + + // Read data file + let data_bytes = fs::read(data_file).expect("Failed to read data file"); + + let mut total_rows = 0usize; + let mut total_batches = 0usize; + + // Decode each partition's data + for p in 0..num_partitions.min(offsets.len().saturating_sub(1)) { + let start_offset = offsets[p] as usize; + let end_offset = offsets[p + 1] as usize; + + if start_offset >= end_offset { + continue; // Empty partition + } + + // Read all IPC blocks within this partition + let mut offset = start_offset; + while offset < end_offset { + // First 8 bytes: IPC length + let ipc_length = + u64::from_le_bytes(data_bytes[offset..offset + 8].try_into().unwrap()) as usize; + + // Skip 8-byte length prefix, then 8 bytes of field_count + codec header + let block_data = &data_bytes[offset + 16..offset + 8 + ipc_length]; + let batch = read_ipc_compressed(block_data).expect("Failed to decode shuffle block"); + total_rows += batch.num_rows(); + total_batches += 1; + + offset += 8 + ipc_length; + } + } + + let elapsed = start.elapsed().as_secs_f64(); + eprintln!( + " read back {} rows in {} batches from {} partitions", + format_number(total_rows), + total_batches, + num_partitions + ); + elapsed +} + +fn build_partitioning( + scheme: &str, + num_partitions: usize, + hash_col_indices: &[usize], + schema: &SchemaRef, +) -> CometPartitioning { + match scheme { + "single" => CometPartitioning::SinglePartition, + "round-robin" => CometPartitioning::RoundRobin(num_partitions, 0), + "hash" => { + let exprs: Vec<Arc<dyn datafusion::physical_expr::PhysicalExpr>> = hash_col_indices + .iter() + 
.map(|&idx| { + let field = schema.field(idx); + Arc::new(Column::new(field.name(), idx)) + as Arc<dyn datafusion::physical_expr::PhysicalExpr> + }) + .collect(); + CometPartitioning::Hash(exprs, num_partitions) + } + other => { + eprintln!("Unknown partitioning scheme: {other}. Using hash."); + build_partitioning("hash", num_partitions, hash_col_indices, schema) + } + } +} + +fn parse_codec(codec: &str, zstd_level: i32) -> CompressionCodec { + match codec.to_lowercase().as_str() { + "none" => CompressionCodec::None, + "lz4" => CompressionCodec::Lz4Frame, + "zstd" => CompressionCodec::Zstd(zstd_level), + "snappy" => CompressionCodec::Snappy, + other => { + eprintln!("Unknown codec: {other}. Using zstd."); + CompressionCodec::Zstd(zstd_level) + } + } +} + +fn parse_hash_columns(s: &str) -> Vec<usize> { + s.split(',') + .filter(|s| !s.is_empty()) + .map(|s| s.trim().parse::<usize>().expect("Invalid column index")) + .collect() +} + +fn describe_schema(schema: &Schema) -> String { + let mut counts = std::collections::HashMap::new(); + for field in schema.fields() { + let type_name = match field.data_type() { + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 => "int", + DataType::Float16 | DataType::Float32 | DataType::Float64 => "float", + DataType::Utf8 | DataType::LargeUtf8 => "string", + DataType::Boolean => "bool", + DataType::Date32 | DataType::Date64 => "date", + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => "decimal", + DataType::Timestamp(_, _) => "timestamp", + DataType::Binary | DataType::LargeBinary | DataType::FixedSizeBinary(_) => "binary", + _ => "other", + }; + *counts.entry(type_name).or_insert(0) += 1; + } + let mut parts: Vec<String> = counts + .into_iter() + .map(|(k, v)| format!("{}x{}", v, k)) + .collect(); + parts.sort(); + parts.join(", ") +} + +fn format_number(n: usize) -> String { + let s = n.to_string(); + let mut result = String::new(); + for (i, c) in s.chars().rev().enumerate() { + 
if i > 0 && i % 3 == 0 { + result.push(','); + } + result.push(c); + } + result.chars().rev().collect() +} + +fn format_bytes(bytes: usize) -> String { + if bytes >= 1024 * 1024 * 1024 { + format!("{:.2} GiB", bytes as f64 / (1024.0 * 1024.0 * 1024.0)) + } else if bytes >= 1024 * 1024 { + format!("{:.2} MiB", bytes as f64 / (1024.0 * 1024.0)) + } else if bytes >= 1024 { + format!("{:.2} KiB", bytes as f64 / 1024.0) + } else { + format!("{bytes} B") + } +} From 9b5b305cb125a76e22f55ac61ee4994ff9e9f484 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 21 Mar 2026 09:05:16 -0600 Subject: [PATCH 02/18] feat: add --limit option to shuffle benchmark (default 1M rows) --- native/core/src/bin/shuffle_bench.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/native/core/src/bin/shuffle_bench.rs b/native/core/src/bin/shuffle_bench.rs index c1498161f7..9b963c5803 100644 --- a/native/core/src/bin/shuffle_bench.rs +++ b/native/core/src/bin/shuffle_bench.rs @@ -152,6 +152,10 @@ struct Args { /// Write buffer size in bytes #[arg(long, default_value_t = 1048576)] write_buffer_size: usize, + + /// Maximum number of rows to use (default: 1,000,000) + #[arg(long, default_value_t = 1_000_000)] + limit: usize, } fn main() { @@ -178,6 +182,26 @@ fn main() { }; let load_elapsed = load_start.elapsed(); + // Apply row limit + let batches = { + let mut limited = Vec::new(); + let mut rows_so_far = 0usize; + for batch in batches { + if rows_so_far >= args.limit { + break; + } + let remaining = args.limit - rows_so_far; + if batch.num_rows() <= remaining { + rows_so_far += batch.num_rows(); + limited.push(batch); + } else { + limited.push(batch.slice(0, remaining)); + rows_so_far += remaining; + } + } + limited + }; + let schema = batches[0].schema(); let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); From e1ab490c57b04387e4885284811ccf2ad993f4a6 Mon Sep 17 00:00:00 
2001 From: Andy Grove Date: Sat, 21 Mar 2026 09:19:01 -0600 Subject: [PATCH 03/18] perf: apply limit during parquet read to avoid scanning all files --- native/core/src/bin/shuffle_bench.rs | 43 ++++++++++++---------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/native/core/src/bin/shuffle_bench.rs b/native/core/src/bin/shuffle_bench.rs index 9b963c5803..17b1a9a6ff 100644 --- a/native/core/src/bin/shuffle_bench.rs +++ b/native/core/src/bin/shuffle_bench.rs @@ -176,32 +176,12 @@ fn main() { // Load data let load_start = Instant::now(); let batches = if let Some(ref input_path) = args.input { - load_parquet(input_path, args.batch_size) + load_parquet(input_path, args.batch_size, args.limit) } else { generate_data(&args) }; let load_elapsed = load_start.elapsed(); - // Apply row limit - let batches = { - let mut limited = Vec::new(); - let mut rows_so_far = 0usize; - for batch in batches { - if rows_so_far >= args.limit { - break; - } - let remaining = args.limit - rows_so_far; - if batch.num_rows() <= remaining { - rows_so_far += batch.num_rows(); - limited.push(batch); - } else { - limited.push(batch.slice(0, remaining)); - rows_so_far += remaining; - } - } - limited - }; - let schema = batches[0].schema(); let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); @@ -358,8 +338,9 @@ fn main() { let _ = fs::remove_file(&index_file); } -fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { +fn load_parquet(path: &PathBuf, batch_size: usize, limit: usize) -> Vec { let mut batches = Vec::new(); + let mut total_rows = 0usize; let paths = if path.is_dir() { let mut files: Vec = fs::read_dir(path) @@ -383,7 +364,7 @@ fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { vec![path.clone()] }; - for file_path in &paths { + 'outer: for file_path in &paths { let file = fs::File::open(file_path) .unwrap_or_else(|e| panic!("Failed to open {}: {}", 
file_path.display(), e)); let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap_or_else(|e| { @@ -407,8 +388,19 @@ fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { let batch = batch_result.unwrap_or_else(|e| { panic!("Failed to read batch from {}: {}", file_path.display(), e) }); - if batch.num_rows() > 0 { + if batch.num_rows() == 0 { + continue; + } + let remaining = limit - total_rows; + if batch.num_rows() <= remaining { + total_rows += batch.num_rows(); batches.push(batch); + } else { + batches.push(batch.slice(0, remaining)); + total_rows += remaining; + } + if total_rows >= limit { + break 'outer; } } } @@ -418,8 +410,9 @@ fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { } println!( - "Loaded {} batches from {} file(s)", + "Loaded {} batches ({} rows) from {} file(s)", batches.len(), + format_number(total_rows), paths.len() ); batches From ca36cbd579d5c512e9617eb54cd052ec0d2c244d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 23 Mar 2026 06:39:47 -0700 Subject: [PATCH 04/18] chore: add comment explaining parquet/rand deps in shuffle crate --- native/shuffle/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/native/shuffle/Cargo.toml b/native/shuffle/Cargo.toml index 1a64a36ff2..1337ea3b15 100644 --- a/native/shuffle/Cargo.toml +++ b/native/shuffle/Cargo.toml @@ -43,6 +43,7 @@ itertools = "0.14.0" jni = "0.21" log = "0.4" lz4_flex = { version = "0.13.0", default-features = false, features = ["frame"] } +# parquet and rand are only used by the shuffle_bench binary (shuffle-bench feature) parquet = { workspace = true, optional = true } rand = { workspace = true, optional = true } simd-adler32 = "0.3.7" From 6e8bed28902e7256e1e1b3e914439e0a5abf3622 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 26 Mar 2026 07:13:41 -0700 Subject: [PATCH 05/18] perf: add max_buffered_batches config and stream shuffle bench from parquet - Add `spark.comet.exec.shuffle.maxBufferedBatches` config to limit the number of 
batches buffered before spilling, allowing earlier spilling to reduce peak memory usage on executors - Fix too-many-open-files: close spill file FD after each spill and reopen in append mode, rather than holding one FD open per partition - Refactor shuffle_bench to stream directly from Parquet instead of loading all input data into memory; remove synthetic data generation - Add --max-buffered-batches CLI arg to shuffle_bench - Add shuffle benchmark documentation to README --- .../scala/org/apache/comet/CometConf.scala | 12 + native/core/src/execution/planner.rs | 2 + native/proto/src/proto/operator.proto | 3 + native/shuffle/README.md | 43 ++ native/shuffle/benches/shuffle_writer.rs | 1 + native/shuffle/src/bin/shuffle_bench.rs | 432 +++++------------- .../src/partitioners/multi_partition.rs | 11 + native/shuffle/src/shuffle_writer.rs | 11 + .../shuffle/src/writers/partition_writer.rs | 29 +- .../shuffle/CometNativeShuffleWriter.scala | 2 + 10 files changed, 209 insertions(+), 337 deletions(-) diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index bfe90181ff..2ad9c1f609 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -534,6 +534,18 @@ object CometConf extends ShimCometConf { .checkValue(v => v > 0, "Write buffer size must be positive") .createWithDefault(1) + val COMET_SHUFFLE_MAX_BUFFERED_BATCHES: ConfigEntry[Int] = + conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.maxBufferedBatches") + .category(CATEGORY_SHUFFLE) + .doc("Maximum number of batches to buffer in memory before spilling to disk during " + + "native shuffle. Setting this to a small value causes earlier spilling, which reduces " + + "peak memory usage on executors at the cost of more disk I/O. 
" + + "The default value of 0 disables this limit and spills only when the memory pool is " + + "exhausted.") + .intConf + .checkValue(v => v >= 0, "Max buffered batches must be non-negative") + .createWithDefault(0) + val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( "spark.comet.shuffle.preferDictionary.ratio") .category(CATEGORY_SHUFFLE) diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 5af31fcc22..b6e05fccd1 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1352,6 +1352,7 @@ impl PhysicalPlanner { }?; let write_buffer_size = writer.write_buffer_size as usize; + let max_buffered_batches = writer.max_buffered_batches as usize; let shuffle_writer = Arc::new(ShuffleWriterExec::try_new( Arc::clone(&child.native_plan), partitioning, @@ -1360,6 +1361,7 @@ impl PhysicalPlanner { writer.output_index_file.clone(), writer.tracing_enabled, write_buffer_size, + max_buffered_batches, )?); Ok(( diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 344b9f0f21..5e23aad061 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -294,6 +294,9 @@ message ShuffleWriter { // Size of the write buffer in bytes used when writing shuffle data to disk. // Larger values may improve write performance but use more memory. int32 write_buffer_size = 8; + // Maximum number of batches to buffer before spilling to disk. + // 0 means no limit (spill only when memory pool is exhausted). + int32 max_buffered_batches = 9; } message ParquetWriter { diff --git a/native/shuffle/README.md b/native/shuffle/README.md index 8fba6b0323..0333ddbe8e 100644 --- a/native/shuffle/README.md +++ b/native/shuffle/README.md @@ -23,3 +23,46 @@ This crate provides the shuffle writer and reader implementation for Apache Data of the [Apache DataFusion Comet] subproject. 
[Apache DataFusion Comet]: https://github.com/apache/datafusion-comet/ + +## Shuffle Benchmark Tool + +A standalone benchmark binary (`shuffle_bench`) is included for profiling shuffle write and read +performance outside of Spark. It streams input data directly from Parquet files. + +### Basic usage + +```sh +cargo run --release --features shuffle-bench --bin shuffle_bench -- \ + --input /data/tpch-sf100/lineitem/ \ + --partitions 200 \ + --codec zstd --zstd-level 1 \ + --hash-columns 0,3 +``` + +### Options + +| Option | Default | Description | +|---|---|---| +| `--input` | *(required)* | Path to a Parquet file or directory of Parquet files | +| `--partitions` | `200` | Number of output shuffle partitions | +| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | +| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. `0,3`) | +| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | +| `--zstd-level` | `1` | Zstd compression level (1–22) | +| `--batch-size` | `8192` | Batch size for reading Parquet data | +| `--memory-limit` | *(none)* | Memory limit in bytes; triggers spilling when exceeded | +| `--max-buffered-batches` | `0` | Max batches to buffer before spilling (0 = memory-pool-only) | +| `--write-buffer-size` | `1048576` | Write buffer size in bytes | +| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | +| `--iterations` | `1` | Number of timed iterations | +| `--warmup` | `0` | Number of warmup iterations before timing | +| `--read-back` | `false` | Also benchmark reading back the shuffle output | +| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | + +### Profiling with flamegraph + +```sh +cargo flamegraph --release --features shuffle-bench --bin shuffle_bench -- \ + --input /data/tpch-sf100/lineitem/ \ + --partitions 200 --codec zstd --zstd-level 1 +``` diff --git a/native/shuffle/benches/shuffle_writer.rs 
b/native/shuffle/benches/shuffle_writer.rs index 27abd919fa..8ff1f024d5 100644 --- a/native/shuffle/benches/shuffle_writer.rs +++ b/native/shuffle/benches/shuffle_writer.rs @@ -153,6 +153,7 @@ fn create_shuffle_writer_exec( "/tmp/index.out".to_string(), false, 1024 * 1024, + 0, // max_buffered_batches: no limit ) .unwrap() } diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 373d0f92b9..0f04954ffa 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -16,11 +16,10 @@ // under the License. //! Standalone shuffle benchmark tool for profiling Comet shuffle write and read -//! outside of Spark. +//! outside of Spark. Streams input directly from Parquet files. //! //! # Usage //! -//! Read from Parquet files (e.g. TPC-H lineitem): //! ```sh //! cargo run --release --bin shuffle_bench -- \ //! --input /data/tpch-sf100/lineitem/ \ @@ -30,14 +29,6 @@ //! --read-back //! ``` //! -//! Generate synthetic data: -//! ```sh -//! cargo run --release --bin shuffle_bench -- \ -//! --generate --gen-rows 10000000 --gen-string-cols 4 --gen-int-cols 4 \ -//! --gen-decimal-cols 2 --gen-avg-string-len 32 \ -//! --partitions 200 --codec lz4 --read-back -//! ``` -//! //! Profile with flamegraph: //! ```sh //! cargo flamegraph --release --bin shuffle_bench -- \ @@ -45,23 +36,19 @@ //! --partitions 200 --codec zstd --zstd-level 1 //! 
``` -use arrow::array::builder::{Date32Builder, Decimal128Builder, Int64Builder, StringBuilder}; -use arrow::array::RecordBatch; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::datatypes::{DataType, SchemaRef}; use clap::Parser; -use datafusion::datasource::memory::MemorySourceConfig; -use datafusion::datasource::source::DataSourceExec; use datafusion::execution::config::SessionConfig; use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::physical_expr::expressions::Column; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::common::collect; use datafusion::physical_plan::ExecutionPlan; -use datafusion::prelude::SessionContext; +use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_comet_shuffle::{ read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleWriterExec, }; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; -use rand::RngExt; use std::fs; use std::path::PathBuf; use std::sync::Arc; @@ -75,37 +62,9 @@ use std::time::Instant; struct Args { /// Path to input Parquet file or directory of Parquet files #[arg(long)] - input: Option, - - /// Generate synthetic data instead of reading from Parquet - #[arg(long, default_value_t = false)] - generate: bool, - - /// Number of rows to generate (requires --generate) - #[arg(long, default_value_t = 1_000_000)] - gen_rows: usize, - - /// Number of Int64 columns to generate - #[arg(long, default_value_t = 4)] - gen_int_cols: usize, + input: PathBuf, - /// Number of Utf8 string columns to generate - #[arg(long, default_value_t = 2)] - gen_string_cols: usize, - - /// Number of Decimal128 columns to generate - #[arg(long, default_value_t = 2)] - gen_decimal_cols: usize, - - /// Number of Date32 columns to generate - #[arg(long, default_value_t = 1)] - gen_date_cols: usize, - - /// Average string length for generated string columns - #[arg(long, default_value_t = 24)] - 
gen_avg_string_len: usize, - - /// Batch size for reading Parquet or generating data + /// Batch size for reading Parquet data #[arg(long, default_value_t = 8192)] batch_size: usize, @@ -153,70 +112,37 @@ struct Args { #[arg(long, default_value_t = 1048576)] write_buffer_size: usize, - /// Maximum number of rows to use (default: 1,000,000) - #[arg(long, default_value_t = 1_000_000)] + /// Maximum number of batches to buffer before spilling (0 = no limit) + #[arg(long, default_value_t = 0)] + max_buffered_batches: usize, + + /// Limit rows processed per iteration (0 = no limit) + #[arg(long, default_value_t = 0)] limit: usize, } fn main() { let args = Args::parse(); - // Validate args - if args.input.is_none() && !args.generate { - eprintln!("Error: must specify either --input or --generate"); - std::process::exit(1); - } - // Create output directory fs::create_dir_all(&args.output_dir).expect("Failed to create output directory"); - let data_file = args.output_dir.join("data.out"); let index_file = args.output_dir.join("index.out"); - // Load data - let load_start = Instant::now(); - let batches = if let Some(ref input_path) = args.input { - load_parquet(input_path, args.batch_size, args.limit) - } else { - generate_data(&args) - }; - let load_elapsed = load_start.elapsed(); + let (schema, total_rows) = read_parquet_metadata(&args.input, args.limit); - let schema = batches[0].schema(); - let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); - let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); + let codec = parse_codec(&args.codec, args.zstd_level); + let hash_col_indices = parse_hash_columns(&args.hash_columns); println!("=== Shuffle Benchmark ==="); + println!("Input: {}", args.input.display()); println!( - "Data source: {}", - if args.input.is_some() { - "parquet" - } else { - "generated" - } - ); - println!( - "Schema: {} columns ({} fields)", + "Schema: {} columns ({})", schema.fields().len(), describe_schema(&schema) 
); - println!("Total rows: {}", format_number(total_rows)); - println!("Total size: {}", format_bytes(total_bytes)); - println!("Batches: {}", batches.len()); - println!( - "Rows/batch: ~{}", - if batches.is_empty() { - 0 - } else { - total_rows / batches.len() - } - ); - println!("Load time: {:.3}s", load_elapsed.as_secs_f64()); - println!(); - - let codec = parse_codec(&args.codec, args.zstd_level); - let hash_col_indices = parse_hash_columns(&args.hash_columns); - + println!("Total rows: {}", format_number(total_rows as usize)); + println!("Batch size: {}", format_number(args.batch_size)); println!("Partitioning: {}", args.partitioning); println!("Partitions: {}", args.partitions); println!("Codec: {:?}", codec); @@ -224,13 +150,15 @@ fn main() { if let Some(mem_limit) = args.memory_limit { println!("Memory limit: {}", format_bytes(mem_limit)); } + if args.max_buffered_batches > 0 { + println!("Max buf batches:{}", args.max_buffered_batches); + } println!( "Iterations: {} (warmup: {})", args.iterations, args.warmup ); println!(); - // Run warmup + timed iterations let total_iters = args.warmup + args.iterations; let mut write_times = Vec::with_capacity(args.iterations); let mut read_times = Vec::with_capacity(args.iterations); @@ -244,9 +172,8 @@ fn main() { format!("iter {}/{}", i - args.warmup + 1, args.iterations) }; - // Write phase let write_elapsed = run_shuffle_write( - &batches, + &args.input, &schema, &codec, &hash_col_indices, @@ -264,7 +191,6 @@ fn main() { print!(" [{label}] write: {:.3}s", write_elapsed); print!(" output: {}", format_bytes(data_size as usize)); - // Read phase if args.read_back { let read_elapsed = run_shuffle_read( data_file.to_str().unwrap(), @@ -279,7 +205,6 @@ fn main() { println!(); } - // Print summary if args.iterations > 0 { println!(); println!("=== Results ==="); @@ -287,12 +212,6 @@ fn main() { let avg_write = write_times.iter().sum::() / write_times.len() as f64; let avg_data_size = data_file_sizes.iter().sum::() / 
data_file_sizes.len() as u64; let write_throughput_rows = total_rows as f64 / avg_write; - let write_throughput_bytes = total_bytes as f64 / avg_write; - let compression_ratio = if avg_data_size > 0 { - total_bytes as f64 / avg_data_size as f64 - } else { - 0.0 - }; println!("Write:"); println!(" avg time: {:.3}s", avg_write); @@ -305,15 +224,13 @@ fn main() { println!(" min/max: {:.3}s / {:.3}s", min, max); } println!( - " throughput: {}/s ({} rows/s)", - format_bytes(write_throughput_bytes as usize), + " throughput: {} rows/s", format_number(write_throughput_rows as usize) ); println!( " output size: {}", format_bytes(avg_data_size as usize) ); - println!(" compression: {:.2}x", compression_ratio); if !read_times.is_empty() { let avg_read = read_times.iter().sum::() / read_times.len() as f64; @@ -333,38 +250,17 @@ fn main() { } } - // Cleanup let _ = fs::remove_file(&data_file); let _ = fs::remove_file(&index_file); } -fn load_parquet(path: &PathBuf, batch_size: usize, limit: usize) -> Vec { - let mut batches = Vec::new(); - let mut total_rows = 0usize; +/// Read schema and total row count from Parquet metadata without loading any data. 
+fn read_parquet_metadata(path: &PathBuf, limit: usize) -> (SchemaRef, u64) { + let paths = collect_parquet_paths(path); + let mut schema = None; + let mut total_rows = 0u64; - let paths = if path.is_dir() { - let mut files: Vec = fs::read_dir(path) - .expect("Failed to read input directory") - .filter_map(|entry| { - let entry = entry.ok()?; - let p = entry.path(); - if p.extension().and_then(|e| e.to_str()) == Some("parquet") { - Some(p) - } else { - None - } - }) - .collect(); - files.sort(); - if files.is_empty() { - panic!("No .parquet files found in {}", path.display()); - } - files - } else { - vec![path.clone()] - }; - - 'outer: for file_path in &paths { + for file_path in &paths { let file = fs::File::open(file_path) .unwrap_or_else(|e| panic!("Failed to open {}: {}", file_path.display(), e)); let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap_or_else(|e| { @@ -374,172 +270,44 @@ fn load_parquet(path: &PathBuf, batch_size: usize, limit: usize) -> Vec= limit { - break 'outer; - } + if schema.is_none() { + schema = Some(Arc::clone(builder.schema())); + } + total_rows += builder.metadata().file_metadata().num_rows() as u64; + if limit > 0 && total_rows >= limit as u64 { + total_rows = total_rows.min(limit as u64); + break; } } - if batches.is_empty() { - panic!("No data read from input"); - } - - println!( - "Loaded {} batches ({} rows) from {} file(s)", - batches.len(), - format_number(total_rows), - paths.len() - ); - batches + (schema.expect("No parquet files found"), total_rows) } -fn generate_data(args: &Args) -> Vec { - let mut fields = Vec::new(); - let mut col_idx = 0; - - // Int64 columns - for _ in 0..args.gen_int_cols { - fields.push(Field::new( - format!("int_col_{col_idx}"), - DataType::Int64, - true, - )); - col_idx += 1; - } - // String columns - for _ in 0..args.gen_string_cols { - fields.push(Field::new( - format!("str_col_{col_idx}"), - DataType::Utf8, - true, - )); - col_idx += 1; - } - // Decimal columns - for _ in 
0..args.gen_decimal_cols { - fields.push(Field::new( - format!("dec_col_{col_idx}"), - DataType::Decimal128(18, 2), - true, - )); - col_idx += 1; - } - // Date columns - for _ in 0..args.gen_date_cols { - fields.push(Field::new( - format!("date_col_{col_idx}"), - DataType::Date32, - true, - )); - col_idx += 1; - } - - let schema = Arc::new(Schema::new(fields)); - let mut batches = Vec::new(); - let mut rng = rand::rng(); - let mut remaining = args.gen_rows; - - while remaining > 0 { - let batch_rows = remaining.min(args.batch_size); - remaining -= batch_rows; - - let mut columns: Vec> = Vec::new(); - - // Int64 columns - for _ in 0..args.gen_int_cols { - let mut builder = Int64Builder::with_capacity(batch_rows); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - builder.append_null(); - } else { - builder.append_value(rng.random_range(0..1_000_000i64)); - } - } - columns.push(Arc::new(builder.finish())); - } - // String columns - for _ in 0..args.gen_string_cols { - let mut builder = - StringBuilder::with_capacity(batch_rows, batch_rows * args.gen_avg_string_len); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - builder.append_null(); - } else { - let len = rng.random_range(1..args.gen_avg_string_len * 2); - let s: String = (0..len) - .map(|_| rng.random_range(b'a'..=b'z') as char) - .collect(); - builder.append_value(&s); - } - } - columns.push(Arc::new(builder.finish())); - } - // Decimal columns - for _ in 0..args.gen_decimal_cols { - let mut builder = Decimal128Builder::with_capacity(batch_rows) - .with_precision_and_scale(18, 2) - .unwrap(); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - builder.append_null(); - } else { - builder.append_value(rng.random_range(0..100_000_000i128)); - } - } - columns.push(Arc::new(builder.finish())); - } - // Date columns - for _ in 0..args.gen_date_cols { - let mut builder = Date32Builder::with_capacity(batch_rows); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - 
builder.append_null(); +fn collect_parquet_paths(path: &PathBuf) -> Vec { + if path.is_dir() { + let mut files: Vec = fs::read_dir(path) + .unwrap_or_else(|e| panic!("Failed to read directory {}: {}", path.display(), e)) + .filter_map(|entry| { + let p = entry.ok()?.path(); + if p.extension().and_then(|e| e.to_str()) == Some("parquet") { + Some(p) } else { - builder.append_value(rng.random_range(0..20000i32)); + None } - } - columns.push(Arc::new(builder.finish())); + }) + .collect(); + files.sort(); + if files.is_empty() { + panic!("No .parquet files found in {}", path.display()); } - - let batch = RecordBatch::try_new(Arc::clone(&schema), columns).unwrap(); - batches.push(batch); + files + } else { + vec![path.clone()] } - - println!( - "Generated {} batches ({} rows)", - batches.len(), - args.gen_rows - ); - batches } fn run_shuffle_write( - batches: &[RecordBatch], + input_path: &PathBuf, schema: &SchemaRef, codec: &CompressionCodec, hash_col_indices: &[usize], @@ -554,40 +322,61 @@ fn run_shuffle_write( schema, ); - let partitions = &[batches.to_vec()]; - let exec = ShuffleWriterExec::try_new( - Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(partitions, Arc::clone(schema), None).unwrap(), - ))), - partitioning, - codec.clone(), - data_file.to_string(), - index_file.to_string(), - false, - args.write_buffer_size, - ) - .expect("Failed to create ShuffleWriterExec"); - - let config = SessionConfig::new().with_batch_size(args.batch_size); - let mut runtime_builder = RuntimeEnvBuilder::new(); - if let Some(mem_limit) = args.memory_limit { - runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); - } - let runtime_env = Arc::new(runtime_builder.build().unwrap()); - let ctx = SessionContext::new_with_config_rt(config, runtime_env); - let task_ctx = ctx.task_ctx(); - - let start = Instant::now(); - let stream = exec.execute(0, task_ctx).unwrap(); let rt = tokio::runtime::Runtime::new().unwrap(); - 
rt.block_on(collect(stream)).unwrap(); - start.elapsed().as_secs_f64() + rt.block_on(async { + let config = SessionConfig::new().with_batch_size(args.batch_size); + let mut runtime_builder = RuntimeEnvBuilder::new(); + if let Some(mem_limit) = args.memory_limit { + runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); + } + let runtime_env = Arc::new(runtime_builder.build().unwrap()); + let ctx = SessionContext::new_with_config_rt(config, runtime_env); + + let path_str = input_path.to_str().unwrap(); + let mut df = ctx + .read_parquet(path_str, ParquetReadOptions::default()) + .await + .expect("Failed to create Parquet scan"); + if args.limit > 0 { + df = df.limit(0, Some(args.limit)).unwrap(); + } + + let parquet_plan = df + .create_physical_plan() + .await + .expect("Failed to create physical plan"); + + // ShuffleWriterExec reads from a single input partition + let input: Arc = + if parquet_plan.properties().output_partitioning().partition_count() > 1 { + Arc::new(CoalescePartitionsExec::new(parquet_plan)) + } else { + parquet_plan + }; + + let exec = ShuffleWriterExec::try_new( + input, + partitioning, + codec.clone(), + data_file.to_string(), + index_file.to_string(), + false, + args.write_buffer_size, + args.max_buffered_batches, + ) + .expect("Failed to create ShuffleWriterExec"); + + let task_ctx = ctx.task_ctx(); + let start = Instant::now(); + let stream = exec.execute(0, task_ctx).unwrap(); + collect(stream).await.unwrap(); + start.elapsed().as_secs_f64() + }) } fn run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> f64 { let start = Instant::now(); - // Read index file to get partition offsets let index_bytes = fs::read(index_file).expect("Failed to read index file"); let num_offsets = index_bytes.len() / 8; let offsets: Vec = (0..num_offsets) @@ -597,34 +386,27 @@ fn run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> }) .collect(); - // Read data file let data_bytes = 
fs::read(data_file).expect("Failed to read data file"); let mut total_rows = 0usize; let mut total_batches = 0usize; - // Decode each partition's data for p in 0..num_partitions.min(offsets.len().saturating_sub(1)) { let start_offset = offsets[p] as usize; let end_offset = offsets[p + 1] as usize; if start_offset >= end_offset { - continue; // Empty partition + continue; } - // Read all IPC blocks within this partition let mut offset = start_offset; while offset < end_offset { - // First 8 bytes: IPC length let ipc_length = u64::from_le_bytes(data_bytes[offset..offset + 8].try_into().unwrap()) as usize; - - // Skip 8-byte length prefix, then 8 bytes of field_count + codec header let block_data = &data_bytes[offset + 16..offset + 8 + ipc_length]; let batch = read_ipc_compressed(block_data).expect("Failed to decode shuffle block"); total_rows += batch.num_rows(); total_batches += 1; - offset += 8 + ipc_length; } } @@ -686,7 +468,7 @@ fn parse_hash_columns(s: &str) -> Vec { .collect() } -fn describe_schema(schema: &Schema) -> String { +fn describe_schema(schema: &arrow::datatypes::Schema) -> String { let mut counts = std::collections::HashMap::new(); for field in schema.fields() { let type_name = match field.data_type() { diff --git a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 42290c5510..4cc09e9679 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -124,6 +124,8 @@ pub(crate) struct MultiPartitionShuffleRepartitioner { tracing_enabled: bool, /// Size of the write buffer in bytes write_buffer_size: usize, + /// Maximum number of batches to buffer before spilling (0 = no limit) + max_buffered_batches: usize, } impl MultiPartitionShuffleRepartitioner { @@ -140,6 +142,7 @@ impl MultiPartitionShuffleRepartitioner { codec: CompressionCodec, tracing_enabled: bool, write_buffer_size: usize, + max_buffered_batches: usize, ) -> 
datafusion::common::Result { let num_output_partitions = partitioning.partition_count(); assert_ne!( @@ -189,6 +192,7 @@ impl MultiPartitionShuffleRepartitioner { reservation, tracing_enabled, write_buffer_size, + max_buffered_batches, }) } @@ -397,6 +401,13 @@ impl MultiPartitionShuffleRepartitioner { partition_row_indices: &[u32], partition_starts: &[u32], ) -> datafusion::common::Result<()> { + // Spill before buffering if we've reached the configured batch count limit. + if self.max_buffered_batches > 0 + && self.buffered_batches.len() >= self.max_buffered_batches + { + self.spill()?; + } + let mut mem_growth: usize = input.get_array_memory_size(); let buffered_partition_idx = self.buffered_batches.len() as u32; self.buffered_batches.push(input); diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index e649aaac69..95a09610a4 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -68,6 +68,8 @@ pub struct ShuffleWriterExec { tracing_enabled: bool, /// Size of the write buffer in bytes write_buffer_size: usize, + /// Maximum number of batches to buffer before spilling (0 = no limit) + max_buffered_batches: usize, } impl ShuffleWriterExec { @@ -81,6 +83,7 @@ impl ShuffleWriterExec { output_index_file: String, tracing_enabled: bool, write_buffer_size: usize, + max_buffered_batches: usize, ) -> Result { let cache = PlanProperties::new( EquivalenceProperties::new(Arc::clone(&input.schema())), @@ -99,6 +102,7 @@ impl ShuffleWriterExec { codec, tracing_enabled, write_buffer_size, + max_buffered_batches, }) } } @@ -163,6 +167,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.output_index_file.clone(), self.tracing_enabled, self.write_buffer_size, + self.max_buffered_batches, )?)), _ => panic!("ShuffleWriterExec wrong number of children"), } @@ -190,6 +195,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.codec.clone(), self.tracing_enabled, self.write_buffer_size, + 
self.max_buffered_batches, ) .map_err(|e| ArrowError::ExternalError(Box::new(e))), ) @@ -210,6 +216,7 @@ async fn external_shuffle( codec: CompressionCodec, tracing_enabled: bool, write_buffer_size: usize, + max_buffered_batches: usize, ) -> Result { with_trace_async("external_shuffle", tracing_enabled, || async { let schema = input.schema(); @@ -238,6 +245,7 @@ async fn external_shuffle( codec, tracing_enabled, write_buffer_size, + max_buffered_batches, )?), }; @@ -362,6 +370,7 @@ mod test { CompressionCodec::Lz4Frame, false, 1024 * 1024, // write_buffer_size: 1MB default + 0, // max_buffered_batches: no limit ) .unwrap(); @@ -466,6 +475,7 @@ mod test { "/tmp/index.out".to_string(), false, 1024 * 1024, // write_buffer_size: 1MB default + 0, // max_buffered_batches: no limit ) .unwrap(); @@ -525,6 +535,7 @@ mod test { index_file.clone(), false, 1024 * 1024, + 0, // max_buffered_batches: no limit ) .unwrap(); diff --git a/native/shuffle/src/writers/partition_writer.rs b/native/shuffle/src/writers/partition_writer.rs index 48017871db..4de307de62 100644 --- a/native/shuffle/src/writers/partition_writer.rs +++ b/native/shuffle/src/writers/partition_writer.rs @@ -26,7 +26,6 @@ use std::fs::{File, OpenOptions}; struct SpillFile { temp_file: RefCountedTempFile, - file: File, } pub(crate) struct PartitionWriter { @@ -53,26 +52,28 @@ impl PartitionWriter { runtime: &RuntimeEnv, ) -> datafusion::common::Result<()> { if self.spill_file.is_none() { - // Spill file is not yet created, create it let spill_file = runtime .disk_manager .create_tmp_file("shuffle writer spill")?; - let spill_data = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(spill_file.path()) - .map_err(|e| { - DataFusionError::Execution(format!("Error occurred while spilling {e}")) - })?; + // Create the file (truncating any pre-existing content) + File::create(spill_file.path()).map_err(|e| { + DataFusionError::Execution(format!("Error occurred while spilling {e}")) + })?; 
self.spill_file = Some(SpillFile { temp_file: spill_file, - file: spill_data, }); } Ok(()) } + fn open_spill_file_for_append(&self) -> datafusion::common::Result { + OpenOptions::new() + .write(true) + .append(true) + .open(self.spill_file.as_ref().unwrap().temp_file.path()) + .map_err(|e| DataFusionError::Execution(format!("Error occurred while spilling {e}"))) + } + pub(crate) fn spill( &mut self, iter: &mut PartitionedBatchIterator, @@ -84,10 +85,13 @@ impl PartitionWriter { if let Some(batch) = iter.next() { self.ensure_spill_file_created(runtime)?; + // Open the file for this spill and close it when done, so we don't + // hold open one FD per partition across multiple spill events. + let mut spill_data = self.open_spill_file_for_append()?; let total_bytes_written = { let mut buf_batch_writer = BufBatchWriter::new( &mut self.shuffle_block_writer, - &mut self.spill_file.as_mut().unwrap().file, + &mut spill_data, write_buffer_size, batch_size, ); @@ -104,6 +108,7 @@ impl PartitionWriter { buf_batch_writer.flush(&metrics.encode_time, &metrics.write_time)?; bytes_written }; + // spill_data is dropped here, closing the file descriptor Ok(total_bytes_written) } else { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala index 3fc222bd19..a80d8b2fa4 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala @@ -192,6 +192,8 @@ class CometNativeShuffleWriter[K, V]( CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_ZSTD_LEVEL.get) shuffleWriterBuilder.setWriteBufferSize( CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().max(Int.MaxValue).toInt) + shuffleWriterBuilder.setMaxBufferedBatches( + CometConf.COMET_SHUFFLE_MAX_BUFFERED_BATCHES.get()) outputPartitioning match { case p 
if isSinglePartitioning(p) => From 2ef57e79d4e9202d4fb1a70f62f5d8b717ca67b4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Mar 2026 12:05:23 -0600 Subject: [PATCH 06/18] cargo fmt --- native/shuffle/src/bin/shuffle_bench.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 7f919e7674..ec1227a2ee 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -340,12 +340,16 @@ fn run_shuffle_write( .expect("Failed to create physical plan"); // ShuffleWriterExec reads from a single input partition - let input: Arc = - if parquet_plan.properties().output_partitioning().partition_count() > 1 { - Arc::new(CoalescePartitionsExec::new(parquet_plan)) - } else { - parquet_plan - }; + let input: Arc = if parquet_plan + .properties() + .output_partitioning() + .partition_count() + > 1 + { + Arc::new(CoalescePartitionsExec::new(parquet_plan)) + } else { + parquet_plan + }; let exec = ShuffleWriterExec::try_new( input, From 9136e104e87cae0c8920fcce790064de5716d425 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Mar 2026 12:08:20 -0600 Subject: [PATCH 07/18] prettier --- native/shuffle/README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/native/shuffle/README.md b/native/shuffle/README.md index 0333ddbe8e..74b8dbe656 100644 --- a/native/shuffle/README.md +++ b/native/shuffle/README.md @@ -41,23 +41,23 @@ cargo run --release --features shuffle-bench --bin shuffle_bench -- \ ### Options -| Option | Default | Description | -|---|---|---| -| `--input` | *(required)* | Path to a Parquet file or directory of Parquet files | -| `--partitions` | `200` | Number of output shuffle partitions | -| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | -| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. 
`0,3`) | -| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | -| `--zstd-level` | `1` | Zstd compression level (1–22) | -| `--batch-size` | `8192` | Batch size for reading Parquet data | -| `--memory-limit` | *(none)* | Memory limit in bytes; triggers spilling when exceeded | -| `--max-buffered-batches` | `0` | Max batches to buffer before spilling (0 = memory-pool-only) | -| `--write-buffer-size` | `1048576` | Write buffer size in bytes | -| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | -| `--iterations` | `1` | Number of timed iterations | -| `--warmup` | `0` | Number of warmup iterations before timing | -| `--read-back` | `false` | Also benchmark reading back the shuffle output | -| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | +| Option | Default | Description | +| ------------------------ | -------------------------- | ------------------------------------------------------------ | +| `--input` | _(required)_ | Path to a Parquet file or directory of Parquet files | +| `--partitions` | `200` | Number of output shuffle partitions | +| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | +| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. 
`0,3`) | +| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | +| `--zstd-level` | `1` | Zstd compression level (1–22) | +| `--batch-size` | `8192` | Batch size for reading Parquet data | +| `--memory-limit` | _(none)_ | Memory limit in bytes; triggers spilling when exceeded | +| `--max-buffered-batches` | `0` | Max batches to buffer before spilling (0 = memory-pool-only) | +| `--write-buffer-size` | `1048576` | Write buffer size in bytes | +| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | +| `--iterations` | `1` | Number of timed iterations | +| `--warmup` | `0` | Number of warmup iterations before timing | +| `--read-back` | `false` | Also benchmark reading back the shuffle output | +| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | ### Profiling with flamegraph From 7e16819013e744eecd548483fcf414798f071744 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Mar 2026 12:19:44 -0600 Subject: [PATCH 08/18] machete --- native/Cargo.lock | 1 - native/shuffle/Cargo.toml | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/native/Cargo.lock b/native/Cargo.lock index 88007af70a..2c117631a4 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1996,7 +1996,6 @@ dependencies = [ "log", "lz4_flex 0.13.0", "parquet", - "rand 0.10.0", "simd-adler32", "snap", "tempfile", diff --git a/native/shuffle/Cargo.toml b/native/shuffle/Cargo.toml index 14ed5f60a0..a5982c05fa 100644 --- a/native/shuffle/Cargo.toml +++ b/native/shuffle/Cargo.toml @@ -43,9 +43,8 @@ itertools = "0.14.0" jni = "0.21" log = "0.4" lz4_flex = { version = "0.13.0", default-features = false, features = ["frame"] } -# parquet and rand are only used by the shuffle_bench binary (shuffle-bench feature) +# parquet is only used by the shuffle_bench binary (shuffle-bench feature) parquet = { workspace = true, optional = true } -rand = { workspace = true, optional = true } simd-adler32 = "0.3.9" snap = 
"1.1" tokio = { version = "1", features = ["rt-multi-thread"] } @@ -58,7 +57,7 @@ itertools = "0.14.0" tempfile = "3.26.0" [features] -shuffle-bench = ["clap", "parquet", "rand"] +shuffle-bench = ["clap", "parquet"] [lib] name = "datafusion_comet_shuffle" From e7a3661a1107407c8fe3f3941b6539e48e82c67b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 12:32:12 -0600 Subject: [PATCH 09/18] feat: add immediate-mode shuffle partitioner Add ImmediateShufflePartitioner that repartitions each incoming batch using arrow take and writes per-partition data directly to individual temp files, avoiding in-memory buffering of input batches. At shuffle_write time, per-partition files are concatenated into the final shuffle data file and index. Extract ScratchSpace and partition ID computation (hash, range, round-robin) into shared scratch module, eliminating duplication between multi_partition and immediate_partition implementations. Configurable via spark.comet.exec.shuffle.nativeMode (default/immediate). 
--- .../scala/org/apache/comet/CometConf.scala | 14 + native/core/src/execution/planner.rs | 10 +- native/proto/src/proto/operator.proto | 8 + native/shuffle/benches/shuffle_writer.rs | 3 +- native/shuffle/src/lib.rs | 12 + .../src/partitioners/immediate_partition.rs | 278 ++++++++++++++++++ native/shuffle/src/partitioners/mod.rs | 3 + .../src/partitioners/multi_partition.rs | 272 ++--------------- native/shuffle/src/partitioners/scratch.rs | 183 ++++++++++++ native/shuffle/src/shuffle_writer.rs | 28 +- .../shuffle/CometNativeShuffleWriter.scala | 6 + 11 files changed, 557 insertions(+), 260 deletions(-) create mode 100644 native/shuffle/src/partitioners/immediate_partition.rs create mode 100644 native/shuffle/src/partitioners/scratch.rs diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index bfe90181ff..617f9131eb 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -534,6 +534,20 @@ object CometConf extends ShimCometConf { .checkValue(v => v > 0, "Write buffer size must be positive") .createWithDefault(1) + val COMET_NATIVE_SHUFFLE_MODE: ConfigEntry[String] = + conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.nativeMode") + .category(CATEGORY_SHUFFLE) + .doc( + "Selects which native shuffle implementation to use for multi-partition shuffles. " + + "'default' buffers input batches and tracks per-partition row indices, writing all " + + "partitions at the end with memory-pressure-driven spilling. " + + "'immediate' repartitions each incoming batch using take and writes per-partition " + + "data directly to individual files, avoiding in-memory buffering of input batches. 
" + + "The 'immediate' mode is experimental.") + .stringConf + .checkValues(Set("default", "immediate")) + .createWithDefault("default") + val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( "spark.comet.shuffle.preferDictionary.ratio") .category(CATEGORY_SHUFFLE) diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 5af31fcc22..7680fd37a1 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -29,7 +29,7 @@ use crate::execution::{ planner::expression_registry::ExpressionRegistry, planner::operator_registry::OperatorRegistry, serde::to_arrow_datatype, - shuffle::ShuffleWriterExec, + shuffle::{ShuffleMode, ShuffleWriterExec}, }; use arrow::compute::CastOptions; use arrow::datatypes::{DataType, Field, FieldRef, Schema, TimeUnit, DECIMAL128_MAX_PRECISION}; @@ -116,7 +116,8 @@ use datafusion_comet_proto::{ spark_operator::{ self, lower_window_frame_bound::LowerFrameBoundStruct, operator::OpStruct, upper_window_frame_bound::UpperFrameBoundStruct, BuildSide, - CompressionCodec as SparkCompressionCodec, JoinType, Operator, WindowFrameType, + CompressionCodec as SparkCompressionCodec, JoinType, Operator, + ShuffleMode as SparkShuffleMode, WindowFrameType, }, spark_partitioning::{partitioning::PartitioningStruct, Partitioning as SparkPartitioning}, }; @@ -1352,6 +1353,10 @@ impl PhysicalPlanner { }?; let write_buffer_size = writer.write_buffer_size as usize; + let shuffle_mode = match writer.shuffle_mode.try_into() { + Ok(SparkShuffleMode::ImmediateShuffle) => ShuffleMode::Immediate, + _ => ShuffleMode::Default, + }; let shuffle_writer = Arc::new(ShuffleWriterExec::try_new( Arc::clone(&child.native_plan), partitioning, @@ -1360,6 +1365,7 @@ impl PhysicalPlanner { writer.output_index_file.clone(), writer.tracing_enabled, write_buffer_size, + shuffle_mode, )?); Ok(( diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 
344b9f0f21..0ce5ae2b34 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -284,6 +284,12 @@ enum CompressionCodec { Snappy = 3; } +// Selects which shuffle implementation to use for multi-partition shuffles. +enum ShuffleMode { + DefaultShuffle = 0; + ImmediateShuffle = 1; +} + message ShuffleWriter { spark.spark_partitioning.Partitioning partitioning = 1; string output_data_file = 3; @@ -294,6 +300,8 @@ message ShuffleWriter { // Size of the write buffer in bytes used when writing shuffle data to disk. // Larger values may improve write performance but use more memory. int32 write_buffer_size = 8; + // Which shuffle implementation to use for multi-partition shuffles. + ShuffleMode shuffle_mode = 9; } message ParquetWriter { diff --git a/native/shuffle/benches/shuffle_writer.rs b/native/shuffle/benches/shuffle_writer.rs index 27abd919fa..0b05190aac 100644 --- a/native/shuffle/benches/shuffle_writer.rs +++ b/native/shuffle/benches/shuffle_writer.rs @@ -30,7 +30,7 @@ use datafusion::{ prelude::SessionContext, }; use datafusion_comet_shuffle::{ - CometPartitioning, CompressionCodec, ShuffleBlockWriter, ShuffleWriterExec, + CometPartitioning, CompressionCodec, ShuffleBlockWriter, ShuffleMode, ShuffleWriterExec, }; use itertools::Itertools; use std::io::Cursor; @@ -153,6 +153,7 @@ fn create_shuffle_writer_exec( "/tmp/index.out".to_string(), false, 1024 * 1024, + ShuffleMode::Default, ) .unwrap() } diff --git a/native/shuffle/src/lib.rs b/native/shuffle/src/lib.rs index f29588f2e1..4f85e2844b 100644 --- a/native/shuffle/src/lib.rs +++ b/native/shuffle/src/lib.rs @@ -27,3 +27,15 @@ pub use comet_partitioning::CometPartitioning; pub use ipc::read_ipc_compressed; pub use shuffle_writer::ShuffleWriterExec; pub use writers::{CompressionCodec, ShuffleBlockWriter}; + +/// Selects which shuffle implementation to use for multi-partition shuffles. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ShuffleMode { + /// Default: buffer input batches, track per-partition row indices, and write + /// all partitions at the end with memory-pressure-driven spilling. + Default, + /// Experimental: repartition each incoming batch immediately using `take` and + /// write per-partition data directly to individual spill files, avoiding + /// in-memory buffering of input batches. + Immediate, +} diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs new file mode 100644 index 0000000000..98c20c75f9 --- /dev/null +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -0,0 +1,278 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An immediate-mode shuffle partitioner that repartitions incoming batches and writes +//! directly to per-partition spill files. Unlike `MultiPartitionShuffleRepartitioner`, +//! this implementation does not buffer input batches in memory — it uses Arrow `take` +//! to extract per-partition slices and writes them immediately through per-partition +//! `BufBatchWriter`s. At `shuffle_write` time, the per-partition files are concatenated +//! 
into the final shuffle data file and index. + +use crate::metrics::ShufflePartitionerMetrics; +use crate::partitioners::scratch::ScratchSpace; +use crate::partitioners::ShufflePartitioner; +use crate::writers::BufBatchWriter; +use crate::{CometPartitioning, CompressionCodec, ShuffleBlockWriter}; +use arrow::array::{ArrayRef, RecordBatch, UInt32Array}; +use arrow::compute::take; +use arrow::datatypes::SchemaRef; +use datafusion::common::DataFusionError; +use datafusion::execution::disk_manager::RefCountedTempFile; +use datafusion::execution::runtime_env::RuntimeEnv; +use datafusion_comet_common::tracing::{with_trace, with_trace_async}; +use std::fmt; +use std::fmt::{Debug, Formatter}; +use std::fs::{File, OpenOptions}; +use std::io::{BufReader, BufWriter, Seek, Write}; +use std::sync::Arc; +use tokio::time::Instant; + +struct PartitionFileWriter { + temp_file: RefCountedTempFile, + writer: BufBatchWriter, +} + +/// An immediate-mode shuffle partitioner. Each incoming batch is repartitioned using +/// `arrow::compute::take` and the per-partition slices are written directly to +/// per-partition temporary files via `BufBatchWriter`. No input batches are retained +/// in memory beyond the current one. 
+pub(crate) struct ImmediateShufflePartitioner {
+    output_data_file: String,
+    output_index_file: String,
+    partition_writers: Vec<Option<PartitionFileWriter>>,
+    shuffle_block_writer: ShuffleBlockWriter,
+    partitioning: CometPartitioning,
+    runtime: Arc<RuntimeEnv>,
+    metrics: ShufflePartitionerMetrics,
+    scratch: ScratchSpace,
+    batch_size: usize,
+    tracing_enabled: bool,
+    write_buffer_size: usize,
+}
+
+impl ImmediateShufflePartitioner {
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn try_new(
+        output_data_file: String,
+        output_index_file: String,
+        schema: SchemaRef,
+        partitioning: CometPartitioning,
+        metrics: ShufflePartitionerMetrics,
+        runtime: Arc<RuntimeEnv>,
+        batch_size: usize,
+        codec: CompressionCodec,
+        tracing_enabled: bool,
+        write_buffer_size: usize,
+    ) -> datafusion::common::Result<Self> {
+        let num_output_partitions = partitioning.partition_count();
+        assert!(
+            num_output_partitions > 1,
+            "Use SinglePartitionShufflePartitioner for 1 output partition."
+        );
+
+        let scratch = ScratchSpace::new(&partitioning, batch_size, num_output_partitions);
+        let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec)?;
+        let partition_writers = (0..num_output_partitions).map(|_| None).collect();
+
+        Ok(Self {
+            output_data_file,
+            output_index_file,
+            partition_writers,
+            shuffle_block_writer,
+            partitioning,
+            runtime,
+            metrics,
+            scratch,
+            batch_size,
+            tracing_enabled,
+            write_buffer_size,
+        })
+    }
+
+    fn ensure_partition_writer(&mut self, partition_id: usize) -> datafusion::common::Result<()> {
+        if self.partition_writers[partition_id].is_none() {
+            let temp_file = self
+                .runtime
+                .disk_manager
+                .create_tmp_file(&format!("immediate_shuffle_partition_{partition_id}"))?;
+            let file = OpenOptions::new()
+                .write(true)
+                .create(true)
+                .truncate(true)
+                .open(temp_file.path())
+                .map_err(|e| {
+                    DataFusionError::Execution(format!("Error creating partition file: {e}"))
+                })?;
+            let writer = BufBatchWriter::new(
+                self.shuffle_block_writer.clone(),
+                file,
+                self.write_buffer_size,
+                self.batch_size,
+            );
+            self.partition_writers[partition_id] = Some(PartitionFileWriter { temp_file, writer });
+        }
+        Ok(())
+    }
+
+    fn partitioning_batch(&mut self, input: RecordBatch) -> datafusion::common::Result<()> {
+        if input.num_rows() == 0 {
+            return Ok(());
+        }
+
+        if input.num_rows() > self.batch_size {
+            return Err(DataFusionError::Internal(
+                "Input batch size exceeds configured batch size. Call `insert_batch` instead."
+                    .to_string(),
+            ));
+        }
+
+        self.metrics.data_size.add(input.get_array_memory_size());
+        self.metrics.baseline.record_output(input.num_rows());
+
+        let num_output_partitions = self.partitioning.partition_count();
+
+        let mut scratch = std::mem::take(&mut self.scratch);
+        {
+            let mut timer = self.metrics.repart_time.timer();
+            scratch.compute_partition_ids(&self.partitioning, &input)?;
+            timer.stop();
+        }
+
+        for partition_id in 0..num_output_partitions {
+            let start = scratch.partition_starts[partition_id] as usize;
+            let end = scratch.partition_starts[partition_id + 1] as usize;
+            if start == end {
+                continue;
+            }
+
+            let indices = UInt32Array::from_iter_values(
+                scratch.partition_row_indices[start..end].iter().copied(),
+            );
+            let columns: Vec<ArrayRef> = input
+                .columns()
+                .iter()
+                .map(|col| {
+                    take(col, &indices, None)
+                        .map_err(|e| DataFusionError::ArrowError(Box::from(e), None))
+                })
+                .collect::<datafusion::common::Result<Vec<_>>>()?;
+            let partition_batch = RecordBatch::try_new(input.schema(), columns)?;
+
+            self.ensure_partition_writer(partition_id)?;
+            let pw = self.partition_writers[partition_id].as_mut().unwrap();
+            pw.writer.write(
+                &partition_batch,
+                &self.metrics.encode_time,
+                &self.metrics.write_time,
+            )?;
+        }
+
+        self.scratch = scratch;
+        Ok(())
+    }
+}
+
+#[async_trait::async_trait]
+impl ShufflePartitioner for ImmediateShufflePartitioner {
+    async fn insert_batch(&mut self, batch: RecordBatch) -> datafusion::common::Result<()> {
+        with_trace_async(
+            "immediate_shuffle_insert_batch",
+            self.tracing_enabled,
+            || async {
+
let start_time = Instant::now(); + let mut start = 0; + while start < batch.num_rows() { + let end = (start + self.batch_size).min(batch.num_rows()); + let slice = batch.slice(start, end - start); + self.partitioning_batch(slice)?; + start = end; + } + self.metrics.input_batches.add(1); + self.metrics + .baseline + .elapsed_compute() + .add_duration(start_time.elapsed()); + Ok(()) + }, + ) + .await + } + + fn shuffle_write(&mut self) -> datafusion::common::Result<()> { + with_trace("immediate_shuffle_write", self.tracing_enabled, || { + let start_time = Instant::now(); + + let num_output_partitions = self.partition_writers.len(); + let mut offsets = vec![0u64; num_output_partitions + 1]; + + let output_data = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&self.output_data_file) + .map_err(|e| DataFusionError::Execution(format!("shuffle write error: {e:?}")))?; + + let mut output_data = BufWriter::new(output_data); + + for (partition_id, pw_slot) in self.partition_writers.iter_mut().enumerate() { + offsets[partition_id] = output_data.stream_position()?; + + if let Some(pw) = pw_slot { + pw.writer + .flush(&self.metrics.encode_time, &self.metrics.write_time)?; + + let mut spill_file = BufReader::new(File::open(pw.temp_file.path())?); + let mut write_timer = self.metrics.write_time.timer(); + std::io::copy(&mut spill_file, &mut output_data)?; + write_timer.stop(); + } + } + + let mut write_timer = self.metrics.write_time.timer(); + output_data.flush()?; + write_timer.stop(); + + offsets[num_output_partitions] = output_data.stream_position()?; + + let mut write_timer = self.metrics.write_time.timer(); + let mut output_index = + BufWriter::new(File::create(&self.output_index_file).map_err(|e| { + DataFusionError::Execution(format!("shuffle write error: {e:?}")) + })?); + for offset in offsets { + output_index.write_all(&(offset as i64).to_le_bytes()[..])?; + } + output_index.flush()?; + write_timer.stop(); + + self.metrics + .baseline + 
.elapsed_compute() + .add_duration(start_time.elapsed()); + + Ok(()) + }) + } +} + +impl Debug for ImmediateShufflePartitioner { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("ImmediateShufflePartitioner") + .field("num_partitions", &self.partition_writers.len()) + .finish() + } +} diff --git a/native/shuffle/src/partitioners/mod.rs b/native/shuffle/src/partitioners/mod.rs index 3eedef62c7..bb3d7b8c88 100644 --- a/native/shuffle/src/partitioners/mod.rs +++ b/native/shuffle/src/partitioners/mod.rs @@ -15,11 +15,14 @@ // specific language governing permissions and limitations // under the License. +mod immediate_partition; mod multi_partition; mod partitioned_batch_iterator; +pub(crate) mod scratch; mod single_partition; mod traits; +pub(crate) use immediate_partition::ImmediateShufflePartitioner; pub(crate) use multi_partition::MultiPartitionShuffleRepartitioner; pub(crate) use partitioned_batch_iterator::PartitionedBatchIterator; pub(crate) use single_partition::SinglePartitionShufflePartitioner; diff --git a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 655bee3511..8a5007142c 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -19,10 +19,11 @@ use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::partitioned_batch_iterator::{ PartitionedBatchIterator, PartitionedBatchesProducer, }; +use crate::partitioners::scratch::ScratchSpace; use crate::partitioners::ShufflePartitioner; use crate::writers::{BufBatchWriter, PartitionWriter}; -use crate::{comet_partitioning, CometPartitioning, CompressionCodec, ShuffleBlockWriter}; -use arrow::array::{ArrayRef, RecordBatch}; +use crate::{CometPartitioning, CompressionCodec, ShuffleBlockWriter}; +use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; use datafusion::common::utils::proxy::VecAllocExt; use datafusion::common::DataFusionError; @@ 
-30,7 +31,6 @@ use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::physical_plan::metrics::Time; use datafusion_comet_common::tracing::{with_trace, with_trace_async}; -use datafusion_comet_spark_expr::murmur3::create_murmur3_hashes; use itertools::Itertools; use std::fmt; use std::fmt::{Debug, Formatter}; @@ -39,71 +39,6 @@ use std::io::{BufReader, BufWriter, Seek, Write}; use std::sync::Arc; use tokio::time::Instant; -/// Reusable scratch buffers for computing row-to-partition assignments. -#[derive(Default)] -struct ScratchSpace { - /// Hashes for each row in the current batch. - hashes_buf: Vec, - /// Partition ids for each row in the current batch. - partition_ids: Vec, - /// The row indices of the rows in each partition. This array is conceptually divided into - /// partitions, where each partition contains the row indices of the rows in that partition. - /// The length of this array is the same as the number of rows in the batch. - partition_row_indices: Vec, - /// The start indices of partitions in partition_row_indices. partition_starts[K] and - /// partition_starts[K + 1] are the start and end indices of partition K in partition_row_indices. - /// The length of this array is 1 + the number of partitions. - partition_starts: Vec, -} - -impl ScratchSpace { - fn map_partition_ids_to_starts_and_indices( - &mut self, - num_output_partitions: usize, - num_rows: usize, - ) { - let partition_ids = &mut self.partition_ids[..num_rows]; - - // count each partition size, while leaving the last extra element as 0 - let partition_counters = &mut self.partition_starts; - partition_counters.resize(num_output_partitions + 1, 0); - partition_counters.fill(0); - partition_ids - .iter() - .for_each(|partition_id| partition_counters[*partition_id as usize] += 1); - - // accumulate partition counters into partition ends - // e.g. 
partition counter: [1, 3, 2, 1, 0] => [1, 4, 6, 7, 7] - let partition_ends = partition_counters; - let mut accum = 0; - partition_ends.iter_mut().for_each(|v| { - *v += accum; - accum = *v; - }); - - // calculate partition row indices and partition starts - // e.g. partition ids: [3, 1, 1, 1, 2, 2, 0] will produce the following partition_row_indices - // and partition_starts arrays: - // - // partition_row_indices: [6, 1, 2, 3, 4, 5, 0] - // partition_starts: [0, 1, 4, 6, 7] - // - // partition_starts conceptually splits partition_row_indices into smaller slices. - // Each slice partition_row_indices[partition_starts[K]..partition_starts[K + 1]] contains the - // row indices of the input batch that are partitioned into partition K. For example, - // first partition 0 has one row index [6], partition 1 has row indices [1, 2, 3], etc. - let partition_row_indices = &mut self.partition_row_indices; - partition_row_indices.resize(num_rows, 0); - for (index, partition_id) in partition_ids.iter().enumerate().rev() { - partition_ends[*partition_id as usize] -= 1; - let end = partition_ends[*partition_id as usize]; - partition_row_indices[end as usize] = index as u32; - } - - // after calculating, partition ends become partition starts - } -} - /// A partitioner that uses a hash function to partition data into multiple partitions pub(crate) struct MultiPartitionShuffleRepartitioner { output_data_file: String, @@ -148,22 +83,7 @@ impl MultiPartitionShuffleRepartitioner { "Use SinglePartitionShufflePartitioner for 1 output partition." ); - // Vectors in the scratch space will be filled with valid values before being used, this - // initialization code is simply initializing the vectors to the desired size. - // The initial values are not used. - let scratch = ScratchSpace { - hashes_buf: match partitioning { - // Allocate hashes_buf for hash and round robin partitioning. - // Round robin hashes all columns to achieve even, deterministic distribution. 
- CometPartitioning::Hash(_, _) | CometPartitioning::RoundRobin(_, _) => { - vec![0; batch_size] - } - _ => vec![], - }, - partition_ids: vec![0; batch_size], - partition_row_indices: vec![0; batch_size], - partition_starts: vec![0; num_output_partitions + 1], - }; + let scratch = ScratchSpace::new(&partitioning, batch_size, num_output_partitions); let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec.clone())?; @@ -217,178 +137,20 @@ impl MultiPartitionShuffleRepartitioner { // number of rows those are written to output data file. self.metrics.baseline.record_output(input.num_rows()); - match &self.partitioning { - CometPartitioning::Hash(exprs, num_output_partitions) => { - let mut scratch = std::mem::take(&mut self.scratch); - let (partition_starts, partition_row_indices): (&Vec, &Vec) = { - let mut timer = self.metrics.repart_time.timer(); - - // Evaluate partition expressions to get rows to apply partitioning scheme. - let arrays = exprs - .iter() - .map(|expr| expr.evaluate(&input)?.into_array(input.num_rows())) - .collect::>>()?; - - let num_rows = arrays[0].len(); - - // Use identical seed as Spark hash partitioning. - let hashes_buf = &mut scratch.hashes_buf[..num_rows]; - hashes_buf.fill(42_u32); - - // Generate partition ids for every row. - { - // Hash arrays and compute partition ids based on number of partitions. - let partition_ids = &mut scratch.partition_ids[..num_rows]; - create_murmur3_hashes(&arrays, hashes_buf)? - .iter() - .enumerate() - .for_each(|(idx, hash)| { - partition_ids[idx] = - comet_partitioning::pmod(*hash, *num_output_partitions) as u32; - }); - } - - // We now have partition ids for every input row, map that to partition starts - // and partition indices to eventually right these rows to partition buffers. 
- scratch - .map_partition_ids_to_starts_and_indices(*num_output_partitions, num_rows); - - timer.stop(); - Ok::<(&Vec, &Vec), DataFusionError>(( - &scratch.partition_starts, - &scratch.partition_row_indices, - )) - }?; - - self.buffer_partitioned_batch_may_spill( - input, - partition_row_indices, - partition_starts, - ) - .await?; - self.scratch = scratch; - } - CometPartitioning::RangePartitioning( - lex_ordering, - num_output_partitions, - row_converter, - bounds, - ) => { - let mut scratch = std::mem::take(&mut self.scratch); - let (partition_starts, partition_row_indices): (&Vec, &Vec) = { - let mut timer = self.metrics.repart_time.timer(); - - // Evaluate partition expressions for values to apply partitioning scheme on. - let arrays = lex_ordering - .iter() - .map(|expr| expr.expr.evaluate(&input)?.into_array(input.num_rows())) - .collect::>>()?; - - let num_rows = arrays[0].len(); - - // Generate partition ids for every row, first by converting the partition - // arrays to Rows, and then doing binary search for each Row against the - // bounds Rows. - { - let row_batch = row_converter.convert_columns(arrays.as_slice())?; - let partition_ids = &mut scratch.partition_ids[..num_rows]; - - row_batch.iter().enumerate().for_each(|(row_idx, row)| { - partition_ids[row_idx] = bounds - .as_slice() - .partition_point(|bound| bound.row() <= row) - as u32 - }); - } - - // We now have partition ids for every input row, map that to partition starts - // and partition indices to eventually right these rows to partition buffers. 
- scratch - .map_partition_ids_to_starts_and_indices(*num_output_partitions, num_rows); - - timer.stop(); - Ok::<(&Vec, &Vec), DataFusionError>(( - &scratch.partition_starts, - &scratch.partition_row_indices, - )) - }?; - - self.buffer_partitioned_batch_may_spill( - input, - partition_row_indices, - partition_starts, - ) - .await?; - self.scratch = scratch; - } - CometPartitioning::RoundRobin(num_output_partitions, max_hash_columns) => { - // Comet implements "round robin" as hash partitioning on columns. - // This achieves the same goal as Spark's round robin (even distribution - // without semantic grouping) while being deterministic for fault tolerance. - // - // Note: This produces different partition assignments than Spark's round robin, - // which sorts by UnsafeRow binary representation before assigning partitions. - // However, both approaches provide even distribution and determinism. - let mut scratch = std::mem::take(&mut self.scratch); - let (partition_starts, partition_row_indices): (&Vec, &Vec) = { - let mut timer = self.metrics.repart_time.timer(); - - let num_rows = input.num_rows(); - - // Collect columns for hashing, respecting max_hash_columns limit - // max_hash_columns of 0 means no limit (hash all columns) - // Negative values are normalized to 0 in the planner - let num_columns_to_hash = if *max_hash_columns == 0 { - input.num_columns() - } else { - (*max_hash_columns).min(input.num_columns()) - }; - let columns_to_hash: Vec = (0..num_columns_to_hash) - .map(|i| Arc::clone(input.column(i))) - .collect(); - - // Use identical seed as Spark hash partitioning. 
- let hashes_buf = &mut scratch.hashes_buf[..num_rows]; - hashes_buf.fill(42_u32); - - // Compute hash for selected columns - create_murmur3_hashes(&columns_to_hash, hashes_buf)?; - - // Assign partition IDs based on hash (same as hash partitioning) - let partition_ids = &mut scratch.partition_ids[..num_rows]; - hashes_buf.iter().enumerate().for_each(|(idx, hash)| { - partition_ids[idx] = - comet_partitioning::pmod(*hash, *num_output_partitions) as u32; - }); - - // We now have partition ids for every input row, map that to partition starts - // and partition indices to eventually write these rows to partition buffers. - scratch - .map_partition_ids_to_starts_and_indices(*num_output_partitions, num_rows); - - timer.stop(); - Ok::<(&Vec, &Vec), DataFusionError>(( - &scratch.partition_starts, - &scratch.partition_row_indices, - )) - }?; - - self.buffer_partitioned_batch_may_spill( - input, - partition_row_indices, - partition_starts, - ) - .await?; - self.scratch = scratch; - } - other => { - // this should be unreachable as long as the validation logic - // in the constructor is kept up-to-date - return Err(DataFusionError::NotImplemented(format!( - "Unsupported shuffle partitioning scheme {other:?}" - ))); - } + let mut scratch = std::mem::take(&mut self.scratch); + { + let mut timer = self.metrics.repart_time.timer(); + scratch.compute_partition_ids(&self.partitioning, &input)?; + timer.stop(); } + + self.buffer_partitioned_batch_may_spill( + input, + &scratch.partition_row_indices, + &scratch.partition_starts, + ) + .await?; + self.scratch = scratch; Ok(()) } diff --git a/native/shuffle/src/partitioners/scratch.rs b/native/shuffle/src/partitioners/scratch.rs new file mode 100644 index 0000000000..af0ede558f --- /dev/null +++ b/native/shuffle/src/partitioners/scratch.rs @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Shared scratch buffers and partition ID computation for shuffle partitioners.
+
+use crate::{comet_partitioning, CometPartitioning};
+use arrow::array::{ArrayRef, RecordBatch};
+use datafusion::common::DataFusionError;
+use datafusion_comet_spark_expr::murmur3::create_murmur3_hashes;
+use std::sync::Arc;
+
+/// Reusable scratch buffers for computing row-to-partition assignments.
+#[derive(Default)]
+pub(crate) struct ScratchSpace {
+    /// Hashes for each row in the current batch.
+    pub(crate) hashes_buf: Vec<u32>,
+    /// Partition ids for each row in the current batch.
+    pub(crate) partition_ids: Vec<u32>,
+    /// The row indices of the rows in each partition. This array is conceptually divided into
+    /// partitions, where each partition contains the row indices of the rows in that partition.
+    /// The length of this array is the same as the number of rows in the batch.
+    pub(crate) partition_row_indices: Vec<u32>,
+    /// The start indices of partitions in partition_row_indices. partition_starts[K] and
+    /// partition_starts[K + 1] are the start and end indices of partition K in partition_row_indices.
+    /// The length of this array is 1 + the number of partitions.
+    pub(crate) partition_starts: Vec<u32>,
+}
+
+impl ScratchSpace {
+    /// Create a new ScratchSpace with pre-allocated buffers for the given partitioning scheme.
+    pub(crate) fn new(
+        partitioning: &CometPartitioning,
+        batch_size: usize,
+        num_output_partitions: usize,
+    ) -> Self {
+        Self {
+            hashes_buf: match partitioning {
+                CometPartitioning::Hash(_, _) | CometPartitioning::RoundRobin(_, _) => {
+                    vec![0; batch_size]
+                }
+                _ => vec![],
+            },
+            partition_ids: vec![0; batch_size],
+            partition_row_indices: vec![0; batch_size],
+            partition_starts: vec![0; num_output_partitions + 1],
+        }
+    }
+
+    pub(crate) fn map_partition_ids_to_starts_and_indices(
+        &mut self,
+        num_output_partitions: usize,
+        num_rows: usize,
+    ) {
+        let partition_ids = &mut self.partition_ids[..num_rows];
+
+        // count each partition size, while leaving the last extra element as 0
+        let partition_counters = &mut self.partition_starts;
+        partition_counters.resize(num_output_partitions + 1, 0);
+        partition_counters.fill(0);
+        partition_ids
+            .iter()
+            .for_each(|partition_id| partition_counters[*partition_id as usize] += 1);
+
+        // accumulate partition counters into partition ends
+        // e.g. partition counter: [1, 3, 2, 1, 0] => [1, 4, 6, 7, 7]
+        let partition_ends = partition_counters;
+        let mut accum = 0;
+        partition_ends.iter_mut().for_each(|v| {
+            *v += accum;
+            accum = *v;
+        });
+
+        // calculate partition row indices and partition starts
+        // e.g. partition ids: [3, 1, 1, 1, 2, 2, 0] will produce the following partition_row_indices
+        // and partition_starts arrays:
+        //
+        // partition_row_indices: [6, 1, 2, 3, 4, 5, 0]
+        // partition_starts: [0, 1, 4, 6, 7]
+        //
+        // partition_starts conceptually splits partition_row_indices into smaller slices.
+        // Each slice partition_row_indices[partition_starts[K]..partition_starts[K + 1]] contains the
+        // row indices of the input batch that are partitioned into partition K.
For example, + // first partition 0 has one row index [6], partition 1 has row indices [1, 2, 3], etc. + let partition_row_indices = &mut self.partition_row_indices; + partition_row_indices.resize(num_rows, 0); + for (index, partition_id) in partition_ids.iter().enumerate().rev() { + partition_ends[*partition_id as usize] -= 1; + let end = partition_ends[*partition_id as usize]; + partition_row_indices[end as usize] = index as u32; + } + + // after calculating, partition ends become partition starts + } + + /// Compute partition IDs for the given batch and populate `partition_starts` and + /// `partition_row_indices`. Returns the number of rows processed. + pub(crate) fn compute_partition_ids( + &mut self, + partitioning: &CometPartitioning, + input: &RecordBatch, + ) -> datafusion::common::Result<()> { + match partitioning { + CometPartitioning::Hash(exprs, num_partitions) => { + let arrays = exprs + .iter() + .map(|expr| expr.evaluate(input)?.into_array(input.num_rows())) + .collect::>>()?; + let num_rows = arrays[0].len(); + let hashes_buf = &mut self.hashes_buf[..num_rows]; + hashes_buf.fill(42_u32); + let partition_ids = &mut self.partition_ids[..num_rows]; + create_murmur3_hashes(&arrays, hashes_buf)? 
+ .iter() + .enumerate() + .for_each(|(idx, hash)| { + partition_ids[idx] = + comet_partitioning::pmod(*hash, *num_partitions) as u32; + }); + self.map_partition_ids_to_starts_and_indices(*num_partitions, num_rows); + } + CometPartitioning::RangePartitioning( + lex_ordering, + num_partitions, + row_converter, + bounds, + ) => { + let arrays = lex_ordering + .iter() + .map(|expr| expr.expr.evaluate(input)?.into_array(input.num_rows())) + .collect::>>()?; + let num_rows = arrays[0].len(); + let row_batch = row_converter.convert_columns(arrays.as_slice())?; + let partition_ids = &mut self.partition_ids[..num_rows]; + row_batch.iter().enumerate().for_each(|(row_idx, row)| { + partition_ids[row_idx] = bounds + .as_slice() + .partition_point(|bound| bound.row() <= row) + as u32 + }); + self.map_partition_ids_to_starts_and_indices(*num_partitions, num_rows); + } + CometPartitioning::RoundRobin(num_partitions, max_hash_columns) => { + let num_rows = input.num_rows(); + let num_columns_to_hash = if *max_hash_columns == 0 { + input.num_columns() + } else { + (*max_hash_columns).min(input.num_columns()) + }; + let columns_to_hash: Vec = (0..num_columns_to_hash) + .map(|i| Arc::clone(input.column(i))) + .collect(); + let hashes_buf = &mut self.hashes_buf[..num_rows]; + hashes_buf.fill(42_u32); + create_murmur3_hashes(&columns_to_hash, hashes_buf)?; + let partition_ids = &mut self.partition_ids[..num_rows]; + hashes_buf.iter().enumerate().for_each(|(idx, hash)| { + partition_ids[idx] = comet_partitioning::pmod(*hash, *num_partitions) as u32; + }); + self.map_partition_ids_to_starts_and_indices(*num_partitions, num_rows); + } + other => { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported shuffle partitioning scheme {other:?}" + ))); + } + } + Ok(()) + } +} diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index e649aaac69..9067777e3c 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ 
b/native/shuffle/src/shuffle_writer.rs @@ -19,9 +19,10 @@ use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::{ - MultiPartitionShuffleRepartitioner, ShufflePartitioner, SinglePartitionShufflePartitioner, + ImmediateShufflePartitioner, MultiPartitionShuffleRepartitioner, ShufflePartitioner, + SinglePartitionShufflePartitioner, }; -use crate::{CometPartitioning, CompressionCodec}; +use crate::{CometPartitioning, CompressionCodec, ShuffleMode}; use async_trait::async_trait; use datafusion::common::exec_datafusion_err; use datafusion::physical_expr::{EquivalenceProperties, Partitioning}; @@ -68,6 +69,8 @@ pub struct ShuffleWriterExec { tracing_enabled: bool, /// Size of the write buffer in bytes write_buffer_size: usize, + /// Which shuffle implementation to use for multi-partition shuffles + shuffle_mode: ShuffleMode, } impl ShuffleWriterExec { @@ -81,6 +84,7 @@ impl ShuffleWriterExec { output_index_file: String, tracing_enabled: bool, write_buffer_size: usize, + shuffle_mode: ShuffleMode, ) -> Result { let cache = PlanProperties::new( EquivalenceProperties::new(Arc::clone(&input.schema())), @@ -99,6 +103,7 @@ impl ShuffleWriterExec { codec, tracing_enabled, write_buffer_size, + shuffle_mode, }) } } @@ -163,6 +168,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.output_index_file.clone(), self.tracing_enabled, self.write_buffer_size, + self.shuffle_mode, )?)), _ => panic!("ShuffleWriterExec wrong number of children"), } @@ -190,6 +196,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.codec.clone(), self.tracing_enabled, self.write_buffer_size, + self.shuffle_mode, ) .map_err(|e| ArrowError::ExternalError(Box::new(e))), ) @@ -210,6 +217,7 @@ async fn external_shuffle( codec: CompressionCodec, tracing_enabled: bool, write_buffer_size: usize, + shuffle_mode: ShuffleMode, ) -> Result { with_trace_async("external_shuffle", tracing_enabled, || async { let schema = input.schema(); @@ -226,6 +234,20 @@ async fn external_shuffle( write_buffer_size, 
)?) } + _ if shuffle_mode == ShuffleMode::Immediate => { + Box::new(ImmediateShufflePartitioner::try_new( + output_data_file, + output_index_file, + Arc::clone(&schema), + partitioning, + metrics, + context.runtime_env(), + context.session_config().batch_size(), + codec, + tracing_enabled, + write_buffer_size, + )?) + } _ => Box::new(MultiPartitionShuffleRepartitioner::try_new( partition, output_data_file, @@ -466,6 +488,7 @@ mod test { "/tmp/index.out".to_string(), false, 1024 * 1024, // write_buffer_size: 1MB default + ShuffleMode::Default, ) .unwrap(); @@ -525,6 +548,7 @@ mod test { index_file.clone(), false, 1024 * 1024, + ShuffleMode::Default, ) .unwrap(); diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala index 3fc222bd19..d53d5c965c 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala @@ -193,6 +193,12 @@ class CometNativeShuffleWriter[K, V]( shuffleWriterBuilder.setWriteBufferSize( CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().max(Int.MaxValue).toInt) + val shuffleMode = CometConf.COMET_NATIVE_SHUFFLE_MODE.get() match { + case "immediate" => OperatorOuterClass.ShuffleMode.ImmediateShuffle + case _ => OperatorOuterClass.ShuffleMode.DefaultShuffle + } + shuffleWriterBuilder.setShuffleMode(shuffleMode) + outputPartitioning match { case p if isSinglePartitioning(p) => val partitioning = PartitioningOuterClass.SinglePartition.newBuilder() From 4e6c02638e74f2e3a4219a940740d6ae185df54f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 13:16:42 -0600 Subject: [PATCH 10/18] perf: optimize immediate shuffle with single-take-then-slice Replace per-partition take calls (P*C = 3200 for 200 partitions, 16 columns) with a single take per 
column to reorder the entire batch, then zero-copy RecordBatch::slice per partition. Also remove BufReader wrapper in shuffle_write copy loop to enable kernel zero-copy on Linux. Add --shuffle-mode flag to shuffle_bench binary. Benchmark (10M rows, 200 partitions, hash on cols 0,3, zstd): default: 4.804s avg (2.08M rows/s) immediate: 4.185s avg (2.39M rows/s) -- 13% faster --- native/shuffle/src/bin/shuffle_bench.rs | 13 ++++++- .../src/partitioners/immediate_partition.rs | 37 +++++++++++-------- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index ec1227a2ee..89be7c3bb8 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -46,7 +46,7 @@ use datafusion::physical_plan::common::collect; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_comet_shuffle::{ - read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleWriterExec, + read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleMode, ShuffleWriterExec, }; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use std::fs; @@ -115,6 +115,10 @@ struct Args { /// Limit rows processed per iteration (0 = no limit) #[arg(long, default_value_t = 0)] limit: usize, + + /// Shuffle mode: default or immediate + #[arg(long, default_value = "default")] + shuffle_mode: String, } fn main() { @@ -143,6 +147,7 @@ fn main() { println!("Partitions: {}", args.partitions); println!("Codec: {:?}", codec); println!("Hash columns: {:?}", hash_col_indices); + println!("Shuffle mode: {}", args.shuffle_mode); if let Some(mem_limit) = args.memory_limit { println!("Memory limit: {}", format_bytes(mem_limit)); } @@ -351,6 +356,11 @@ fn run_shuffle_write( parquet_plan }; + let shuffle_mode = match args.shuffle_mode.as_str() { + "immediate" => ShuffleMode::Immediate, + _ => ShuffleMode::Default, + }; 
+ let exec = ShuffleWriterExec::try_new( input, partitioning, @@ -359,6 +369,7 @@ fn run_shuffle_write( index_file.to_string(), false, args.write_buffer_size, + shuffle_mode, ) .expect("Failed to create ShuffleWriterExec"); diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index 98c20c75f9..65fd746f0b 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -37,7 +37,7 @@ use datafusion_comet_common::tracing::{with_trace, with_trace_async}; use std::fmt; use std::fmt::{Debug, Formatter}; use std::fs::{File, OpenOptions}; -use std::io::{BufReader, BufWriter, Seek, Write}; +use std::io::{BufWriter, Seek, Write}; use std::sync::Arc; use tokio::time::Instant; @@ -152,6 +152,22 @@ impl ImmediateShufflePartitioner { timer.stop(); } + // Single take per column to reorder entire batch by partition assignment, + // then zero-copy slice per partition. This replaces P*C take calls with just C. 
+ let num_rows = input.num_rows(); + let all_indices = UInt32Array::from_iter_values( + scratch.partition_row_indices[..num_rows].iter().copied(), + ); + let sorted_columns: Vec = input + .columns() + .iter() + .map(|col| { + take(col, &all_indices, None) + .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) + }) + .collect::>>()?; + let sorted_batch = RecordBatch::try_new(input.schema(), sorted_columns)?; + for partition_id in 0..num_output_partitions { let start = scratch.partition_starts[partition_id] as usize; let end = scratch.partition_starts[partition_id + 1] as usize; @@ -159,18 +175,7 @@ impl ImmediateShufflePartitioner { continue; } - let indices = UInt32Array::from_iter_values( - scratch.partition_row_indices[start..end].iter().copied(), - ); - let columns: Vec = input - .columns() - .iter() - .map(|col| { - take(col, &indices, None) - .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) - }) - .collect::>>()?; - let partition_batch = RecordBatch::try_new(input.schema(), columns)?; + let partition_batch = sorted_batch.slice(start, end - start); self.ensure_partition_writer(partition_id)?; let pw = self.partition_writers[partition_id].as_mut().unwrap(); @@ -231,11 +236,13 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { for (partition_id, pw_slot) in self.partition_writers.iter_mut().enumerate() { offsets[partition_id] = output_data.stream_position()?; - if let Some(pw) = pw_slot { + if let Some(mut pw) = pw_slot.take() { pw.writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; - let mut spill_file = BufReader::new(File::open(pw.temp_file.path())?); + // Reopen for read without BufReader to allow kernel zero-copy + // (copy_file_range/sendfile) on supported platforms + let mut spill_file = File::open(pw.temp_file.path())?; let mut write_timer = self.metrics.write_time.timer(); std::io::copy(&mut spill_file, &mut output_data)?; write_timer.stop(); From 97b0fe065a3a23e68b6d24e8bb8b5cc7599a4fee Mon Sep 17 
00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 13:17:52 -0600 Subject: [PATCH 11/18] feat: make immediate shuffle mode the default --- common/src/main/scala/org/apache/comet/CometConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index 617f9131eb..3b50ccad1a 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -546,7 +546,7 @@ object CometConf extends ShimCometConf { "The 'immediate' mode is experimental.") .stringConf .checkValues(Set("default", "immediate")) - .createWithDefault("default") + .createWithDefault("immediate") val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( "spark.comet.shuffle.preferDictionary.ratio") From 7ccda152eeefe936d6347bd0500676ff6aa58623 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 13:21:10 -0600 Subject: [PATCH 12/18] refactor: rename Default shuffle mode to Buffered --- common/src/main/scala/org/apache/comet/CometConf.scala | 7 +++---- native/core/src/execution/planner.rs | 4 ++-- native/proto/src/proto/operator.proto | 2 +- native/shuffle/benches/shuffle_writer.rs | 2 +- native/shuffle/src/bin/shuffle_bench.rs | 8 ++++---- native/shuffle/src/lib.rs | 4 ++-- native/shuffle/src/shuffle_writer.rs | 4 ++-- .../execution/shuffle/CometNativeShuffleWriter.scala | 4 ++-- 8 files changed, 17 insertions(+), 18 deletions(-) diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index 3b50ccad1a..908c5638c1 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -539,13 +539,12 @@ object CometConf extends ShimCometConf { .category(CATEGORY_SHUFFLE) .doc( "Selects which native shuffle implementation to use for multi-partition shuffles. 
" + - "'default' buffers input batches and tracks per-partition row indices, writing all " + + "'buffered' buffers input batches and tracks per-partition row indices, writing all " + "partitions at the end with memory-pressure-driven spilling. " + "'immediate' repartitions each incoming batch using take and writes per-partition " + - "data directly to individual files, avoiding in-memory buffering of input batches. " + - "The 'immediate' mode is experimental.") + "data directly to individual files, avoiding in-memory buffering of input batches.") .stringConf - .checkValues(Set("default", "immediate")) + .checkValues(Set("buffered", "immediate")) .createWithDefault("immediate") val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 7680fd37a1..7b9bf5a3e5 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1354,8 +1354,8 @@ impl PhysicalPlanner { let write_buffer_size = writer.write_buffer_size as usize; let shuffle_mode = match writer.shuffle_mode.try_into() { - Ok(SparkShuffleMode::ImmediateShuffle) => ShuffleMode::Immediate, - _ => ShuffleMode::Default, + Ok(SparkShuffleMode::BufferedShuffle) => ShuffleMode::Buffered, + _ => ShuffleMode::Immediate, }; let shuffle_writer = Arc::new(ShuffleWriterExec::try_new( Arc::clone(&child.native_plan), diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 0ce5ae2b34..7f6d947315 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -286,7 +286,7 @@ enum CompressionCodec { // Selects which shuffle implementation to use for multi-partition shuffles. 
enum ShuffleMode { - DefaultShuffle = 0; + BufferedShuffle = 0; ImmediateShuffle = 1; } diff --git a/native/shuffle/benches/shuffle_writer.rs b/native/shuffle/benches/shuffle_writer.rs index 0b05190aac..fcb2e592f1 100644 --- a/native/shuffle/benches/shuffle_writer.rs +++ b/native/shuffle/benches/shuffle_writer.rs @@ -153,7 +153,7 @@ fn create_shuffle_writer_exec( "/tmp/index.out".to_string(), false, 1024 * 1024, - ShuffleMode::Default, + ShuffleMode::Buffered, ) .unwrap() } diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 89be7c3bb8..4dba0d8c41 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -116,8 +116,8 @@ struct Args { #[arg(long, default_value_t = 0)] limit: usize, - /// Shuffle mode: default or immediate - #[arg(long, default_value = "default")] + /// Shuffle mode: buffered or immediate + #[arg(long, default_value = "immediate")] shuffle_mode: String, } @@ -357,8 +357,8 @@ fn run_shuffle_write( }; let shuffle_mode = match args.shuffle_mode.as_str() { - "immediate" => ShuffleMode::Immediate, - _ => ShuffleMode::Default, + "buffered" => ShuffleMode::Buffered, + _ => ShuffleMode::Immediate, }; let exec = ShuffleWriterExec::try_new( diff --git a/native/shuffle/src/lib.rs b/native/shuffle/src/lib.rs index 4f85e2844b..fb961752f6 100644 --- a/native/shuffle/src/lib.rs +++ b/native/shuffle/src/lib.rs @@ -31,9 +31,9 @@ pub use writers::{CompressionCodec, ShuffleBlockWriter}; /// Selects which shuffle implementation to use for multi-partition shuffles. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ShuffleMode { - /// Default: buffer input batches, track per-partition row indices, and write + /// Buffered: buffer input batches, track per-partition row indices, and write /// all partitions at the end with memory-pressure-driven spilling. 
- Default, + Buffered, /// Experimental: repartition each incoming batch immediately using `take` and /// write per-partition data directly to individual spill files, avoiding /// in-memory buffering of input batches. diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index 9067777e3c..bf7dc33e20 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -488,7 +488,7 @@ mod test { "/tmp/index.out".to_string(), false, 1024 * 1024, // write_buffer_size: 1MB default - ShuffleMode::Default, + ShuffleMode::Buffered, ) .unwrap(); @@ -548,7 +548,7 @@ mod test { index_file.clone(), false, 1024 * 1024, - ShuffleMode::Default, + ShuffleMode::Buffered, ) .unwrap(); diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala index d53d5c965c..13c34bd10c 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala @@ -194,8 +194,8 @@ class CometNativeShuffleWriter[K, V]( CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().max(Int.MaxValue).toInt) val shuffleMode = CometConf.COMET_NATIVE_SHUFFLE_MODE.get() match { - case "immediate" => OperatorOuterClass.ShuffleMode.ImmediateShuffle - case _ => OperatorOuterClass.ShuffleMode.DefaultShuffle + case "buffered" => OperatorOuterClass.ShuffleMode.BufferedShuffle + case _ => OperatorOuterClass.ShuffleMode.ImmediateShuffle } shuffleWriterBuilder.setShuffleMode(shuffleMode) From fb426cfe08e93aa119a34067c5b5a4bbf0d7e3dc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 13:56:48 -0600 Subject: [PATCH 13/18] perf: replace per-partition temp files with in-memory buffers Replace per-partition temp files in ImmediateShufflePartitioner with Vec buffers holding 
compressed IPC blocks. Eliminates 200 temp file creates/writes/reads per task and the double-write overhead that caused 2x slowdown on clusters with concurrent tasks. Add --concurrent-tasks flag to shuffle_bench for realistic executor simulation. Benchmark (10M rows, 200 partitions, hash on cols 0,3, zstd): Single task: buffered 4.78s vs immediate 3.97s (-17%) 8 concurrent tasks: buffered 6.75s vs immediate 5.94s (-12%) --- native/shuffle/src/bin/shuffle_bench.rs | 176 ++++++++++++++++-- .../src/partitioners/immediate_partition.rs | 83 ++++----- native/shuffle/src/shuffle_writer.rs | 1 - .../shuffle/src/writers/buf_batch_writer.rs | 7 + 4 files changed, 197 insertions(+), 70 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 4dba0d8c41..67dbb9fcc9 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -119,6 +119,11 @@ struct Args { /// Shuffle mode: buffered or immediate #[arg(long, default_value = "immediate")] shuffle_mode: String, + + /// Number of concurrent shuffle tasks to simulate executor parallelism. + /// Each task reads the same input and writes to its own output files. 
+ #[arg(long, default_value_t = 1)] + concurrent_tasks: usize, } fn main() { @@ -151,6 +156,9 @@ fn main() { if let Some(mem_limit) = args.memory_limit { println!("Memory limit: {}", format_bytes(mem_limit)); } + if args.concurrent_tasks > 1 { + println!("Concurrent: {} tasks", args.concurrent_tasks); + } println!( "Iterations: {} (warmup: {})", args.iterations, args.warmup @@ -170,15 +178,25 @@ fn main() { format!("iter {}/{}", i - args.warmup + 1, args.iterations) }; - let write_elapsed = run_shuffle_write( - &args.input, - &schema, - &codec, - &hash_col_indices, - &args, - data_file.to_str().unwrap(), - index_file.to_str().unwrap(), - ); + let write_elapsed = if args.concurrent_tasks > 1 { + run_concurrent_shuffle_writes( + &args.input, + &schema, + &codec, + &hash_col_indices, + &args, + ) + } else { + run_shuffle_write( + &args.input, + &schema, + &codec, + &hash_col_indices, + &args, + data_file.to_str().unwrap(), + index_file.to_str().unwrap(), + ) + }; let data_size = fs::metadata(&data_file).map(|m| m.len()).unwrap_or(0); if !is_warmup { @@ -187,9 +205,11 @@ fn main() { } print!(" [{label}] write: {:.3}s", write_elapsed); - print!(" output: {}", format_bytes(data_size as usize)); + if args.concurrent_tasks <= 1 { + print!(" output: {}", format_bytes(data_size as usize)); + } - if args.read_back { + if args.read_back && args.concurrent_tasks <= 1 { let read_elapsed = run_shuffle_read( data_file.to_str().unwrap(), index_file.to_str().unwrap(), @@ -208,8 +228,8 @@ fn main() { println!("=== Results ==="); let avg_write = write_times.iter().sum::() / write_times.len() as f64; - let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; - let write_throughput_rows = total_rows as f64 / avg_write; + let write_throughput_rows = + (total_rows as f64 * args.concurrent_tasks as f64) / avg_write; println!("Write:"); println!(" avg time: {:.3}s", avg_write); @@ -222,15 +242,22 @@ fn main() { println!(" min/max: {:.3}s / {:.3}s", min, max); } 
println!( - " throughput: {} rows/s", - format_number(write_throughput_rows as usize) - ); - println!( - " output size: {}", - format_bytes(avg_data_size as usize) + " throughput: {} rows/s (total across {} tasks)", + format_number(write_throughput_rows as usize), + args.concurrent_tasks ); + if args.concurrent_tasks <= 1 { + let avg_data_size = + data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; + println!( + " output size: {}", + format_bytes(avg_data_size as usize) + ); + } if !read_times.is_empty() { + let avg_data_size = + data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; let avg_read = read_times.iter().sum::() / read_times.len() as f64; let read_throughput_bytes = avg_data_size as f64 / avg_read; @@ -381,6 +408,117 @@ fn run_shuffle_write( }) } +/// Run N concurrent shuffle writes to simulate executor parallelism. +/// Returns wall-clock time for all tasks to complete. +fn run_concurrent_shuffle_writes( + input_path: &PathBuf, + schema: &SchemaRef, + codec: &CompressionCodec, + hash_col_indices: &[usize], + args: &Args, +) -> f64 { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let start = Instant::now(); + + let mut handles = Vec::with_capacity(args.concurrent_tasks); + for task_id in 0..args.concurrent_tasks { + let task_dir = args.output_dir.join(format!("task_{task_id}")); + fs::create_dir_all(&task_dir).expect("Failed to create task output directory"); + let data_file = task_dir.join("data.out").to_str().unwrap().to_string(); + let index_file = task_dir.join("index.out").to_str().unwrap().to_string(); + + let input_path = input_path.clone(); + let schema = Arc::clone(schema); + let codec = codec.clone(); + let hash_col_indices = hash_col_indices.to_vec(); + let partitioning_scheme = args.partitioning.clone(); + let num_partitions = args.partitions; + let batch_size = args.batch_size; + let memory_limit = args.memory_limit; + let write_buffer_size = args.write_buffer_size; + let shuffle_mode_str = 
args.shuffle_mode.clone(); + let limit = args.limit; + + let handle = tokio::spawn(async move { + let config = SessionConfig::new().with_batch_size(batch_size); + let mut runtime_builder = RuntimeEnvBuilder::new(); + if let Some(mem_limit) = memory_limit { + runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); + } + let runtime_env = Arc::new(runtime_builder.build().unwrap()); + let ctx = SessionContext::new_with_config_rt(config, runtime_env); + + let path_str = input_path.to_str().unwrap().to_string(); + let mut df = ctx + .read_parquet(&path_str, ParquetReadOptions::default()) + .await + .expect("Failed to create Parquet scan"); + if limit > 0 { + df = df.limit(0, Some(limit)).unwrap(); + } + + let parquet_plan = df + .create_physical_plan() + .await + .expect("Failed to create physical plan"); + + let input: Arc = if parquet_plan + .properties() + .output_partitioning() + .partition_count() + > 1 + { + Arc::new(CoalescePartitionsExec::new(parquet_plan)) + } else { + parquet_plan + }; + + let partitioning = build_partitioning( + &partitioning_scheme, + num_partitions, + &hash_col_indices, + &schema, + ); + + let shuffle_mode = match shuffle_mode_str.as_str() { + "buffered" => ShuffleMode::Buffered, + _ => ShuffleMode::Immediate, + }; + + let exec = ShuffleWriterExec::try_new( + input, + partitioning, + codec, + data_file, + index_file, + false, + write_buffer_size, + shuffle_mode, + ) + .expect("Failed to create ShuffleWriterExec"); + + let task_ctx = ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + collect(stream).await.unwrap(); + }); + handles.push(handle); + } + + for handle in handles { + handle.await.expect("Task panicked"); + } + + // Clean up task output directories + for task_id in 0..args.concurrent_tasks { + let task_dir = args.output_dir.join(format!("task_{task_id}")); + let _ = fs::remove_dir_all(&task_dir); + } + + start.elapsed().as_secs_f64() + }) +} + fn run_shuffle_read(data_file: &str, index_file: &str, 
num_partitions: usize) -> f64 { let start = Instant::now(); diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index 65fd746f0b..f32ce5d6a8 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -16,11 +16,15 @@ // under the License. //! An immediate-mode shuffle partitioner that repartitions incoming batches and writes -//! directly to per-partition spill files. Unlike `MultiPartitionShuffleRepartitioner`, -//! this implementation does not buffer input batches in memory — it uses Arrow `take` -//! to extract per-partition slices and writes them immediately through per-partition -//! `BufBatchWriter`s. At `shuffle_write` time, the per-partition files are concatenated -//! into the final shuffle data file and index. +//! per-partition data to in-memory buffers containing compressed IPC blocks. Unlike +//! `MultiPartitionShuffleRepartitioner`, this implementation does not buffer raw Arrow +//! `RecordBatch` objects — it uses Arrow `take` to extract per-partition slices and +//! serializes them immediately through per-partition `BufBatchWriter`s into `Vec` +//! buffers. At `shuffle_write` time, the buffers are concatenated into the final +//! shuffle data file and index. +//! +//! Because the buffers hold compressed IPC (typically 5-20% of raw data size), the +//! memory footprint is much lower than holding raw Arrow batches. 
use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::scratch::ScratchSpace; @@ -31,32 +35,28 @@ use arrow::array::{ArrayRef, RecordBatch, UInt32Array}; use arrow::compute::take; use arrow::datatypes::SchemaRef; use datafusion::common::DataFusionError; -use datafusion::execution::disk_manager::RefCountedTempFile; -use datafusion::execution::runtime_env::RuntimeEnv; use datafusion_comet_common::tracing::{with_trace, with_trace_async}; use std::fmt; use std::fmt::{Debug, Formatter}; use std::fs::{File, OpenOptions}; -use std::io::{BufWriter, Seek, Write}; -use std::sync::Arc; +use std::io::{BufWriter, Cursor, Seek, Write}; use tokio::time::Instant; -struct PartitionFileWriter { - temp_file: RefCountedTempFile, - writer: BufBatchWriter, +/// Per-partition in-memory buffer holding compressed IPC blocks. +struct PartitionBuffer { + writer: BufBatchWriter>>, } /// An immediate-mode shuffle partitioner. Each incoming batch is repartitioned using -/// `arrow::compute::take` and the per-partition slices are written directly to -/// per-partition temporary files via `BufBatchWriter`. No input batches are retained -/// in memory beyond the current one. +/// `arrow::compute::take` and the per-partition slices are serialized immediately into +/// per-partition `Vec` buffers (compressed IPC). No input `RecordBatch` objects are +/// retained in memory beyond the current one. 
pub(crate) struct ImmediateShufflePartitioner { output_data_file: String, output_index_file: String, - partition_writers: Vec>, + partition_buffers: Vec>, shuffle_block_writer: ShuffleBlockWriter, partitioning: CometPartitioning, - runtime: Arc, metrics: ShufflePartitionerMetrics, scratch: ScratchSpace, batch_size: usize, @@ -72,7 +72,6 @@ impl ImmediateShufflePartitioner { schema: SchemaRef, partitioning: CometPartitioning, metrics: ShufflePartitionerMetrics, - runtime: Arc, batch_size: usize, codec: CompressionCodec, tracing_enabled: bool, @@ -86,15 +85,14 @@ impl ImmediateShufflePartitioner { let scratch = ScratchSpace::new(&partitioning, batch_size, num_output_partitions); let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec)?; - let partition_writers = (0..num_output_partitions).map(|_| None).collect(); + let partition_buffers = (0..num_output_partitions).map(|_| None).collect(); Ok(Self { output_data_file, output_index_file, - partition_writers, + partition_buffers, shuffle_block_writer, partitioning, - runtime, metrics, scratch, batch_size, @@ -103,29 +101,16 @@ impl ImmediateShufflePartitioner { }) } - fn ensure_partition_writer(&mut self, partition_id: usize) -> datafusion::common::Result<()> { - if self.partition_writers[partition_id].is_none() { - let temp_file = self - .runtime - .disk_manager - .create_tmp_file(&format!("immediate_shuffle_partition_{partition_id}"))?; - let file = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(temp_file.path()) - .map_err(|e| { - DataFusionError::Execution(format!("Error creating partition file: {e}")) - })?; + fn ensure_partition_buffer(&mut self, partition_id: usize) { + if self.partition_buffers[partition_id].is_none() { let writer = BufBatchWriter::new( self.shuffle_block_writer.clone(), - file, + Cursor::new(Vec::new()), self.write_buffer_size, self.batch_size, ); - self.partition_writers[partition_id] = Some(PartitionFileWriter { temp_file, writer }); + 
self.partition_buffers[partition_id] = Some(PartitionBuffer { writer }); } - Ok(()) } fn partitioning_batch(&mut self, input: RecordBatch) -> datafusion::common::Result<()> { @@ -177,9 +162,9 @@ impl ImmediateShufflePartitioner { let partition_batch = sorted_batch.slice(start, end - start); - self.ensure_partition_writer(partition_id)?; - let pw = self.partition_writers[partition_id].as_mut().unwrap(); - pw.writer.write( + self.ensure_partition_buffer(partition_id); + let pb = self.partition_buffers[partition_id].as_mut().unwrap(); + pb.writer.write( &partition_batch, &self.metrics.encode_time, &self.metrics.write_time, @@ -221,7 +206,7 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { with_trace("immediate_shuffle_write", self.tracing_enabled, || { let start_time = Instant::now(); - let num_output_partitions = self.partition_writers.len(); + let num_output_partitions = self.partition_buffers.len(); let mut offsets = vec![0u64; num_output_partitions + 1]; let output_data = OpenOptions::new() @@ -233,18 +218,16 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { let mut output_data = BufWriter::new(output_data); - for (partition_id, pw_slot) in self.partition_writers.iter_mut().enumerate() { + for (partition_id, pb_slot) in self.partition_buffers.iter_mut().enumerate() { offsets[partition_id] = output_data.stream_position()?; - if let Some(mut pw) = pw_slot.take() { - pw.writer + if let Some(mut pb) = pb_slot.take() { + pb.writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; - // Reopen for read without BufReader to allow kernel zero-copy - // (copy_file_range/sendfile) on supported platforms - let mut spill_file = File::open(pw.temp_file.path())?; + let buf = pb.writer.into_writer().into_inner(); let mut write_timer = self.metrics.write_time.timer(); - std::io::copy(&mut spill_file, &mut output_data)?; + output_data.write_all(&buf)?; write_timer.stop(); } } @@ -279,7 +262,7 @@ impl ShufflePartitioner for 
ImmediateShufflePartitioner { impl Debug for ImmediateShufflePartitioner { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.debug_struct("ImmediateShufflePartitioner") - .field("num_partitions", &self.partition_writers.len()) + .field("num_partitions", &self.partition_buffers.len()) .finish() } } diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index bf7dc33e20..1079f4fc12 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -241,7 +241,6 @@ async fn external_shuffle( Arc::clone(&schema), partitioning, metrics, - context.runtime_env(), context.session_config().batch_size(), codec, tracing_enabled, diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index cfddb46539..939af362b9 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -135,6 +135,13 @@ impl, W: Write> BufBatchWriter { } } +impl, W: Write> BufBatchWriter { + /// Consume this BufBatchWriter and return the underlying writer. + pub(crate) fn into_writer(self) -> W { + self.writer + } +} + impl, W: Write + Seek> BufBatchWriter { pub(crate) fn writer_stream_position(&mut self) -> datafusion::common::Result { self.writer.stream_position().map_err(Into::into) From 705130d9094ce8b5f4c710cd2d9ac4dcbc30ba7f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 14:05:27 -0600 Subject: [PATCH 14/18] feat: add memory accounting and spilling to immediate shuffle Register MemoryReservation with DataFusion's memory pool. After each batch, track in-memory buffer growth. When try_grow fails (memory pressure), spill all partition buffers to per-partition temp files, clear the Vec buffers, and free the reservation. In shuffle_write, copy spill files first then remaining in-memory data. 
Benchmark (10M rows, 200 partitions, zstd): No limit: 3.98s (2.51M rows/s) 50MB limit: 4.33s (2.31M rows/s) -- 9% spilling overhead --- native/shuffle/src/bin/shuffle_bench.rs | 17 +- .../src/partitioners/immediate_partition.rs | 155 ++++++++++++++++-- native/shuffle/src/shuffle_writer.rs | 2 + .../shuffle/src/writers/buf_batch_writer.rs | 16 ++ 4 files changed, 163 insertions(+), 27 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 67dbb9fcc9..079a890a19 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -179,13 +179,7 @@ fn main() { }; let write_elapsed = if args.concurrent_tasks > 1 { - run_concurrent_shuffle_writes( - &args.input, - &schema, - &codec, - &hash_col_indices, - &args, - ) + run_concurrent_shuffle_writes(&args.input, &schema, &codec, &hash_col_indices, &args) } else { run_shuffle_write( &args.input, @@ -228,8 +222,7 @@ fn main() { println!("=== Results ==="); let avg_write = write_times.iter().sum::() / write_times.len() as f64; - let write_throughput_rows = - (total_rows as f64 * args.concurrent_tasks as f64) / avg_write; + let write_throughput_rows = (total_rows as f64 * args.concurrent_tasks as f64) / avg_write; println!("Write:"); println!(" avg time: {:.3}s", avg_write); @@ -247,8 +240,7 @@ fn main() { args.concurrent_tasks ); if args.concurrent_tasks <= 1 { - let avg_data_size = - data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; + let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; println!( " output size: {}", format_bytes(avg_data_size as usize) @@ -256,8 +248,7 @@ fn main() { } if !read_times.is_empty() { - let avg_data_size = - data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; + let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; let avg_read = read_times.iter().sum::() / read_times.len() as f64; let read_throughput_bytes = avg_data_size 
as f64 / avg_read; diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index f32ce5d6a8..59393391d7 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -20,11 +20,9 @@ //! `MultiPartitionShuffleRepartitioner`, this implementation does not buffer raw Arrow //! `RecordBatch` objects — it uses Arrow `take` to extract per-partition slices and //! serializes them immediately through per-partition `BufBatchWriter`s into `Vec` -//! buffers. At `shuffle_write` time, the buffers are concatenated into the final -//! shuffle data file and index. -//! -//! Because the buffers hold compressed IPC (typically 5-20% of raw data size), the -//! memory footprint is much lower than holding raw Arrow batches. +//! buffers. Under memory pressure, buffers are spilled to per-partition temp files. +//! At `shuffle_write` time, spill files (if any) and remaining in-memory buffers are +//! concatenated into the final shuffle data file and index. use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::scratch::ScratchSpace; @@ -35,31 +33,46 @@ use arrow::array::{ArrayRef, RecordBatch, UInt32Array}; use arrow::compute::take; use arrow::datatypes::SchemaRef; use datafusion::common::DataFusionError; +use datafusion::execution::disk_manager::RefCountedTempFile; +use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation}; +use datafusion::execution::runtime_env::RuntimeEnv; use datafusion_comet_common::tracing::{with_trace, with_trace_async}; use std::fmt; use std::fmt::{Debug, Formatter}; use std::fs::{File, OpenOptions}; use std::io::{BufWriter, Cursor, Seek, Write}; +use std::sync::Arc; use tokio::time::Instant; -/// Per-partition in-memory buffer holding compressed IPC blocks. +/// Per-partition state: in-memory buffer for compressed IPC, plus an optional +/// spill file from prior memory-pressure events. 
struct PartitionBuffer { writer: BufBatchWriter>>, + /// Spill file accumulating data from prior spill events. Created lazily + /// on first spill. Subsequent spills append to the same file. + spill_file: Option, } -/// An immediate-mode shuffle partitioner. Each incoming batch is repartitioned using -/// `arrow::compute::take` and the per-partition slices are serialized immediately into -/// per-partition `Vec` buffers (compressed IPC). No input `RecordBatch` objects are -/// retained in memory beyond the current one. +struct SpillFile { + _temp_file: RefCountedTempFile, + file: File, +} + +/// An immediate-mode shuffle partitioner with memory accounting and spilling. +/// Each incoming batch is repartitioned using `arrow::compute::take` and the +/// per-partition slices are serialized into per-partition `Vec` buffers +/// (compressed IPC). Under memory pressure, buffers are spilled to temp files. pub(crate) struct ImmediateShufflePartitioner { output_data_file: String, output_index_file: String, partition_buffers: Vec>, shuffle_block_writer: ShuffleBlockWriter, partitioning: CometPartitioning, + runtime: Arc, metrics: ShufflePartitionerMetrics, scratch: ScratchSpace, batch_size: usize, + reservation: MemoryReservation, tracing_enabled: bool, write_buffer_size: usize, } @@ -67,11 +80,13 @@ pub(crate) struct ImmediateShufflePartitioner { impl ImmediateShufflePartitioner { #[allow(clippy::too_many_arguments)] pub(crate) fn try_new( + partition: usize, output_data_file: String, output_index_file: String, schema: SchemaRef, partitioning: CometPartitioning, metrics: ShufflePartitionerMetrics, + runtime: Arc, batch_size: usize, codec: CompressionCodec, tracing_enabled: bool, @@ -87,15 +102,21 @@ impl ImmediateShufflePartitioner { let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec)?; let partition_buffers = (0..num_output_partitions).map(|_| None).collect(); + let reservation = 
MemoryConsumer::new(format!("ImmediateShufflePartitioner[{partition}]")) + .with_can_spill(true) + .register(&runtime.memory_pool); + Ok(Self { output_data_file, output_index_file, partition_buffers, shuffle_block_writer, partitioning, + runtime, metrics, scratch, batch_size, + reservation, tracing_enabled, write_buffer_size, }) @@ -109,8 +130,92 @@ impl ImmediateShufflePartitioner { self.write_buffer_size, self.batch_size, ); - self.partition_buffers[partition_id] = Some(PartitionBuffer { writer }); + self.partition_buffers[partition_id] = Some(PartitionBuffer { + writer, + spill_file: None, + }); + } + } + + /// Spill all in-memory partition buffers to disk. + fn spill(&mut self) -> datafusion::common::Result<()> { + let total_buffered: usize = self + .partition_buffers + .iter() + .filter_map(|pb| pb.as_ref()) + .map(|pb| pb.writer.inner_buffer_len()) + .sum(); + + if total_buffered == 0 { + return Ok(()); + } + + log::info!( + "ImmediateShufflePartitioner spilling {} to disk ({} time(s) so far)", + total_buffered, + self.metrics.spill_count.value() + ); + + let mut spilled_bytes = 0usize; + + for pb in self.partition_buffers.iter_mut().flatten() { + // Flush the BufBatchWriter's coalescer into the Cursor> + pb.writer + .flush(&self.metrics.encode_time, &self.metrics.write_time)?; + + let buf = pb.writer.inner_mut().get_ref(); + if buf.is_empty() { + continue; + } + + // Ensure spill file exists + if pb.spill_file.is_none() { + let temp_file = self + .runtime + .disk_manager + .create_tmp_file("immediate_shuffle_spill")?; + let file = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(temp_file.path()) + .map_err(|e| { + DataFusionError::Execution(format!("Error creating spill file: {e}")) + })?; + pb.spill_file = Some(SpillFile { + _temp_file: temp_file, + file, + }); + } + + // Append buffer contents to spill file + let spill = pb.spill_file.as_mut().unwrap(); + let mut write_timer = self.metrics.write_time.timer(); + 
spill.file.write_all(buf)?; + write_timer.stop(); + spilled_bytes += buf.len(); + + // Reset the in-memory buffer (keep allocated capacity for reuse) + let cursor = pb.writer.inner_mut(); + cursor.get_mut().clear(); + cursor.set_position(0); } + + self.reservation.free(); + self.metrics.spill_count.add(1); + self.metrics.spilled_bytes.add(spilled_bytes); + + Ok(()) + } + + /// Returns the total size of in-memory buffers across all partitions. + /// Includes both the BufBatchWriter staging buffer and the Cursor> output. + fn total_buffer_size(&self) -> usize { + self.partition_buffers + .iter() + .filter_map(|pb| pb.as_ref()) + .map(|pb| pb.writer.inner_buffer_len() + pb.writer.inner_ref().get_ref().len()) + .sum() } fn partitioning_batch(&mut self, input: RecordBatch) -> datafusion::common::Result<()> { @@ -153,6 +258,8 @@ impl ImmediateShufflePartitioner { .collect::>>()?; let sorted_batch = RecordBatch::try_new(input.schema(), sorted_columns)?; + let size_before = self.total_buffer_size(); + for partition_id in 0..num_output_partitions { let start = scratch.partition_starts[partition_id] as usize; let end = scratch.partition_starts[partition_id + 1] as usize; @@ -171,6 +278,12 @@ impl ImmediateShufflePartitioner { )?; } + let size_after = self.total_buffer_size(); + let mem_growth = size_after.saturating_sub(size_before); + if mem_growth > 0 && self.reservation.try_grow(mem_growth).is_err() { + self.spill()?; + } + self.scratch = scratch; Ok(()) } @@ -222,13 +335,26 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { offsets[partition_id] = output_data.stream_position()?; if let Some(mut pb) = pb_slot.take() { + // Copy spill file contents first (from prior spill events) + if let Some(mut spill) = pb.spill_file.take() { + spill.file.flush()?; + let spill_path = spill._temp_file.path().to_path_buf(); + let mut spill_reader = File::open(&spill_path)?; + let mut write_timer = self.metrics.write_time.timer(); + std::io::copy(&mut spill_reader, &mut 
output_data)?; + write_timer.stop(); + } + + // Flush and write remaining in-memory buffer pb.writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; let buf = pb.writer.into_writer().into_inner(); - let mut write_timer = self.metrics.write_time.timer(); - output_data.write_all(&buf)?; - write_timer.stop(); + if !buf.is_empty() { + let mut write_timer = self.metrics.write_time.timer(); + output_data.write_all(&buf)?; + write_timer.stop(); + } } } @@ -263,6 +389,7 @@ impl Debug for ImmediateShufflePartitioner { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.debug_struct("ImmediateShufflePartitioner") .field("num_partitions", &self.partition_buffers.len()) + .field("memory_used", &self.reservation.size()) .finish() } } diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index 1079f4fc12..67314b4961 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -236,11 +236,13 @@ async fn external_shuffle( } _ if shuffle_mode == ShuffleMode::Immediate => { Box::new(ImmediateShufflePartitioner::try_new( + partition, output_data_file, output_index_file, Arc::clone(&schema), partitioning, metrics, + context.runtime_env(), context.session_config().batch_size(), codec, tracing_enabled, diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index 939af362b9..407df4ee9d 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -140,6 +140,22 @@ impl, W: Write> BufBatchWriter { pub(crate) fn into_writer(self) -> W { self.writer } + + /// Returns a reference to the underlying writer. + pub(crate) fn inner_ref(&self) -> &W { + &self.writer + } + + /// Returns a mutable reference to the underlying writer. 
+ pub(crate) fn inner_mut(&mut self) -> &mut W { + &mut self.writer + } + + /// Returns the total buffered size: the internal byte buffer plus + /// the BufBatchWriter's own serialization buffer. + pub(crate) fn inner_buffer_len(&self) -> usize { + self.buffer.len() + } } impl, W: Write + Seek> BufBatchWriter { From 862d01e0bcf9c5b758137dfc0afac9466558e315 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 14:12:50 -0600 Subject: [PATCH 15/18] refactor: encapsulate buffer access and extract shared index writer - Replace leaky inner_ref/inner_mut/inner_buffer_len methods on BufBatchWriter with encapsulated Cursor>-specific methods: buffered_output_size, reset_output_buffer, output_bytes - Extract write_index_file utility, replacing duplicate code in all three partitioners - Rename _temp_file to temp_file (RAII guard, not unused) - Remove unnecessary comments --- .../src/partitioners/immediate_partition.rs | 73 ++++--------------- .../src/partitioners/multi_partition.rs | 9 +-- .../src/partitioners/single_partition.rs | 14 +--- .../shuffle/src/writers/buf_batch_writer.rs | 26 ++++--- native/shuffle/src/writers/mod.rs | 17 +++++ 5 files changed, 50 insertions(+), 89 deletions(-) diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index 59393391d7..44a42e3fa4 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -15,19 +15,14 @@ // specific language governing permissions and limitations // under the License. -//! An immediate-mode shuffle partitioner that repartitions incoming batches and writes -//! per-partition data to in-memory buffers containing compressed IPC blocks. Unlike -//! `MultiPartitionShuffleRepartitioner`, this implementation does not buffer raw Arrow -//! `RecordBatch` objects — it uses Arrow `take` to extract per-partition slices and -//! 
serializes them immediately through per-partition `BufBatchWriter`s into `Vec` -//! buffers. Under memory pressure, buffers are spilled to per-partition temp files. -//! At `shuffle_write` time, spill files (if any) and remaining in-memory buffers are -//! concatenated into the final shuffle data file and index. +//! Immediate-mode shuffle partitioner: repartitions each incoming batch via Arrow `take`, +//! serializes per-partition slices into in-memory compressed IPC buffers, and spills to +//! disk under memory pressure. use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::scratch::ScratchSpace; use crate::partitioners::ShufflePartitioner; -use crate::writers::BufBatchWriter; +use crate::writers::{write_index_file, BufBatchWriter}; use crate::{CometPartitioning, CompressionCodec, ShuffleBlockWriter}; use arrow::array::{ArrayRef, RecordBatch, UInt32Array}; use arrow::compute::take; @@ -44,24 +39,17 @@ use std::io::{BufWriter, Cursor, Seek, Write}; use std::sync::Arc; use tokio::time::Instant; -/// Per-partition state: in-memory buffer for compressed IPC, plus an optional -/// spill file from prior memory-pressure events. struct PartitionBuffer { writer: BufBatchWriter>>, - /// Spill file accumulating data from prior spill events. Created lazily - /// on first spill. Subsequent spills append to the same file. spill_file: Option, } struct SpillFile { - _temp_file: RefCountedTempFile, + /// Kept alive to prevent temp file deletion (RAII guard). + temp_file: RefCountedTempFile, file: File, } -/// An immediate-mode shuffle partitioner with memory accounting and spilling. -/// Each incoming batch is repartitioned using `arrow::compute::take` and the -/// per-partition slices are serialized into per-partition `Vec` buffers -/// (compressed IPC). Under memory pressure, buffers are spilled to temp files. 
pub(crate) struct ImmediateShufflePartitioner { output_data_file: String, output_index_file: String, @@ -137,38 +125,27 @@ impl ImmediateShufflePartitioner { } } - /// Spill all in-memory partition buffers to disk. fn spill(&mut self) -> datafusion::common::Result<()> { - let total_buffered: usize = self - .partition_buffers - .iter() - .filter_map(|pb| pb.as_ref()) - .map(|pb| pb.writer.inner_buffer_len()) - .sum(); - - if total_buffered == 0 { + if self.partition_buffers.iter().all(|pb| pb.is_none()) { return Ok(()); } log::info!( - "ImmediateShufflePartitioner spilling {} to disk ({} time(s) so far)", - total_buffered, + "ImmediateShufflePartitioner spilling to disk ({} time(s) so far)", self.metrics.spill_count.value() ); let mut spilled_bytes = 0usize; for pb in self.partition_buffers.iter_mut().flatten() { - // Flush the BufBatchWriter's coalescer into the Cursor> pb.writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; - let buf = pb.writer.inner_mut().get_ref(); + let buf = pb.writer.output_bytes(); if buf.is_empty() { continue; } - // Ensure spill file exists if pb.spill_file.is_none() { let temp_file = self .runtime @@ -182,23 +159,16 @@ impl ImmediateShufflePartitioner { .map_err(|e| { DataFusionError::Execution(format!("Error creating spill file: {e}")) })?; - pb.spill_file = Some(SpillFile { - _temp_file: temp_file, - file, - }); + pb.spill_file = Some(SpillFile { temp_file, file }); } - // Append buffer contents to spill file let spill = pb.spill_file.as_mut().unwrap(); let mut write_timer = self.metrics.write_time.timer(); spill.file.write_all(buf)?; write_timer.stop(); spilled_bytes += buf.len(); - // Reset the in-memory buffer (keep allocated capacity for reuse) - let cursor = pb.writer.inner_mut(); - cursor.get_mut().clear(); - cursor.set_position(0); + pb.writer.reset_output_buffer(); } self.reservation.free(); @@ -208,13 +178,11 @@ impl ImmediateShufflePartitioner { Ok(()) } - /// Returns the total size of in-memory buffers across 
all partitions. - /// Includes both the BufBatchWriter staging buffer and the Cursor> output. fn total_buffer_size(&self) -> usize { self.partition_buffers .iter() .filter_map(|pb| pb.as_ref()) - .map(|pb| pb.writer.inner_buffer_len() + pb.writer.inner_ref().get_ref().len()) + .map(|pb| pb.writer.buffered_output_size()) .sum() } @@ -278,8 +246,7 @@ impl ImmediateShufflePartitioner { )?; } - let size_after = self.total_buffer_size(); - let mem_growth = size_after.saturating_sub(size_before); + let mem_growth = self.total_buffer_size().saturating_sub(size_before); if mem_growth > 0 && self.reservation.try_grow(mem_growth).is_err() { self.spill()?; } @@ -335,17 +302,14 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { offsets[partition_id] = output_data.stream_position()?; if let Some(mut pb) = pb_slot.take() { - // Copy spill file contents first (from prior spill events) if let Some(mut spill) = pb.spill_file.take() { spill.file.flush()?; - let spill_path = spill._temp_file.path().to_path_buf(); - let mut spill_reader = File::open(&spill_path)?; + let mut spill_reader = File::open(spill.temp_file.path())?; let mut write_timer = self.metrics.write_time.timer(); std::io::copy(&mut spill_reader, &mut output_data)?; write_timer.stop(); } - // Flush and write remaining in-memory buffer pb.writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; @@ -365,14 +329,7 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { offsets[num_output_partitions] = output_data.stream_position()?; let mut write_timer = self.metrics.write_time.timer(); - let mut output_index = - BufWriter::new(File::create(&self.output_index_file).map_err(|e| { - DataFusionError::Execution(format!("shuffle write error: {e:?}")) - })?); - for offset in offsets { - output_index.write_all(&(offset as i64).to_le_bytes()[..])?; - } - output_index.flush()?; + write_index_file(&self.output_index_file, &offsets)?; write_timer.stop(); self.metrics diff --git 
a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 8a5007142c..ccc3ed225e 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -371,14 +371,7 @@ impl ShufflePartitioner for MultiPartitionShuffleRepartitioner { offsets[num_output_partitions] = output_data.stream_position()?; let mut write_timer = self.metrics.write_time.timer(); - let mut output_index = - BufWriter::new(File::create(index_file).map_err(|e| { - DataFusionError::Execution(format!("shuffle write error: {e:?}")) - })?); - for offset in offsets { - output_index.write_all(&(offset as i64).to_le_bytes()[..])?; - } - output_index.flush()?; + crate::writers::write_index_file(&index_file, &offsets)?; write_timer.stop(); self.metrics diff --git a/native/shuffle/src/partitioners/single_partition.rs b/native/shuffle/src/partitioners/single_partition.rs index 5801ef613b..778c1d9416 100644 --- a/native/shuffle/src/partitioners/single_partition.rs +++ b/native/shuffle/src/partitioners/single_partition.rs @@ -23,7 +23,6 @@ use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; use datafusion::common::DataFusionError; use std::fs::{File, OpenOptions}; -use std::io::{BufWriter, Write}; use tokio::time::Instant; /// A partitioner that writes all shuffle data to a single file and a single index file @@ -169,19 +168,8 @@ impl ShufflePartitioner for SinglePartitionShufflePartitioner { self.output_data_writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; - // Write index file. 
It should only contain 2 entries: 0 and the total number of bytes written - let index_file = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(self.output_index_path.clone()) - .map_err(|e| DataFusionError::Execution(format!("shuffle write error: {e:?}")))?; - let mut index_buf_writer = BufWriter::new(index_file); let data_file_length = self.output_data_writer.writer_stream_position()?; - for offset in [0, data_file_length] { - index_buf_writer.write_all(&(offset as i64).to_le_bytes()[..])?; - } - index_buf_writer.flush()?; + crate::writers::write_index_file(&self.output_index_path, &[0, data_file_length])?; self.metrics .baseline diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index 407df4ee9d..84f658240d 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -140,21 +140,27 @@ impl, W: Write> BufBatchWriter { pub(crate) fn into_writer(self) -> W { self.writer } +} - /// Returns a reference to the underlying writer. - pub(crate) fn inner_ref(&self) -> &W { - &self.writer +impl> BufBatchWriter>> { + /// Returns the total bytes buffered: the staging buffer (pre-flush to writer) + /// plus the accumulated output in the Cursor's Vec. + pub(crate) fn buffered_output_size(&self) -> usize { + self.buffer.len() + self.writer.get_ref().len() } - /// Returns a mutable reference to the underlying writer. - pub(crate) fn inner_mut(&mut self) -> &mut W { - &mut self.writer + /// Reset the output buffer, keeping allocated capacity for reuse. + /// Returns the number of bytes that were in the output buffer. + pub(crate) fn reset_output_buffer(&mut self) -> usize { + let len = self.writer.get_ref().len(); + self.writer.get_mut().clear(); + self.writer.set_position(0); + len } - /// Returns the total buffered size: the internal byte buffer plus - /// the BufBatchWriter's own serialization buffer. 
- pub(crate) fn inner_buffer_len(&self) -> usize { - self.buffer.len() + /// Returns a reference to the accumulated output bytes. + pub(crate) fn output_bytes(&self) -> &[u8] { + self.writer.get_ref() } } diff --git a/native/shuffle/src/writers/mod.rs b/native/shuffle/src/writers/mod.rs index 75caf9f3a3..8de3c7ec9d 100644 --- a/native/shuffle/src/writers/mod.rs +++ b/native/shuffle/src/writers/mod.rs @@ -24,3 +24,20 @@ pub(crate) use buf_batch_writer::BufBatchWriter; pub(crate) use checksum::Checksum; pub use shuffle_block_writer::{CompressionCodec, ShuffleBlockWriter}; pub(crate) use spill::PartitionWriter; + +use datafusion::common::DataFusionError; +use std::fs::File; +use std::io::{BufWriter, Write}; + +/// Write shuffle index file: an array of i64 little-endian byte offsets. +pub(crate) fn write_index_file(path: &str, offsets: &[u64]) -> datafusion::common::Result<()> { + let mut writer = BufWriter::new( + File::create(path) + .map_err(|e| DataFusionError::Execution(format!("shuffle write error: {e:?}")))?, + ); + for &offset in offsets { + writer.write_all(&(offset as i64).to_le_bytes()[..])?; + } + writer.flush()?; + Ok(()) +} From e7d38daaef77de89cd23db306fd8e78c9e93fd6a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 15:14:51 -0600 Subject: [PATCH 16/18] fix: use per-partition take instead of full-batch reorder The take-then-slice approach (single take to reorder entire batch, then zero-copy slice per partition) caused the BatchCoalescer to copy all data again via copy_rows, resulting in 2x data movement. When combined with concat_batches (attempted optimization), it produced 66 GiB output instead of 396 MiB due to missing coalescing. Switch to per-partition take with shared UInt32Array indices (sliced per partition to avoid allocation). Each partition's small batch (~40 rows) goes through BufBatchWriter's BatchCoalescer which handles proper IPC block coalescing and compression. 
Benchmark (10M rows, 200 partitions, zstd): buffered: 4.84s (2.07M rows/s) immediate: 4.38s (2.28M rows/s) -- 10% faster --- .../src/partitioners/immediate_partition.rs | 52 +++++++++---------- .../shuffle/src/writers/buf_batch_writer.rs | 10 ++-- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index 44a42e3fa4..f69e5addc1 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! Immediate-mode shuffle partitioner: repartitions each incoming batch via Arrow `take`, -//! serializes per-partition slices into in-memory compressed IPC buffers, and spills to -//! disk under memory pressure. +//! Immediate-mode shuffle partitioner: repartitions each incoming batch via per-partition +//! Arrow `take`, serializes through per-partition `BufBatchWriter`s into in-memory +//! compressed IPC buffers, and spills to disk under memory pressure. 
use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::scratch::ScratchSpace; @@ -126,10 +126,6 @@ impl ImmediateShufflePartitioner { } fn spill(&mut self) -> datafusion::common::Result<()> { - if self.partition_buffers.iter().all(|pb| pb.is_none()) { - return Ok(()); - } - log::info!( "ImmediateShufflePartitioner spilling to disk ({} time(s) so far)", self.metrics.spill_count.value() @@ -178,14 +174,6 @@ impl ImmediateShufflePartitioner { Ok(()) } - fn total_buffer_size(&self) -> usize { - self.partition_buffers - .iter() - .filter_map(|pb| pb.as_ref()) - .map(|pb| pb.writer.buffered_output_size()) - .sum() - } - fn partitioning_batch(&mut self, input: RecordBatch) -> datafusion::common::Result<()> { if input.num_rows() == 0 { return Ok(()); @@ -210,21 +198,14 @@ impl ImmediateShufflePartitioner { timer.stop(); } - // Single take per column to reorder entire batch by partition assignment, - // then zero-copy slice per partition. This replaces P*C take calls with just C. + // Build a single UInt32Array from partition_row_indices and slice per partition + // to avoid per-partition allocation. Per-partition take produces small batches + // (~40 rows) that the BufBatchWriter's BatchCoalescer accumulates into batch_size + // chunks before serializing to compressed IPC. 
let num_rows = input.num_rows(); let all_indices = UInt32Array::from_iter_values( scratch.partition_row_indices[..num_rows].iter().copied(), ); - let sorted_columns: Vec = input - .columns() - .iter() - .map(|col| { - take(col, &all_indices, None) - .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) - }) - .collect::>>()?; - let sorted_batch = RecordBatch::try_new(input.schema(), sorted_columns)?; let size_before = self.total_buffer_size(); @@ -235,7 +216,16 @@ impl ImmediateShufflePartitioner { continue; } - let partition_batch = sorted_batch.slice(start, end - start); + let indices = all_indices.slice(start, end - start); + let columns: Vec = input + .columns() + .iter() + .map(|col| { + take(col, &indices, None) + .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) + }) + .collect::>>()?; + let partition_batch = RecordBatch::try_new(input.schema(), columns)?; self.ensure_partition_buffer(partition_id); let pb = self.partition_buffers[partition_id].as_mut().unwrap(); @@ -254,6 +244,14 @@ impl ImmediateShufflePartitioner { self.scratch = scratch; Ok(()) } + + fn total_buffer_size(&self) -> usize { + self.partition_buffers + .iter() + .filter_map(|pb| pb.as_ref()) + .map(|pb| pb.writer.buffered_output_size()) + .sum() + } } #[async_trait::async_trait] diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index 84f658240d..0b10396ead 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -143,22 +143,18 @@ impl, W: Write> BufBatchWriter { } impl> BufBatchWriter>> { - /// Returns the total bytes buffered: the staging buffer (pre-flush to writer) - /// plus the accumulated output in the Cursor's Vec. + /// Total bytes buffered: staging buffer + accumulated compressed IPC output. 
pub(crate) fn buffered_output_size(&self) -> usize { self.buffer.len() + self.writer.get_ref().len() } /// Reset the output buffer, keeping allocated capacity for reuse. - /// Returns the number of bytes that were in the output buffer. - pub(crate) fn reset_output_buffer(&mut self) -> usize { - let len = self.writer.get_ref().len(); + pub(crate) fn reset_output_buffer(&mut self) { self.writer.get_mut().clear(); self.writer.set_position(0); - len } - /// Returns a reference to the accumulated output bytes. + /// Returns a reference to the accumulated compressed IPC output. pub(crate) fn output_bytes(&self) -> &[u8] { self.writer.get_ref() } From 86bae923cc5d45503bd8fd1f0fae356fb692c038 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 15:53:58 -0600 Subject: [PATCH 17/18] revert: default shuffle mode back to buffered --- common/src/main/scala/org/apache/comet/CometConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index 908c5638c1..3f8d6bd221 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -545,7 +545,7 @@ object CometConf extends ShimCometConf { "data directly to individual files, avoiding in-memory buffering of input batches.") .stringConf .checkValues(Set("buffered", "immediate")) - .createWithDefault("immediate") + .createWithDefault("buffered") val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( "spark.comet.shuffle.preferDictionary.ratio") From 3aade30844a57cd065054d34677dd440646b60c3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 29 Mar 2026 14:39:05 -0600 Subject: [PATCH 18/18] perf: single take() + zero-copy slice() instead of per-partition take() --- .../src/partitioners/immediate_partition.rs | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git 
a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index f69e5addc1..1bad02155f 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -198,15 +198,24 @@ impl ImmediateShufflePartitioner { timer.stop(); } - // Build a single UInt32Array from partition_row_indices and slice per partition - // to avoid per-partition allocation. Per-partition take produces small batches - // (~40 rows) that the BufBatchWriter's BatchCoalescer accumulates into batch_size - // chunks before serializing to compressed IPC. + // Reorder the entire batch once by partition using a single take(), then + // use zero-copy slice() per partition. This replaces N per-partition take() + // calls (each allocating new arrays) with 1 take() + N free slice() calls. let num_rows = input.num_rows(); let all_indices = UInt32Array::from_iter_values( scratch.partition_row_indices[..num_rows].iter().copied(), ); + let reordered_columns: Vec = input + .columns() + .iter() + .map(|col| { + take(col, &all_indices, None) + .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) + }) + .collect::>>()?; + let reordered = RecordBatch::try_new(input.schema(), reordered_columns)?; + let size_before = self.total_buffer_size(); for partition_id in 0..num_output_partitions { @@ -216,16 +225,7 @@ impl ImmediateShufflePartitioner { continue; } - let indices = all_indices.slice(start, end - start); - let columns: Vec = input - .columns() - .iter() - .map(|col| { - take(col, &indices, None) - .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) - }) - .collect::>>()?; - let partition_batch = RecordBatch::try_new(input.schema(), columns)?; + let partition_batch = reordered.slice(start, end - start); self.ensure_partition_buffer(partition_id); let pb = self.partition_buffers[partition_id].as_mut().unwrap();