From ae0741b8d6f203d881de12f9f5d8f0d326f5093c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 21 Mar 2026 07:43:26 -0600 Subject: [PATCH 01/18] feat: add standalone shuffle benchmark binary for profiling Add a `shuffle_bench` binary that benchmarks shuffle write and read performance independently from Spark, making it easy to profile with tools like `cargo flamegraph`, `perf`, or `instruments`. Supports reading Parquet files (e.g. TPC-H/TPC-DS) or generating synthetic data with configurable schema. Covers different scenarios including compression codecs, partition counts, partitioning schemes, and memory-constrained spilling. --- native/Cargo.lock | 88 +++- native/core/Cargo.toml | 5 + native/core/src/bin/shuffle_bench.rs | 725 +++++++++++++++++++++++++++ 3 files changed, 816 insertions(+), 2 deletions(-) create mode 100644 native/core/src/bin/shuffle_bench.rs diff --git a/native/Cargo.lock b/native/Cargo.lock index 5f99c614b3..f43b41dd9a 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -96,12 +96,56 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + [[package]] name = "anstyle" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = 
"anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.60.2", +] + [[package]] name = "anyhow" version = "1.0.102" @@ -1331,6 +1375,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] @@ -1339,8 +1384,22 @@ version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" dependencies = [ + "anstream", "anstyle", "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", ] [[package]] @@ -1358,6 +1417,12 @@ dependencies = [ "cc", ] +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + [[package]] name = "combine" version = "4.6.7" @@ -1834,6 +1899,7 @@ dependencies = [ "aws-config", "aws-credential-types", "bytes", + "clap", "crc32fast", "criterion", "datafusion", @@ -1885,7 +1951,7 @@ dependencies = [ [[package]] name = "datafusion-comet-common" -version = "0.14.0" +version = "0.15.0" dependencies = [ "arrow", "datafusion", @@ -1911,7 +1977,7 @@ dependencies = [ [[package]] name = 
"datafusion-comet-jni-bridge" -version = "0.14.0" +version = "0.15.0" dependencies = [ "arrow", "assertables", @@ -3609,6 +3675,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.13.0" @@ -4289,6 +4361,12 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "oorandom" version = "11.1.5" @@ -6339,6 +6417,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.22.0" diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index 3f305a631d..3df9e55719 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -72,6 +72,7 @@ url = { workspace = true } aws-config = { workspace = true } aws-credential-types = { workspace = true } parking_lot = "0.12.5" +clap = { version = "4", features = ["derive"] } datafusion-comet-objectstore-hdfs = { path = "../hdfs", optional = true, default-features = false, features = ["hdfs"] } reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2"] } object_store_opendal = {version = "0.55.0", optional = true} @@ -113,6 +114,10 @@ name = "comet" # "rlib" is for 
benchmarking with criterion. crate-type = ["cdylib", "rlib"] +[[bin]] +name = "shuffle_bench" +path = "src/bin/shuffle_bench.rs" + [[bench]] name = "parquet_read" harness = false diff --git a/native/core/src/bin/shuffle_bench.rs b/native/core/src/bin/shuffle_bench.rs new file mode 100644 index 0000000000..c1498161f7 --- /dev/null +++ b/native/core/src/bin/shuffle_bench.rs @@ -0,0 +1,725 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Standalone shuffle benchmark tool for profiling Comet shuffle write and read +//! outside of Spark. +//! +//! # Usage +//! +//! Read from Parquet files (e.g. TPC-H lineitem): +//! ```sh +//! cargo run --release --bin shuffle_bench -- \ +//! --input /data/tpch-sf100/lineitem/ \ +//! --partitions 200 \ +//! --codec zstd --zstd-level 1 \ +//! --hash-columns 0,3 \ +//! --read-back +//! ``` +//! +//! Generate synthetic data: +//! ```sh +//! cargo run --release --bin shuffle_bench -- \ +//! --generate --gen-rows 10000000 --gen-string-cols 4 --gen-int-cols 4 \ +//! --gen-decimal-cols 2 --gen-avg-string-len 32 \ +//! --partitions 200 --codec lz4 --read-back +//! ``` +//! +//! Profile with flamegraph: +//! ```sh +//! cargo flamegraph --release --bin shuffle_bench -- \ +//! 
--input /data/tpch-sf100/lineitem/ \ +//! --partitions 200 --codec zstd --zstd-level 1 +//! ``` + +use arrow::array::builder::{Date32Builder, Decimal128Builder, Int64Builder, StringBuilder}; +use arrow::array::RecordBatch; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use clap::Parser; +use comet::execution::shuffle::{ + read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleWriterExec, +}; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; +use datafusion::execution::config::SessionConfig; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_plan::common::collect; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::prelude::SessionContext; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use rand::RngExt; +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +#[derive(Parser, Debug)] +#[command( + name = "shuffle_bench", + about = "Standalone benchmark for Comet shuffle write and read performance" )] +struct Args { + /// Path to input Parquet file or directory of Parquet files + #[arg(long)] + input: Option<PathBuf>, + + /// Generate synthetic data instead of reading from Parquet + #[arg(long, default_value_t = false)] + generate: bool, + + /// Number of rows to generate (requires --generate) + #[arg(long, default_value_t = 1_000_000)] + gen_rows: usize, + + /// Number of Int64 columns to generate + #[arg(long, default_value_t = 4)] + gen_int_cols: usize, + + /// Number of Utf8 string columns to generate + #[arg(long, default_value_t = 2)] + gen_string_cols: usize, + + /// Number of Decimal128 columns to generate + #[arg(long, default_value_t = 2)] + gen_decimal_cols: usize, + + /// Number of Date32 columns to generate + #[arg(long, default_value_t = 1)] + gen_date_cols: usize, + + /// Average string length for generated string columns + 
#[arg(long, default_value_t = 24)] + gen_avg_string_len: usize, + + /// Batch size for reading Parquet or generating data + #[arg(long, default_value_t = 8192)] + batch_size: usize, + + /// Number of output shuffle partitions + #[arg(long, default_value_t = 200)] + partitions: usize, + + /// Partitioning scheme: hash, single, round-robin + #[arg(long, default_value = "hash")] + partitioning: String, + + /// Column indices to hash on (comma-separated, e.g. "0,3") + #[arg(long, default_value = "0")] + hash_columns: String, + + /// Compression codec: none, lz4, zstd, snappy + #[arg(long, default_value = "zstd")] + codec: String, + + /// Zstd compression level (1-22) + #[arg(long, default_value_t = 1)] + zstd_level: i32, + + /// Memory limit in bytes (triggers spilling when exceeded) + #[arg(long)] + memory_limit: Option<usize>, + + /// Also benchmark reading back the shuffle output + #[arg(long, default_value_t = false)] + read_back: bool, + + /// Number of iterations to run + #[arg(long, default_value_t = 1)] + iterations: usize, + + /// Number of warmup iterations before timing + #[arg(long, default_value_t = 0)] + warmup: usize, + + /// Output directory for shuffle data/index files + #[arg(long, default_value = "/tmp/comet_shuffle_bench")] + output_dir: PathBuf, + + /// Write buffer size in bytes + #[arg(long, default_value_t = 1048576)] + write_buffer_size: usize, +} + +fn main() { + let args = Args::parse(); + + // Validate args + if args.input.is_none() && !args.generate { + eprintln!("Error: must specify either --input or --generate"); + std::process::exit(1); + } + + // Create output directory + fs::create_dir_all(&args.output_dir).expect("Failed to create output directory"); + + let data_file = args.output_dir.join("data.out"); + let index_file = args.output_dir.join("index.out"); + + // Load data + let load_start = Instant::now(); + let batches = if let Some(ref input_path) = args.input { + load_parquet(input_path, args.batch_size) + } else { + generate_data(&args)
+ }; + let load_elapsed = load_start.elapsed(); + + let schema = batches[0].schema(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); + + println!("=== Shuffle Benchmark ==="); + println!( + "Data source: {}", + if args.input.is_some() { + "parquet" + } else { + "generated" + } + ); + println!( + "Schema: {} columns ({} fields)", + schema.fields().len(), + describe_schema(&schema) + ); + println!("Total rows: {}", format_number(total_rows)); + println!("Total size: {}", format_bytes(total_bytes)); + println!("Batches: {}", batches.len()); + println!( + "Rows/batch: ~{}", + if batches.is_empty() { + 0 + } else { + total_rows / batches.len() + } + ); + println!("Load time: {:.3}s", load_elapsed.as_secs_f64()); + println!(); + + let codec = parse_codec(&args.codec, args.zstd_level); + let hash_col_indices = parse_hash_columns(&args.hash_columns); + + println!("Partitioning: {}", args.partitioning); + println!("Partitions: {}", args.partitions); + println!("Codec: {:?}", codec); + println!("Hash columns: {:?}", hash_col_indices); + if let Some(mem_limit) = args.memory_limit { + println!("Memory limit: {}", format_bytes(mem_limit)); + } + println!( + "Iterations: {} (warmup: {})", + args.iterations, args.warmup + ); + println!(); + + // Run warmup + timed iterations + let total_iters = args.warmup + args.iterations; + let mut write_times = Vec::with_capacity(args.iterations); + let mut read_times = Vec::with_capacity(args.iterations); + let mut data_file_sizes = Vec::with_capacity(args.iterations); + + for i in 0..total_iters { + let is_warmup = i < args.warmup; + let label = if is_warmup { + format!("warmup {}/{}", i + 1, args.warmup) + } else { + format!("iter {}/{}", i - args.warmup + 1, args.iterations) + }; + + // Write phase + let write_elapsed = run_shuffle_write( + &batches, + &schema, + &codec, + &hash_col_indices, + &args, + data_file.to_str().unwrap(), + 
index_file.to_str().unwrap(), + ); + let data_size = fs::metadata(&data_file).map(|m| m.len()).unwrap_or(0); + + if !is_warmup { + write_times.push(write_elapsed); + data_file_sizes.push(data_size); + } + + print!(" [{label}] write: {:.3}s", write_elapsed); + print!(" output: {}", format_bytes(data_size as usize)); + + // Read phase + if args.read_back { + let read_elapsed = run_shuffle_read( + data_file.to_str().unwrap(), + index_file.to_str().unwrap(), + args.partitions, + ); + if !is_warmup { + read_times.push(read_elapsed); + } + print!(" read: {:.3}s", read_elapsed); + } + println!(); + } + + // Print summary + if args.iterations > 0 { + println!(); + println!("=== Results ==="); + + let avg_write = write_times.iter().sum::<f64>() / write_times.len() as f64; + let avg_data_size = data_file_sizes.iter().sum::<u64>() / data_file_sizes.len() as u64; + let write_throughput_rows = total_rows as f64 / avg_write; + let write_throughput_bytes = total_bytes as f64 / avg_write; + let compression_ratio = if avg_data_size > 0 { + total_bytes as f64 / avg_data_size as f64 + } else { + 0.0 + }; + + println!("Write:"); + println!(" avg time: {:.3}s", avg_write); + if write_times.len() > 1 { + let min = write_times.iter().cloned().fold(f64::INFINITY, f64::min); + let max = write_times + .iter() + .cloned() + .fold(f64::NEG_INFINITY, f64::max); + println!(" min/max: {:.3}s / {:.3}s", min, max); + } + println!( + " throughput: {}/s ({} rows/s)", + format_bytes(write_throughput_bytes as usize), + format_number(write_throughput_rows as usize) + ); + println!( + " output size: {}", + format_bytes(avg_data_size as usize) + ); + println!(" compression: {:.2}x", compression_ratio); + + if !read_times.is_empty() { + let avg_read = read_times.iter().sum::<f64>() / read_times.len() as f64; + let read_throughput_bytes = avg_data_size as f64 / avg_read; + + println!("Read:"); + println!(" avg time: {:.3}s", avg_read); + if read_times.len() > 1 { + let min = read_times.iter().cloned().fold(f64::INFINITY, 
f64::min); + let max = read_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + println!(" min/max: {:.3}s / {:.3}s", min, max); + } + println!( + " throughput: {}/s (from compressed)", + format_bytes(read_throughput_bytes as usize) + ); + } + } + + // Cleanup + let _ = fs::remove_file(&data_file); + let _ = fs::remove_file(&index_file); +} + +fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec<RecordBatch> { + let mut batches = Vec::new(); + + let paths = if path.is_dir() { + let mut files: Vec<PathBuf> = fs::read_dir(path) + .expect("Failed to read input directory") + .filter_map(|entry| { + let entry = entry.ok()?; + let p = entry.path(); + if p.extension().and_then(|e| e.to_str()) == Some("parquet") { + Some(p) + } else { + None + } + }) + .collect(); + files.sort(); + if files.is_empty() { + panic!("No .parquet files found in {}", path.display()); + } + files + } else { + vec![path.clone()] + }; + + for file_path in &paths { + let file = fs::File::open(file_path) + .unwrap_or_else(|e| panic!("Failed to open {}: {}", file_path.display(), e)); + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap_or_else(|e| { + panic!( + "Failed to read Parquet metadata from {}: {}", + file_path.display(), + e + ) + }); + let reader = builder + .with_batch_size(batch_size) + .build() + .unwrap_or_else(|e| { + panic!( + "Failed to build Parquet reader for {}: {}", + file_path.display(), + e + ) + }); + for batch_result in reader { + let batch = batch_result.unwrap_or_else(|e| { + panic!("Failed to read batch from {}: {}", file_path.display(), e) + }); + if batch.num_rows() > 0 { + batches.push(batch); + } + } + } + + if batches.is_empty() { + panic!("No data read from input"); + } + + println!( + "Loaded {} batches from {} file(s)", + batches.len(), + paths.len() + ); + batches +} + +fn generate_data(args: &Args) -> Vec<RecordBatch> { + let mut fields = Vec::new(); + let mut col_idx = 0; + + // Int64 columns + for _ in 0..args.gen_int_cols { + fields.push(Field::new( + 
format!("int_col_{col_idx}"), + DataType::Int64, + true, + )); + col_idx += 1; + } + // String columns + for _ in 0..args.gen_string_cols { + fields.push(Field::new( + format!("str_col_{col_idx}"), + DataType::Utf8, + true, + )); + col_idx += 1; + } + // Decimal columns + for _ in 0..args.gen_decimal_cols { + fields.push(Field::new( + format!("dec_col_{col_idx}"), + DataType::Decimal128(18, 2), + true, + )); + col_idx += 1; + } + // Date columns + for _ in 0..args.gen_date_cols { + fields.push(Field::new( + format!("date_col_{col_idx}"), + DataType::Date32, + true, + )); + col_idx += 1; + } + + let schema = Arc::new(Schema::new(fields)); + let mut batches = Vec::new(); + let mut rng = rand::rng(); + let mut remaining = args.gen_rows; + + while remaining > 0 { + let batch_rows = remaining.min(args.batch_size); + remaining -= batch_rows; + + let mut columns: Vec<Arc<dyn arrow::array::Array>> = Vec::new(); + + // Int64 columns + for _ in 0..args.gen_int_cols { + let mut builder = Int64Builder::with_capacity(batch_rows); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + builder.append_value(rng.random_range(0..1_000_000i64)); + } + } + columns.push(Arc::new(builder.finish())); + } + // String columns + for _ in 0..args.gen_string_cols { + let mut builder = + StringBuilder::with_capacity(batch_rows, batch_rows * args.gen_avg_string_len); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + let len = rng.random_range(1..args.gen_avg_string_len * 2); + let s: String = (0..len) + .map(|_| rng.random_range(b'a'..=b'z') as char) + .collect(); + builder.append_value(&s); + } + } + columns.push(Arc::new(builder.finish())); + } + // Decimal columns + for _ in 0..args.gen_decimal_cols { + let mut builder = Decimal128Builder::with_capacity(batch_rows) + .with_precision_and_scale(18, 2) + .unwrap(); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + 
builder.append_value(rng.random_range(0..100_000_000i128)); + } + } + columns.push(Arc::new(builder.finish())); + } + // Date columns + for _ in 0..args.gen_date_cols { + let mut builder = Date32Builder::with_capacity(batch_rows); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + builder.append_value(rng.random_range(0..20000i32)); + } + } + columns.push(Arc::new(builder.finish())); + } + + let batch = RecordBatch::try_new(Arc::clone(&schema), columns).unwrap(); + batches.push(batch); + } + + println!( + "Generated {} batches ({} rows)", + batches.len(), + args.gen_rows + ); + batches +} + +fn run_shuffle_write( + batches: &[RecordBatch], + schema: &SchemaRef, + codec: &CompressionCodec, + hash_col_indices: &[usize], + args: &Args, + data_file: &str, + index_file: &str, +) -> f64 { + let partitioning = build_partitioning( + &args.partitioning, + args.partitions, + hash_col_indices, + schema, + ); + + let partitions = &[batches.to_vec()]; + let exec = ShuffleWriterExec::try_new( + Arc::new(DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(partitions, Arc::clone(schema), None).unwrap(), + ))), + partitioning, + codec.clone(), + data_file.to_string(), + index_file.to_string(), + false, + args.write_buffer_size, + ) + .expect("Failed to create ShuffleWriterExec"); + + let config = SessionConfig::new().with_batch_size(args.batch_size); + let mut runtime_builder = RuntimeEnvBuilder::new(); + if let Some(mem_limit) = args.memory_limit { + runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); + } + let runtime_env = Arc::new(runtime_builder.build().unwrap()); + let ctx = SessionContext::new_with_config_rt(config, runtime_env); + let task_ctx = ctx.task_ctx(); + + let start = Instant::now(); + let stream = exec.execute(0, task_ctx).unwrap(); + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(collect(stream)).unwrap(); + start.elapsed().as_secs_f64() +} + +fn 
run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> f64 { + let start = Instant::now(); + + // Read index file to get partition offsets + let index_bytes = fs::read(index_file).expect("Failed to read index file"); + let num_offsets = index_bytes.len() / 8; + let offsets: Vec<i64> = (0..num_offsets) + .map(|i| { + let bytes: [u8; 8] = index_bytes[i * 8..(i + 1) * 8].try_into().unwrap(); + i64::from_le_bytes(bytes) + }) + .collect(); + + // Read data file + let data_bytes = fs::read(data_file).expect("Failed to read data file"); + + let mut total_rows = 0usize; + let mut total_batches = 0usize; + + // Decode each partition's data + for p in 0..num_partitions.min(offsets.len().saturating_sub(1)) { + let start_offset = offsets[p] as usize; + let end_offset = offsets[p + 1] as usize; + + if start_offset >= end_offset { + continue; // Empty partition + } + + // Read all IPC blocks within this partition + let mut offset = start_offset; + while offset < end_offset { + // First 8 bytes: IPC length + let ipc_length = + u64::from_le_bytes(data_bytes[offset..offset + 8].try_into().unwrap()) as usize; + + // Skip 8-byte length prefix, then 8 bytes of field_count + codec header + let block_data = &data_bytes[offset + 16..offset + 8 + ipc_length]; + let batch = read_ipc_compressed(block_data).expect("Failed to decode shuffle block"); + total_rows += batch.num_rows(); + total_batches += 1; + + offset += 8 + ipc_length; + } + } + + let elapsed = start.elapsed().as_secs_f64(); + eprintln!( + " read back {} rows in {} batches from {} partitions", + format_number(total_rows), + total_batches, + num_partitions + ); + elapsed +} + +fn build_partitioning( + scheme: &str, + num_partitions: usize, + hash_col_indices: &[usize], + schema: &SchemaRef, +) -> CometPartitioning { + match scheme { + "single" => CometPartitioning::SinglePartition, + "round-robin" => CometPartitioning::RoundRobin(num_partitions, 0), + "hash" => { + let exprs: Vec<Arc<dyn datafusion::physical_expr::PhysicalExpr>> = hash_col_indices + .iter() + 
.map(|&idx| { + let field = schema.field(idx); + Arc::new(Column::new(field.name(), idx)) + as Arc<dyn datafusion::physical_expr::PhysicalExpr> + }) + .collect(); + CometPartitioning::Hash(exprs, num_partitions) + } + other => { + eprintln!("Unknown partitioning scheme: {other}. Using hash."); + build_partitioning("hash", num_partitions, hash_col_indices, schema) + } + } +} + +fn parse_codec(codec: &str, zstd_level: i32) -> CompressionCodec { + match codec.to_lowercase().as_str() { + "none" => CompressionCodec::None, + "lz4" => CompressionCodec::Lz4Frame, + "zstd" => CompressionCodec::Zstd(zstd_level), + "snappy" => CompressionCodec::Snappy, + other => { + eprintln!("Unknown codec: {other}. Using zstd."); + CompressionCodec::Zstd(zstd_level) + } + } +} + +fn parse_hash_columns(s: &str) -> Vec<usize> { + s.split(',') + .filter(|s| !s.is_empty()) + .map(|s| s.trim().parse::<usize>().expect("Invalid column index")) + .collect() +} + +fn describe_schema(schema: &Schema) -> String { + let mut counts = std::collections::HashMap::new(); + for field in schema.fields() { + let type_name = match field.data_type() { + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 => "int", + DataType::Float16 | DataType::Float32 | DataType::Float64 => "float", + DataType::Utf8 | DataType::LargeUtf8 => "string", + DataType::Boolean => "bool", + DataType::Date32 | DataType::Date64 => "date", + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => "decimal", + DataType::Timestamp(_, _) => "timestamp", + DataType::Binary | DataType::LargeBinary | DataType::FixedSizeBinary(_) => "binary", + _ => "other", + }; + *counts.entry(type_name).or_insert(0) += 1; + } + let mut parts: Vec<String> = counts + .into_iter() + .map(|(k, v)| format!("{}x{}", v, k)) + .collect(); + parts.sort(); + parts.join(", ") +} + +fn format_number(n: usize) -> String { + let s = n.to_string(); + let mut result = String::new(); + for (i, c) in s.chars().rev().enumerate() { + 
if i > 0 && i % 3 == 0 { + result.push(','); + } + result.push(c); + } + result.chars().rev().collect() +} + +fn format_bytes(bytes: usize) -> String { + if bytes >= 1024 * 1024 * 1024 { + format!("{:.2} GiB", bytes as f64 / (1024.0 * 1024.0 * 1024.0)) + } else if bytes >= 1024 * 1024 { + format!("{:.2} MiB", bytes as f64 / (1024.0 * 1024.0)) + } else if bytes >= 1024 { + format!("{:.2} KiB", bytes as f64 / 1024.0) + } else { + format!("{bytes} B") + } +} From 9b5b305cb125a76e22f55ac61ee4994ff9e9f484 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 21 Mar 2026 09:05:16 -0600 Subject: [PATCH 02/18] feat: add --limit option to shuffle benchmark (default 1M rows) --- native/core/src/bin/shuffle_bench.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/native/core/src/bin/shuffle_bench.rs b/native/core/src/bin/shuffle_bench.rs index c1498161f7..9b963c5803 100644 --- a/native/core/src/bin/shuffle_bench.rs +++ b/native/core/src/bin/shuffle_bench.rs @@ -152,6 +152,10 @@ struct Args { /// Write buffer size in bytes #[arg(long, default_value_t = 1048576)] write_buffer_size: usize, + + /// Maximum number of rows to use (default: 1,000,000) + #[arg(long, default_value_t = 1_000_000)] + limit: usize, } fn main() { @@ -178,6 +182,26 @@ fn main() { }; let load_elapsed = load_start.elapsed(); + // Apply row limit + let batches = { + let mut limited = Vec::new(); + let mut rows_so_far = 0usize; + for batch in batches { + if rows_so_far >= args.limit { + break; + } + let remaining = args.limit - rows_so_far; + if batch.num_rows() <= remaining { + rows_so_far += batch.num_rows(); + limited.push(batch); + } else { + limited.push(batch.slice(0, remaining)); + rows_so_far += remaining; + } + } + limited + }; + let schema = batches[0].schema(); let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); From e1ab490c57b04387e4885284811ccf2ad993f4a6 Mon Sep 17 00:00:00 
2001 From: Andy Grove Date: Sat, 21 Mar 2026 09:19:01 -0600 Subject: [PATCH 03/18] perf: apply limit during parquet read to avoid scanning all files --- native/core/src/bin/shuffle_bench.rs | 43 ++++++++++++---------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/native/core/src/bin/shuffle_bench.rs b/native/core/src/bin/shuffle_bench.rs index 9b963c5803..17b1a9a6ff 100644 --- a/native/core/src/bin/shuffle_bench.rs +++ b/native/core/src/bin/shuffle_bench.rs @@ -176,32 +176,12 @@ fn main() { // Load data let load_start = Instant::now(); let batches = if let Some(ref input_path) = args.input { - load_parquet(input_path, args.batch_size) + load_parquet(input_path, args.batch_size, args.limit) } else { generate_data(&args) }; let load_elapsed = load_start.elapsed(); - // Apply row limit - let batches = { - let mut limited = Vec::new(); - let mut rows_so_far = 0usize; - for batch in batches { - if rows_so_far >= args.limit { - break; - } - let remaining = args.limit - rows_so_far; - if batch.num_rows() <= remaining { - rows_so_far += batch.num_rows(); - limited.push(batch); - } else { - limited.push(batch.slice(0, remaining)); - rows_so_far += remaining; - } - } - limited - }; - let schema = batches[0].schema(); let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); @@ -358,8 +338,9 @@ fn main() { let _ = fs::remove_file(&index_file); } -fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { +fn load_parquet(path: &PathBuf, batch_size: usize, limit: usize) -> Vec { let mut batches = Vec::new(); + let mut total_rows = 0usize; let paths = if path.is_dir() { let mut files: Vec = fs::read_dir(path) @@ -383,7 +364,7 @@ fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { vec![path.clone()] }; - for file_path in &paths { + 'outer: for file_path in &paths { let file = fs::File::open(file_path) .unwrap_or_else(|e| panic!("Failed to open {}: {}", 
file_path.display(), e)); let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap_or_else(|e| { @@ -407,8 +388,19 @@ fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { let batch = batch_result.unwrap_or_else(|e| { panic!("Failed to read batch from {}: {}", file_path.display(), e) }); - if batch.num_rows() > 0 { + if batch.num_rows() == 0 { + continue; + } + let remaining = limit - total_rows; + if batch.num_rows() <= remaining { + total_rows += batch.num_rows(); batches.push(batch); + } else { + batches.push(batch.slice(0, remaining)); + total_rows += remaining; + } + if total_rows >= limit { + break 'outer; } } } @@ -418,8 +410,9 @@ fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { } println!( - "Loaded {} batches from {} file(s)", + "Loaded {} batches ({} rows) from {} file(s)", batches.len(), + format_number(total_rows), paths.len() ); batches From ca36cbd579d5c512e9617eb54cd052ec0d2c244d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 23 Mar 2026 06:39:47 -0700 Subject: [PATCH 04/18] chore: add comment explaining parquet/rand deps in shuffle crate --- native/shuffle/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/native/shuffle/Cargo.toml b/native/shuffle/Cargo.toml index 1a64a36ff2..1337ea3b15 100644 --- a/native/shuffle/Cargo.toml +++ b/native/shuffle/Cargo.toml @@ -43,6 +43,7 @@ itertools = "0.14.0" jni = "0.21" log = "0.4" lz4_flex = { version = "0.13.0", default-features = false, features = ["frame"] } +# parquet and rand are only used by the shuffle_bench binary (shuffle-bench feature) parquet = { workspace = true, optional = true } rand = { workspace = true, optional = true } simd-adler32 = "0.3.7" From 6e8bed28902e7256e1e1b3e914439e0a5abf3622 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 26 Mar 2026 07:13:41 -0700 Subject: [PATCH 05/18] perf: add max_buffered_batches config and stream shuffle bench from parquet - Add `spark.comet.exec.shuffle.maxBufferedBatches` config to limit the number of 
batches buffered before spilling, allowing earlier spilling to reduce peak memory usage on executors - Fix too-many-open-files: close spill file FD after each spill and reopen in append mode, rather than holding one FD open per partition - Refactor shuffle_bench to stream directly from Parquet instead of loading all input data into memory; remove synthetic data generation - Add --max-buffered-batches CLI arg to shuffle_bench - Add shuffle benchmark documentation to README --- .../scala/org/apache/comet/CometConf.scala | 12 + native/core/src/execution/planner.rs | 2 + native/proto/src/proto/operator.proto | 3 + native/shuffle/README.md | 43 ++ native/shuffle/benches/shuffle_writer.rs | 1 + native/shuffle/src/bin/shuffle_bench.rs | 432 +++++------------- .../src/partitioners/multi_partition.rs | 11 + native/shuffle/src/shuffle_writer.rs | 11 + .../shuffle/src/writers/partition_writer.rs | 29 +- .../shuffle/CometNativeShuffleWriter.scala | 2 + 10 files changed, 209 insertions(+), 337 deletions(-) diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index bfe90181ff..2ad9c1f609 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -534,6 +534,18 @@ object CometConf extends ShimCometConf { .checkValue(v => v > 0, "Write buffer size must be positive") .createWithDefault(1) + val COMET_SHUFFLE_MAX_BUFFERED_BATCHES: ConfigEntry[Int] = + conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.maxBufferedBatches") + .category(CATEGORY_SHUFFLE) + .doc("Maximum number of batches to buffer in memory before spilling to disk during " + + "native shuffle. Setting this to a small value causes earlier spilling, which reduces " + + "peak memory usage on executors at the cost of more disk I/O. 
" + + "The default value of 0 disables this limit and spills only when the memory pool is " + + "exhausted.") + .intConf + .checkValue(v => v >= 0, "Max buffered batches must be non-negative") + .createWithDefault(0) + val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( "spark.comet.shuffle.preferDictionary.ratio") .category(CATEGORY_SHUFFLE) diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 5af31fcc22..b6e05fccd1 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1352,6 +1352,7 @@ impl PhysicalPlanner { }?; let write_buffer_size = writer.write_buffer_size as usize; + let max_buffered_batches = writer.max_buffered_batches as usize; let shuffle_writer = Arc::new(ShuffleWriterExec::try_new( Arc::clone(&child.native_plan), partitioning, @@ -1360,6 +1361,7 @@ impl PhysicalPlanner { writer.output_index_file.clone(), writer.tracing_enabled, write_buffer_size, + max_buffered_batches, )?); Ok(( diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 344b9f0f21..5e23aad061 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -294,6 +294,9 @@ message ShuffleWriter { // Size of the write buffer in bytes used when writing shuffle data to disk. // Larger values may improve write performance but use more memory. int32 write_buffer_size = 8; + // Maximum number of batches to buffer before spilling to disk. + // 0 means no limit (spill only when memory pool is exhausted). + int32 max_buffered_batches = 9; } message ParquetWriter { diff --git a/native/shuffle/README.md b/native/shuffle/README.md index 8fba6b0323..0333ddbe8e 100644 --- a/native/shuffle/README.md +++ b/native/shuffle/README.md @@ -23,3 +23,46 @@ This crate provides the shuffle writer and reader implementation for Apache Data of the [Apache DataFusion Comet] subproject. 
[Apache DataFusion Comet]: https://github.com/apache/datafusion-comet/ + +## Shuffle Benchmark Tool + +A standalone benchmark binary (`shuffle_bench`) is included for profiling shuffle write and read +performance outside of Spark. It streams input data directly from Parquet files. + +### Basic usage + +```sh +cargo run --release --features shuffle-bench --bin shuffle_bench -- \ + --input /data/tpch-sf100/lineitem/ \ + --partitions 200 \ + --codec zstd --zstd-level 1 \ + --hash-columns 0,3 +``` + +### Options + +| Option | Default | Description | +|---|---|---| +| `--input` | *(required)* | Path to a Parquet file or directory of Parquet files | +| `--partitions` | `200` | Number of output shuffle partitions | +| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | +| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. `0,3`) | +| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | +| `--zstd-level` | `1` | Zstd compression level (1–22) | +| `--batch-size` | `8192` | Batch size for reading Parquet data | +| `--memory-limit` | *(none)* | Memory limit in bytes; triggers spilling when exceeded | +| `--max-buffered-batches` | `0` | Max batches to buffer before spilling (0 = memory-pool-only) | +| `--write-buffer-size` | `1048576` | Write buffer size in bytes | +| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | +| `--iterations` | `1` | Number of timed iterations | +| `--warmup` | `0` | Number of warmup iterations before timing | +| `--read-back` | `false` | Also benchmark reading back the shuffle output | +| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | + +### Profiling with flamegraph + +```sh +cargo flamegraph --release --features shuffle-bench --bin shuffle_bench -- \ + --input /data/tpch-sf100/lineitem/ \ + --partitions 200 --codec zstd --zstd-level 1 +``` diff --git a/native/shuffle/benches/shuffle_writer.rs 
b/native/shuffle/benches/shuffle_writer.rs index 27abd919fa..8ff1f024d5 100644 --- a/native/shuffle/benches/shuffle_writer.rs +++ b/native/shuffle/benches/shuffle_writer.rs @@ -153,6 +153,7 @@ fn create_shuffle_writer_exec( "/tmp/index.out".to_string(), false, 1024 * 1024, + 0, // max_buffered_batches: no limit ) .unwrap() } diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 373d0f92b9..0f04954ffa 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -16,11 +16,10 @@ // under the License. //! Standalone shuffle benchmark tool for profiling Comet shuffle write and read -//! outside of Spark. +//! outside of Spark. Streams input directly from Parquet files. //! //! # Usage //! -//! Read from Parquet files (e.g. TPC-H lineitem): //! ```sh //! cargo run --release --bin shuffle_bench -- \ //! --input /data/tpch-sf100/lineitem/ \ @@ -30,14 +29,6 @@ //! --read-back //! ``` //! -//! Generate synthetic data: -//! ```sh -//! cargo run --release --bin shuffle_bench -- \ -//! --generate --gen-rows 10000000 --gen-string-cols 4 --gen-int-cols 4 \ -//! --gen-decimal-cols 2 --gen-avg-string-len 32 \ -//! --partitions 200 --codec lz4 --read-back -//! ``` -//! //! Profile with flamegraph: //! ```sh //! cargo flamegraph --release --bin shuffle_bench -- \ @@ -45,23 +36,19 @@ //! --partitions 200 --codec zstd --zstd-level 1 //! 
``` -use arrow::array::builder::{Date32Builder, Decimal128Builder, Int64Builder, StringBuilder}; -use arrow::array::RecordBatch; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::datatypes::{DataType, SchemaRef}; use clap::Parser; -use datafusion::datasource::memory::MemorySourceConfig; -use datafusion::datasource::source::DataSourceExec; use datafusion::execution::config::SessionConfig; use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::physical_expr::expressions::Column; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::common::collect; use datafusion::physical_plan::ExecutionPlan; -use datafusion::prelude::SessionContext; +use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_comet_shuffle::{ read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleWriterExec, }; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; -use rand::RngExt; use std::fs; use std::path::PathBuf; use std::sync::Arc; @@ -75,37 +62,9 @@ use std::time::Instant; struct Args { /// Path to input Parquet file or directory of Parquet files #[arg(long)] - input: Option, - - /// Generate synthetic data instead of reading from Parquet - #[arg(long, default_value_t = false)] - generate: bool, - - /// Number of rows to generate (requires --generate) - #[arg(long, default_value_t = 1_000_000)] - gen_rows: usize, - - /// Number of Int64 columns to generate - #[arg(long, default_value_t = 4)] - gen_int_cols: usize, + input: PathBuf, - /// Number of Utf8 string columns to generate - #[arg(long, default_value_t = 2)] - gen_string_cols: usize, - - /// Number of Decimal128 columns to generate - #[arg(long, default_value_t = 2)] - gen_decimal_cols: usize, - - /// Number of Date32 columns to generate - #[arg(long, default_value_t = 1)] - gen_date_cols: usize, - - /// Average string length for generated string columns - #[arg(long, default_value_t = 24)] - 
gen_avg_string_len: usize, - - /// Batch size for reading Parquet or generating data + /// Batch size for reading Parquet data #[arg(long, default_value_t = 8192)] batch_size: usize, @@ -153,70 +112,37 @@ struct Args { #[arg(long, default_value_t = 1048576)] write_buffer_size: usize, - /// Maximum number of rows to use (default: 1,000,000) - #[arg(long, default_value_t = 1_000_000)] + /// Maximum number of batches to buffer before spilling (0 = no limit) + #[arg(long, default_value_t = 0)] + max_buffered_batches: usize, + + /// Limit rows processed per iteration (0 = no limit) + #[arg(long, default_value_t = 0)] limit: usize, } fn main() { let args = Args::parse(); - // Validate args - if args.input.is_none() && !args.generate { - eprintln!("Error: must specify either --input or --generate"); - std::process::exit(1); - } - // Create output directory fs::create_dir_all(&args.output_dir).expect("Failed to create output directory"); - let data_file = args.output_dir.join("data.out"); let index_file = args.output_dir.join("index.out"); - // Load data - let load_start = Instant::now(); - let batches = if let Some(ref input_path) = args.input { - load_parquet(input_path, args.batch_size, args.limit) - } else { - generate_data(&args) - }; - let load_elapsed = load_start.elapsed(); + let (schema, total_rows) = read_parquet_metadata(&args.input, args.limit); - let schema = batches[0].schema(); - let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); - let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); + let codec = parse_codec(&args.codec, args.zstd_level); + let hash_col_indices = parse_hash_columns(&args.hash_columns); println!("=== Shuffle Benchmark ==="); + println!("Input: {}", args.input.display()); println!( - "Data source: {}", - if args.input.is_some() { - "parquet" - } else { - "generated" - } - ); - println!( - "Schema: {} columns ({} fields)", + "Schema: {} columns ({})", schema.fields().len(), describe_schema(&schema) 
); - println!("Total rows: {}", format_number(total_rows)); - println!("Total size: {}", format_bytes(total_bytes)); - println!("Batches: {}", batches.len()); - println!( - "Rows/batch: ~{}", - if batches.is_empty() { - 0 - } else { - total_rows / batches.len() - } - ); - println!("Load time: {:.3}s", load_elapsed.as_secs_f64()); - println!(); - - let codec = parse_codec(&args.codec, args.zstd_level); - let hash_col_indices = parse_hash_columns(&args.hash_columns); - + println!("Total rows: {}", format_number(total_rows as usize)); + println!("Batch size: {}", format_number(args.batch_size)); println!("Partitioning: {}", args.partitioning); println!("Partitions: {}", args.partitions); println!("Codec: {:?}", codec); @@ -224,13 +150,15 @@ fn main() { if let Some(mem_limit) = args.memory_limit { println!("Memory limit: {}", format_bytes(mem_limit)); } + if args.max_buffered_batches > 0 { + println!("Max buf batches:{}", args.max_buffered_batches); + } println!( "Iterations: {} (warmup: {})", args.iterations, args.warmup ); println!(); - // Run warmup + timed iterations let total_iters = args.warmup + args.iterations; let mut write_times = Vec::with_capacity(args.iterations); let mut read_times = Vec::with_capacity(args.iterations); @@ -244,9 +172,8 @@ fn main() { format!("iter {}/{}", i - args.warmup + 1, args.iterations) }; - // Write phase let write_elapsed = run_shuffle_write( - &batches, + &args.input, &schema, &codec, &hash_col_indices, @@ -264,7 +191,6 @@ fn main() { print!(" [{label}] write: {:.3}s", write_elapsed); print!(" output: {}", format_bytes(data_size as usize)); - // Read phase if args.read_back { let read_elapsed = run_shuffle_read( data_file.to_str().unwrap(), @@ -279,7 +205,6 @@ fn main() { println!(); } - // Print summary if args.iterations > 0 { println!(); println!("=== Results ==="); @@ -287,12 +212,6 @@ fn main() { let avg_write = write_times.iter().sum::() / write_times.len() as f64; let avg_data_size = data_file_sizes.iter().sum::() / 
data_file_sizes.len() as u64; let write_throughput_rows = total_rows as f64 / avg_write; - let write_throughput_bytes = total_bytes as f64 / avg_write; - let compression_ratio = if avg_data_size > 0 { - total_bytes as f64 / avg_data_size as f64 - } else { - 0.0 - }; println!("Write:"); println!(" avg time: {:.3}s", avg_write); @@ -305,15 +224,13 @@ fn main() { println!(" min/max: {:.3}s / {:.3}s", min, max); } println!( - " throughput: {}/s ({} rows/s)", - format_bytes(write_throughput_bytes as usize), + " throughput: {} rows/s", format_number(write_throughput_rows as usize) ); println!( " output size: {}", format_bytes(avg_data_size as usize) ); - println!(" compression: {:.2}x", compression_ratio); if !read_times.is_empty() { let avg_read = read_times.iter().sum::() / read_times.len() as f64; @@ -333,38 +250,17 @@ fn main() { } } - // Cleanup let _ = fs::remove_file(&data_file); let _ = fs::remove_file(&index_file); } -fn load_parquet(path: &PathBuf, batch_size: usize, limit: usize) -> Vec { - let mut batches = Vec::new(); - let mut total_rows = 0usize; +/// Read schema and total row count from Parquet metadata without loading any data. 
+fn read_parquet_metadata(path: &PathBuf, limit: usize) -> (SchemaRef, u64) { + let paths = collect_parquet_paths(path); + let mut schema = None; + let mut total_rows = 0u64; - let paths = if path.is_dir() { - let mut files: Vec = fs::read_dir(path) - .expect("Failed to read input directory") - .filter_map(|entry| { - let entry = entry.ok()?; - let p = entry.path(); - if p.extension().and_then(|e| e.to_str()) == Some("parquet") { - Some(p) - } else { - None - } - }) - .collect(); - files.sort(); - if files.is_empty() { - panic!("No .parquet files found in {}", path.display()); - } - files - } else { - vec![path.clone()] - }; - - 'outer: for file_path in &paths { + for file_path in &paths { let file = fs::File::open(file_path) .unwrap_or_else(|e| panic!("Failed to open {}: {}", file_path.display(), e)); let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap_or_else(|e| { @@ -374,172 +270,44 @@ fn load_parquet(path: &PathBuf, batch_size: usize, limit: usize) -> Vec= limit { - break 'outer; - } + if schema.is_none() { + schema = Some(Arc::clone(builder.schema())); + } + total_rows += builder.metadata().file_metadata().num_rows() as u64; + if limit > 0 && total_rows >= limit as u64 { + total_rows = total_rows.min(limit as u64); + break; } } - if batches.is_empty() { - panic!("No data read from input"); - } - - println!( - "Loaded {} batches ({} rows) from {} file(s)", - batches.len(), - format_number(total_rows), - paths.len() - ); - batches + (schema.expect("No parquet files found"), total_rows) } -fn generate_data(args: &Args) -> Vec { - let mut fields = Vec::new(); - let mut col_idx = 0; - - // Int64 columns - for _ in 0..args.gen_int_cols { - fields.push(Field::new( - format!("int_col_{col_idx}"), - DataType::Int64, - true, - )); - col_idx += 1; - } - // String columns - for _ in 0..args.gen_string_cols { - fields.push(Field::new( - format!("str_col_{col_idx}"), - DataType::Utf8, - true, - )); - col_idx += 1; - } - // Decimal columns - for _ in 
0..args.gen_decimal_cols { - fields.push(Field::new( - format!("dec_col_{col_idx}"), - DataType::Decimal128(18, 2), - true, - )); - col_idx += 1; - } - // Date columns - for _ in 0..args.gen_date_cols { - fields.push(Field::new( - format!("date_col_{col_idx}"), - DataType::Date32, - true, - )); - col_idx += 1; - } - - let schema = Arc::new(Schema::new(fields)); - let mut batches = Vec::new(); - let mut rng = rand::rng(); - let mut remaining = args.gen_rows; - - while remaining > 0 { - let batch_rows = remaining.min(args.batch_size); - remaining -= batch_rows; - - let mut columns: Vec> = Vec::new(); - - // Int64 columns - for _ in 0..args.gen_int_cols { - let mut builder = Int64Builder::with_capacity(batch_rows); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - builder.append_null(); - } else { - builder.append_value(rng.random_range(0..1_000_000i64)); - } - } - columns.push(Arc::new(builder.finish())); - } - // String columns - for _ in 0..args.gen_string_cols { - let mut builder = - StringBuilder::with_capacity(batch_rows, batch_rows * args.gen_avg_string_len); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - builder.append_null(); - } else { - let len = rng.random_range(1..args.gen_avg_string_len * 2); - let s: String = (0..len) - .map(|_| rng.random_range(b'a'..=b'z') as char) - .collect(); - builder.append_value(&s); - } - } - columns.push(Arc::new(builder.finish())); - } - // Decimal columns - for _ in 0..args.gen_decimal_cols { - let mut builder = Decimal128Builder::with_capacity(batch_rows) - .with_precision_and_scale(18, 2) - .unwrap(); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - builder.append_null(); - } else { - builder.append_value(rng.random_range(0..100_000_000i128)); - } - } - columns.push(Arc::new(builder.finish())); - } - // Date columns - for _ in 0..args.gen_date_cols { - let mut builder = Date32Builder::with_capacity(batch_rows); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - 
builder.append_null(); +fn collect_parquet_paths(path: &PathBuf) -> Vec { + if path.is_dir() { + let mut files: Vec = fs::read_dir(path) + .unwrap_or_else(|e| panic!("Failed to read directory {}: {}", path.display(), e)) + .filter_map(|entry| { + let p = entry.ok()?.path(); + if p.extension().and_then(|e| e.to_str()) == Some("parquet") { + Some(p) } else { - builder.append_value(rng.random_range(0..20000i32)); + None } - } - columns.push(Arc::new(builder.finish())); + }) + .collect(); + files.sort(); + if files.is_empty() { + panic!("No .parquet files found in {}", path.display()); } - - let batch = RecordBatch::try_new(Arc::clone(&schema), columns).unwrap(); - batches.push(batch); + files + } else { + vec![path.clone()] } - - println!( - "Generated {} batches ({} rows)", - batches.len(), - args.gen_rows - ); - batches } fn run_shuffle_write( - batches: &[RecordBatch], + input_path: &PathBuf, schema: &SchemaRef, codec: &CompressionCodec, hash_col_indices: &[usize], @@ -554,40 +322,61 @@ fn run_shuffle_write( schema, ); - let partitions = &[batches.to_vec()]; - let exec = ShuffleWriterExec::try_new( - Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(partitions, Arc::clone(schema), None).unwrap(), - ))), - partitioning, - codec.clone(), - data_file.to_string(), - index_file.to_string(), - false, - args.write_buffer_size, - ) - .expect("Failed to create ShuffleWriterExec"); - - let config = SessionConfig::new().with_batch_size(args.batch_size); - let mut runtime_builder = RuntimeEnvBuilder::new(); - if let Some(mem_limit) = args.memory_limit { - runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); - } - let runtime_env = Arc::new(runtime_builder.build().unwrap()); - let ctx = SessionContext::new_with_config_rt(config, runtime_env); - let task_ctx = ctx.task_ctx(); - - let start = Instant::now(); - let stream = exec.execute(0, task_ctx).unwrap(); let rt = tokio::runtime::Runtime::new().unwrap(); - 
rt.block_on(collect(stream)).unwrap(); - start.elapsed().as_secs_f64() + rt.block_on(async { + let config = SessionConfig::new().with_batch_size(args.batch_size); + let mut runtime_builder = RuntimeEnvBuilder::new(); + if let Some(mem_limit) = args.memory_limit { + runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); + } + let runtime_env = Arc::new(runtime_builder.build().unwrap()); + let ctx = SessionContext::new_with_config_rt(config, runtime_env); + + let path_str = input_path.to_str().unwrap(); + let mut df = ctx + .read_parquet(path_str, ParquetReadOptions::default()) + .await + .expect("Failed to create Parquet scan"); + if args.limit > 0 { + df = df.limit(0, Some(args.limit)).unwrap(); + } + + let parquet_plan = df + .create_physical_plan() + .await + .expect("Failed to create physical plan"); + + // ShuffleWriterExec reads from a single input partition + let input: Arc = + if parquet_plan.properties().output_partitioning().partition_count() > 1 { + Arc::new(CoalescePartitionsExec::new(parquet_plan)) + } else { + parquet_plan + }; + + let exec = ShuffleWriterExec::try_new( + input, + partitioning, + codec.clone(), + data_file.to_string(), + index_file.to_string(), + false, + args.write_buffer_size, + args.max_buffered_batches, + ) + .expect("Failed to create ShuffleWriterExec"); + + let task_ctx = ctx.task_ctx(); + let start = Instant::now(); + let stream = exec.execute(0, task_ctx).unwrap(); + collect(stream).await.unwrap(); + start.elapsed().as_secs_f64() + }) } fn run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> f64 { let start = Instant::now(); - // Read index file to get partition offsets let index_bytes = fs::read(index_file).expect("Failed to read index file"); let num_offsets = index_bytes.len() / 8; let offsets: Vec = (0..num_offsets) @@ -597,34 +386,27 @@ fn run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> }) .collect(); - // Read data file let data_bytes = 
fs::read(data_file).expect("Failed to read data file"); let mut total_rows = 0usize; let mut total_batches = 0usize; - // Decode each partition's data for p in 0..num_partitions.min(offsets.len().saturating_sub(1)) { let start_offset = offsets[p] as usize; let end_offset = offsets[p + 1] as usize; if start_offset >= end_offset { - continue; // Empty partition + continue; } - // Read all IPC blocks within this partition let mut offset = start_offset; while offset < end_offset { - // First 8 bytes: IPC length let ipc_length = u64::from_le_bytes(data_bytes[offset..offset + 8].try_into().unwrap()) as usize; - - // Skip 8-byte length prefix, then 8 bytes of field_count + codec header let block_data = &data_bytes[offset + 16..offset + 8 + ipc_length]; let batch = read_ipc_compressed(block_data).expect("Failed to decode shuffle block"); total_rows += batch.num_rows(); total_batches += 1; - offset += 8 + ipc_length; } } @@ -686,7 +468,7 @@ fn parse_hash_columns(s: &str) -> Vec { .collect() } -fn describe_schema(schema: &Schema) -> String { +fn describe_schema(schema: &arrow::datatypes::Schema) -> String { let mut counts = std::collections::HashMap::new(); for field in schema.fields() { let type_name = match field.data_type() { diff --git a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 42290c5510..4cc09e9679 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -124,6 +124,8 @@ pub(crate) struct MultiPartitionShuffleRepartitioner { tracing_enabled: bool, /// Size of the write buffer in bytes write_buffer_size: usize, + /// Maximum number of batches to buffer before spilling (0 = no limit) + max_buffered_batches: usize, } impl MultiPartitionShuffleRepartitioner { @@ -140,6 +142,7 @@ impl MultiPartitionShuffleRepartitioner { codec: CompressionCodec, tracing_enabled: bool, write_buffer_size: usize, + max_buffered_batches: usize, ) -> 
datafusion::common::Result { let num_output_partitions = partitioning.partition_count(); assert_ne!( @@ -189,6 +192,7 @@ impl MultiPartitionShuffleRepartitioner { reservation, tracing_enabled, write_buffer_size, + max_buffered_batches, }) } @@ -397,6 +401,13 @@ impl MultiPartitionShuffleRepartitioner { partition_row_indices: &[u32], partition_starts: &[u32], ) -> datafusion::common::Result<()> { + // Spill before buffering if we've reached the configured batch count limit. + if self.max_buffered_batches > 0 + && self.buffered_batches.len() >= self.max_buffered_batches + { + self.spill()?; + } + let mut mem_growth: usize = input.get_array_memory_size(); let buffered_partition_idx = self.buffered_batches.len() as u32; self.buffered_batches.push(input); diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index e649aaac69..95a09610a4 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -68,6 +68,8 @@ pub struct ShuffleWriterExec { tracing_enabled: bool, /// Size of the write buffer in bytes write_buffer_size: usize, + /// Maximum number of batches to buffer before spilling (0 = no limit) + max_buffered_batches: usize, } impl ShuffleWriterExec { @@ -81,6 +83,7 @@ impl ShuffleWriterExec { output_index_file: String, tracing_enabled: bool, write_buffer_size: usize, + max_buffered_batches: usize, ) -> Result { let cache = PlanProperties::new( EquivalenceProperties::new(Arc::clone(&input.schema())), @@ -99,6 +102,7 @@ impl ShuffleWriterExec { codec, tracing_enabled, write_buffer_size, + max_buffered_batches, }) } } @@ -163,6 +167,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.output_index_file.clone(), self.tracing_enabled, self.write_buffer_size, + self.max_buffered_batches, )?)), _ => panic!("ShuffleWriterExec wrong number of children"), } @@ -190,6 +195,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.codec.clone(), self.tracing_enabled, self.write_buffer_size, + 
self.max_buffered_batches, ) .map_err(|e| ArrowError::ExternalError(Box::new(e))), ) @@ -210,6 +216,7 @@ async fn external_shuffle( codec: CompressionCodec, tracing_enabled: bool, write_buffer_size: usize, + max_buffered_batches: usize, ) -> Result { with_trace_async("external_shuffle", tracing_enabled, || async { let schema = input.schema(); @@ -238,6 +245,7 @@ async fn external_shuffle( codec, tracing_enabled, write_buffer_size, + max_buffered_batches, )?), }; @@ -362,6 +370,7 @@ mod test { CompressionCodec::Lz4Frame, false, 1024 * 1024, // write_buffer_size: 1MB default + 0, // max_buffered_batches: no limit ) .unwrap(); @@ -466,6 +475,7 @@ mod test { "/tmp/index.out".to_string(), false, 1024 * 1024, // write_buffer_size: 1MB default + 0, // max_buffered_batches: no limit ) .unwrap(); @@ -525,6 +535,7 @@ mod test { index_file.clone(), false, 1024 * 1024, + 0, // max_buffered_batches: no limit ) .unwrap(); diff --git a/native/shuffle/src/writers/partition_writer.rs b/native/shuffle/src/writers/partition_writer.rs index 48017871db..4de307de62 100644 --- a/native/shuffle/src/writers/partition_writer.rs +++ b/native/shuffle/src/writers/partition_writer.rs @@ -26,7 +26,6 @@ use std::fs::{File, OpenOptions}; struct SpillFile { temp_file: RefCountedTempFile, - file: File, } pub(crate) struct PartitionWriter { @@ -53,26 +52,28 @@ impl PartitionWriter { runtime: &RuntimeEnv, ) -> datafusion::common::Result<()> { if self.spill_file.is_none() { - // Spill file is not yet created, create it let spill_file = runtime .disk_manager .create_tmp_file("shuffle writer spill")?; - let spill_data = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(spill_file.path()) - .map_err(|e| { - DataFusionError::Execution(format!("Error occurred while spilling {e}")) - })?; + // Create the file (truncating any pre-existing content) + File::create(spill_file.path()).map_err(|e| { + DataFusionError::Execution(format!("Error occurred while spilling {e}")) + })?; 
self.spill_file = Some(SpillFile { temp_file: spill_file, - file: spill_data, }); } Ok(()) } + fn open_spill_file_for_append(&self) -> datafusion::common::Result { + OpenOptions::new() + .write(true) + .append(true) + .open(self.spill_file.as_ref().unwrap().temp_file.path()) + .map_err(|e| DataFusionError::Execution(format!("Error occurred while spilling {e}"))) + } + pub(crate) fn spill( &mut self, iter: &mut PartitionedBatchIterator, @@ -84,10 +85,13 @@ impl PartitionWriter { if let Some(batch) = iter.next() { self.ensure_spill_file_created(runtime)?; + // Open the file for this spill and close it when done, so we don't + // hold open one FD per partition across multiple spill events. + let mut spill_data = self.open_spill_file_for_append()?; let total_bytes_written = { let mut buf_batch_writer = BufBatchWriter::new( &mut self.shuffle_block_writer, - &mut self.spill_file.as_mut().unwrap().file, + &mut spill_data, write_buffer_size, batch_size, ); @@ -104,6 +108,7 @@ impl PartitionWriter { buf_batch_writer.flush(&metrics.encode_time, &metrics.write_time)?; bytes_written }; + // spill_data is dropped here, closing the file descriptor Ok(total_bytes_written) } else { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala index 3fc222bd19..a80d8b2fa4 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala @@ -192,6 +192,8 @@ class CometNativeShuffleWriter[K, V]( CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_ZSTD_LEVEL.get) shuffleWriterBuilder.setWriteBufferSize( CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().max(Int.MaxValue).toInt) + shuffleWriterBuilder.setMaxBufferedBatches( + CometConf.COMET_SHUFFLE_MAX_BUFFERED_BATCHES.get()) outputPartitioning match { case p 
if isSinglePartitioning(p) => From 2ef57e79d4e9202d4fb1a70f62f5d8b717ca67b4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Mar 2026 12:05:23 -0600 Subject: [PATCH 06/18] cargo fmt --- native/shuffle/src/bin/shuffle_bench.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 7f919e7674..ec1227a2ee 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -340,12 +340,16 @@ fn run_shuffle_write( .expect("Failed to create physical plan"); // ShuffleWriterExec reads from a single input partition - let input: Arc = - if parquet_plan.properties().output_partitioning().partition_count() > 1 { - Arc::new(CoalescePartitionsExec::new(parquet_plan)) - } else { - parquet_plan - }; + let input: Arc = if parquet_plan + .properties() + .output_partitioning() + .partition_count() + > 1 + { + Arc::new(CoalescePartitionsExec::new(parquet_plan)) + } else { + parquet_plan + }; let exec = ShuffleWriterExec::try_new( input, From 9136e104e87cae0c8920fcce790064de5716d425 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Mar 2026 12:08:20 -0600 Subject: [PATCH 07/18] prettier --- native/shuffle/README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/native/shuffle/README.md b/native/shuffle/README.md index 0333ddbe8e..74b8dbe656 100644 --- a/native/shuffle/README.md +++ b/native/shuffle/README.md @@ -41,23 +41,23 @@ cargo run --release --features shuffle-bench --bin shuffle_bench -- \ ### Options -| Option | Default | Description | -|---|---|---| -| `--input` | *(required)* | Path to a Parquet file or directory of Parquet files | -| `--partitions` | `200` | Number of output shuffle partitions | -| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | -| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. 
`0,3`) | -| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | -| `--zstd-level` | `1` | Zstd compression level (1–22) | -| `--batch-size` | `8192` | Batch size for reading Parquet data | -| `--memory-limit` | *(none)* | Memory limit in bytes; triggers spilling when exceeded | -| `--max-buffered-batches` | `0` | Max batches to buffer before spilling (0 = memory-pool-only) | -| `--write-buffer-size` | `1048576` | Write buffer size in bytes | -| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | -| `--iterations` | `1` | Number of timed iterations | -| `--warmup` | `0` | Number of warmup iterations before timing | -| `--read-back` | `false` | Also benchmark reading back the shuffle output | -| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | +| Option | Default | Description | +| ------------------------ | -------------------------- | ------------------------------------------------------------ | +| `--input` | _(required)_ | Path to a Parquet file or directory of Parquet files | +| `--partitions` | `200` | Number of output shuffle partitions | +| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | +| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. 
`0,3`) | +| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | +| `--zstd-level` | `1` | Zstd compression level (1–22) | +| `--batch-size` | `8192` | Batch size for reading Parquet data | +| `--memory-limit` | _(none)_ | Memory limit in bytes; triggers spilling when exceeded | +| `--max-buffered-batches` | `0` | Max batches to buffer before spilling (0 = memory-pool-only) | +| `--write-buffer-size` | `1048576` | Write buffer size in bytes | +| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | +| `--iterations` | `1` | Number of timed iterations | +| `--warmup` | `0` | Number of warmup iterations before timing | +| `--read-back` | `false` | Also benchmark reading back the shuffle output | +| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | ### Profiling with flamegraph From 7e16819013e744eecd548483fcf414798f071744 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Mar 2026 12:19:44 -0600 Subject: [PATCH 08/18] machete --- native/Cargo.lock | 1 - native/shuffle/Cargo.toml | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/native/Cargo.lock b/native/Cargo.lock index 88007af70a..2c117631a4 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1996,7 +1996,6 @@ dependencies = [ "log", "lz4_flex 0.13.0", "parquet", - "rand 0.10.0", "simd-adler32", "snap", "tempfile", diff --git a/native/shuffle/Cargo.toml b/native/shuffle/Cargo.toml index 14ed5f60a0..a5982c05fa 100644 --- a/native/shuffle/Cargo.toml +++ b/native/shuffle/Cargo.toml @@ -43,9 +43,8 @@ itertools = "0.14.0" jni = "0.21" log = "0.4" lz4_flex = { version = "0.13.0", default-features = false, features = ["frame"] } -# parquet and rand are only used by the shuffle_bench binary (shuffle-bench feature) +# parquet is only used by the shuffle_bench binary (shuffle-bench feature) parquet = { workspace = true, optional = true } -rand = { workspace = true, optional = true } simd-adler32 = "0.3.9" snap = 
"1.1" tokio = { version = "1", features = ["rt-multi-thread"] } @@ -58,7 +57,7 @@ itertools = "0.14.0" tempfile = "3.26.0" [features] -shuffle-bench = ["clap", "parquet", "rand"] +shuffle-bench = ["clap", "parquet"] [lib] name = "datafusion_comet_shuffle" From e7a3661a1107407c8fe3f3941b6539e48e82c67b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 12:32:12 -0600 Subject: [PATCH 09/18] feat: add immediate-mode shuffle partitioner Add ImmediateShufflePartitioner that repartitions each incoming batch using arrow take and writes per-partition data directly to individual temp files, avoiding in-memory buffering of input batches. At shuffle_write time, per-partition files are concatenated into the final shuffle data file and index. Extract ScratchSpace and partition ID computation (hash, range, round-robin) into shared scratch module, eliminating duplication between multi_partition and immediate_partition implementations. Configurable via spark.comet.exec.shuffle.nativeMode (default/immediate). 
--- .../scala/org/apache/comet/CometConf.scala | 14 + native/core/src/execution/planner.rs | 10 +- native/proto/src/proto/operator.proto | 8 + native/shuffle/benches/shuffle_writer.rs | 3 +- native/shuffle/src/lib.rs | 12 + .../src/partitioners/immediate_partition.rs | 278 ++++++++++++++++++ native/shuffle/src/partitioners/mod.rs | 3 + .../src/partitioners/multi_partition.rs | 272 ++--------------- native/shuffle/src/partitioners/scratch.rs | 183 ++++++++++++ native/shuffle/src/shuffle_writer.rs | 28 +- .../shuffle/CometNativeShuffleWriter.scala | 6 + 11 files changed, 557 insertions(+), 260 deletions(-) create mode 100644 native/shuffle/src/partitioners/immediate_partition.rs create mode 100644 native/shuffle/src/partitioners/scratch.rs diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index bfe90181ff..617f9131eb 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -534,6 +534,20 @@ object CometConf extends ShimCometConf { .checkValue(v => v > 0, "Write buffer size must be positive") .createWithDefault(1) + val COMET_NATIVE_SHUFFLE_MODE: ConfigEntry[String] = + conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.nativeMode") + .category(CATEGORY_SHUFFLE) + .doc( + "Selects which native shuffle implementation to use for multi-partition shuffles. " + + "'default' buffers input batches and tracks per-partition row indices, writing all " + + "partitions at the end with memory-pressure-driven spilling. " + + "'immediate' repartitions each incoming batch using take and writes per-partition " + + "data directly to individual files, avoiding in-memory buffering of input batches. 
" + + "The 'immediate' mode is experimental.") + .stringConf + .checkValues(Set("default", "immediate")) + .createWithDefault("default") + val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( "spark.comet.shuffle.preferDictionary.ratio") .category(CATEGORY_SHUFFLE) diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 5af31fcc22..7680fd37a1 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -29,7 +29,7 @@ use crate::execution::{ planner::expression_registry::ExpressionRegistry, planner::operator_registry::OperatorRegistry, serde::to_arrow_datatype, - shuffle::ShuffleWriterExec, + shuffle::{ShuffleMode, ShuffleWriterExec}, }; use arrow::compute::CastOptions; use arrow::datatypes::{DataType, Field, FieldRef, Schema, TimeUnit, DECIMAL128_MAX_PRECISION}; @@ -116,7 +116,8 @@ use datafusion_comet_proto::{ spark_operator::{ self, lower_window_frame_bound::LowerFrameBoundStruct, operator::OpStruct, upper_window_frame_bound::UpperFrameBoundStruct, BuildSide, - CompressionCodec as SparkCompressionCodec, JoinType, Operator, WindowFrameType, + CompressionCodec as SparkCompressionCodec, JoinType, Operator, + ShuffleMode as SparkShuffleMode, WindowFrameType, }, spark_partitioning::{partitioning::PartitioningStruct, Partitioning as SparkPartitioning}, }; @@ -1352,6 +1353,10 @@ impl PhysicalPlanner { }?; let write_buffer_size = writer.write_buffer_size as usize; + let shuffle_mode = match writer.shuffle_mode.try_into() { + Ok(SparkShuffleMode::ImmediateShuffle) => ShuffleMode::Immediate, + _ => ShuffleMode::Default, + }; let shuffle_writer = Arc::new(ShuffleWriterExec::try_new( Arc::clone(&child.native_plan), partitioning, @@ -1360,6 +1365,7 @@ impl PhysicalPlanner { writer.output_index_file.clone(), writer.tracing_enabled, write_buffer_size, + shuffle_mode, )?); Ok(( diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 
344b9f0f21..0ce5ae2b34 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -284,6 +284,12 @@ enum CompressionCodec { Snappy = 3; } +// Selects which shuffle implementation to use for multi-partition shuffles. +enum ShuffleMode { + DefaultShuffle = 0; + ImmediateShuffle = 1; +} + message ShuffleWriter { spark.spark_partitioning.Partitioning partitioning = 1; string output_data_file = 3; @@ -294,6 +300,8 @@ message ShuffleWriter { // Size of the write buffer in bytes used when writing shuffle data to disk. // Larger values may improve write performance but use more memory. int32 write_buffer_size = 8; + // Which shuffle implementation to use for multi-partition shuffles. + ShuffleMode shuffle_mode = 9; } message ParquetWriter { diff --git a/native/shuffle/benches/shuffle_writer.rs b/native/shuffle/benches/shuffle_writer.rs index 27abd919fa..0b05190aac 100644 --- a/native/shuffle/benches/shuffle_writer.rs +++ b/native/shuffle/benches/shuffle_writer.rs @@ -30,7 +30,7 @@ use datafusion::{ prelude::SessionContext, }; use datafusion_comet_shuffle::{ - CometPartitioning, CompressionCodec, ShuffleBlockWriter, ShuffleWriterExec, + CometPartitioning, CompressionCodec, ShuffleBlockWriter, ShuffleMode, ShuffleWriterExec, }; use itertools::Itertools; use std::io::Cursor; @@ -153,6 +153,7 @@ fn create_shuffle_writer_exec( "/tmp/index.out".to_string(), false, 1024 * 1024, + ShuffleMode::Default, ) .unwrap() } diff --git a/native/shuffle/src/lib.rs b/native/shuffle/src/lib.rs index f29588f2e1..4f85e2844b 100644 --- a/native/shuffle/src/lib.rs +++ b/native/shuffle/src/lib.rs @@ -27,3 +27,15 @@ pub use comet_partitioning::CometPartitioning; pub use ipc::read_ipc_compressed; pub use shuffle_writer::ShuffleWriterExec; pub use writers::{CompressionCodec, ShuffleBlockWriter}; + +/// Selects which shuffle implementation to use for multi-partition shuffles. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ShuffleMode { + /// Default: buffer input batches, track per-partition row indices, and write + /// all partitions at the end with memory-pressure-driven spilling. + Default, + /// Experimental: repartition each incoming batch immediately using `take` and + /// write per-partition data directly to individual spill files, avoiding + /// in-memory buffering of input batches. + Immediate, +} diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs new file mode 100644 index 0000000000..98c20c75f9 --- /dev/null +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -0,0 +1,278 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An immediate-mode shuffle partitioner that repartitions incoming batches and writes +//! directly to per-partition spill files. Unlike `MultiPartitionShuffleRepartitioner`, +//! this implementation does not buffer input batches in memory — it uses Arrow `take` +//! to extract per-partition slices and writes them immediately through per-partition +//! `BufBatchWriter`s. At `shuffle_write` time, the per-partition files are concatenated +//! 
into the final shuffle data file and index. + +use crate::metrics::ShufflePartitionerMetrics; +use crate::partitioners::scratch::ScratchSpace; +use crate::partitioners::ShufflePartitioner; +use crate::writers::BufBatchWriter; +use crate::{CometPartitioning, CompressionCodec, ShuffleBlockWriter}; +use arrow::array::{ArrayRef, RecordBatch, UInt32Array}; +use arrow::compute::take; +use arrow::datatypes::SchemaRef; +use datafusion::common::DataFusionError; +use datafusion::execution::disk_manager::RefCountedTempFile; +use datafusion::execution::runtime_env::RuntimeEnv; +use datafusion_comet_common::tracing::{with_trace, with_trace_async}; +use std::fmt; +use std::fmt::{Debug, Formatter}; +use std::fs::{File, OpenOptions}; +use std::io::{BufReader, BufWriter, Seek, Write}; +use std::sync::Arc; +use tokio::time::Instant; + +struct PartitionFileWriter { + temp_file: RefCountedTempFile, + writer: BufBatchWriter, +} + +/// An immediate-mode shuffle partitioner. Each incoming batch is repartitioned using +/// `arrow::compute::take` and the per-partition slices are written directly to +/// per-partition temporary files via `BufBatchWriter`. No input batches are retained +/// in memory beyond the current one. 
+pub(crate) struct ImmediateShufflePartitioner {
+    output_data_file: String,
+    output_index_file: String,
+    partition_writers: Vec<Option<PartitionFileWriter>>,
+    shuffle_block_writer: ShuffleBlockWriter,
+    partitioning: CometPartitioning,
+    runtime: Arc<RuntimeEnv>,
+    metrics: ShufflePartitionerMetrics,
+    scratch: ScratchSpace,
+    batch_size: usize,
+    tracing_enabled: bool,
+    write_buffer_size: usize,
+}
+
+impl ImmediateShufflePartitioner {
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn try_new(
+        output_data_file: String,
+        output_index_file: String,
+        schema: SchemaRef,
+        partitioning: CometPartitioning,
+        metrics: ShufflePartitionerMetrics,
+        runtime: Arc<RuntimeEnv>,
+        batch_size: usize,
+        codec: CompressionCodec,
+        tracing_enabled: bool,
+        write_buffer_size: usize,
+    ) -> datafusion::common::Result<Self> {
+        let num_output_partitions = partitioning.partition_count();
+        assert!(
+            num_output_partitions > 1,
+            "Use SinglePartitionShufflePartitioner for 1 output partition."
+        );
+
+        let scratch = ScratchSpace::new(&partitioning, batch_size, num_output_partitions);
+        let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec)?;
+        let partition_writers = (0..num_output_partitions).map(|_| None).collect();
+
+        Ok(Self {
+            output_data_file,
+            output_index_file,
+            partition_writers,
+            shuffle_block_writer,
+            partitioning,
+            runtime,
+            metrics,
+            scratch,
+            batch_size,
+            tracing_enabled,
+            write_buffer_size,
+        })
+    }
+
+    fn ensure_partition_writer(&mut self, partition_id: usize) -> datafusion::common::Result<()> {
+        if self.partition_writers[partition_id].is_none() {
+            let temp_file = self
+                .runtime
+                .disk_manager
+                .create_tmp_file(&format!("immediate_shuffle_partition_{partition_id}"))?;
+            let file = OpenOptions::new()
+                .write(true)
+                .create(true)
+                .truncate(true)
+                .open(temp_file.path())
+                .map_err(|e| {
+                    DataFusionError::Execution(format!("Error creating partition file: {e}"))
+                })?;
+            let writer = BufBatchWriter::new(
+                self.shuffle_block_writer.clone(),
+                file,
+                self.write_buffer_size,
+                self.batch_size,
+            );
+            self.partition_writers[partition_id] = Some(PartitionFileWriter { temp_file, writer });
+        }
+        Ok(())
+    }
+
+    fn partitioning_batch(&mut self, input: RecordBatch) -> datafusion::common::Result<()> {
+        if input.num_rows() == 0 {
+            return Ok(());
+        }
+
+        if input.num_rows() > self.batch_size {
+            return Err(DataFusionError::Internal(
+                "Input batch size exceeds configured batch size. Call `insert_batch` instead."
+                    .to_string(),
+            ));
+        }
+
+        self.metrics.data_size.add(input.get_array_memory_size());
+        self.metrics.baseline.record_output(input.num_rows());
+
+        let num_output_partitions = self.partitioning.partition_count();
+
+        let mut scratch = std::mem::take(&mut self.scratch);
+        {
+            let mut timer = self.metrics.repart_time.timer();
+            scratch.compute_partition_ids(&self.partitioning, &input)?;
+            timer.stop();
+        }
+
+        for partition_id in 0..num_output_partitions {
+            let start = scratch.partition_starts[partition_id] as usize;
+            let end = scratch.partition_starts[partition_id + 1] as usize;
+            if start == end {
+                continue;
+            }
+
+            let indices = UInt32Array::from_iter_values(
+                scratch.partition_row_indices[start..end].iter().copied(),
+            );
+            let columns: Vec<ArrayRef> = input
+                .columns()
+                .iter()
+                .map(|col| {
+                    take(col, &indices, None)
+                        .map_err(|e| DataFusionError::ArrowError(Box::from(e), None))
+                })
+                .collect::<datafusion::common::Result<Vec<_>>>()?;
+            let partition_batch = RecordBatch::try_new(input.schema(), columns)?;
+
+            self.ensure_partition_writer(partition_id)?;
+            let pw = self.partition_writers[partition_id].as_mut().unwrap();
+            pw.writer.write(
+                &partition_batch,
+                &self.metrics.encode_time,
+                &self.metrics.write_time,
+            )?;
+        }
+
+        self.scratch = scratch;
+        Ok(())
+    }
+}
+
+#[async_trait::async_trait]
+impl ShufflePartitioner for ImmediateShufflePartitioner {
+    async fn insert_batch(&mut self, batch: RecordBatch) -> datafusion::common::Result<()> {
+        with_trace_async(
+            "immediate_shuffle_insert_batch",
+            self.tracing_enabled,
+            || async {
+
let start_time = Instant::now(); + let mut start = 0; + while start < batch.num_rows() { + let end = (start + self.batch_size).min(batch.num_rows()); + let slice = batch.slice(start, end - start); + self.partitioning_batch(slice)?; + start = end; + } + self.metrics.input_batches.add(1); + self.metrics + .baseline + .elapsed_compute() + .add_duration(start_time.elapsed()); + Ok(()) + }, + ) + .await + } + + fn shuffle_write(&mut self) -> datafusion::common::Result<()> { + with_trace("immediate_shuffle_write", self.tracing_enabled, || { + let start_time = Instant::now(); + + let num_output_partitions = self.partition_writers.len(); + let mut offsets = vec![0u64; num_output_partitions + 1]; + + let output_data = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&self.output_data_file) + .map_err(|e| DataFusionError::Execution(format!("shuffle write error: {e:?}")))?; + + let mut output_data = BufWriter::new(output_data); + + for (partition_id, pw_slot) in self.partition_writers.iter_mut().enumerate() { + offsets[partition_id] = output_data.stream_position()?; + + if let Some(pw) = pw_slot { + pw.writer + .flush(&self.metrics.encode_time, &self.metrics.write_time)?; + + let mut spill_file = BufReader::new(File::open(pw.temp_file.path())?); + let mut write_timer = self.metrics.write_time.timer(); + std::io::copy(&mut spill_file, &mut output_data)?; + write_timer.stop(); + } + } + + let mut write_timer = self.metrics.write_time.timer(); + output_data.flush()?; + write_timer.stop(); + + offsets[num_output_partitions] = output_data.stream_position()?; + + let mut write_timer = self.metrics.write_time.timer(); + let mut output_index = + BufWriter::new(File::create(&self.output_index_file).map_err(|e| { + DataFusionError::Execution(format!("shuffle write error: {e:?}")) + })?); + for offset in offsets { + output_index.write_all(&(offset as i64).to_le_bytes()[..])?; + } + output_index.flush()?; + write_timer.stop(); + + self.metrics + .baseline + 
.elapsed_compute() + .add_duration(start_time.elapsed()); + + Ok(()) + }) + } +} + +impl Debug for ImmediateShufflePartitioner { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("ImmediateShufflePartitioner") + .field("num_partitions", &self.partition_writers.len()) + .finish() + } +} diff --git a/native/shuffle/src/partitioners/mod.rs b/native/shuffle/src/partitioners/mod.rs index 3eedef62c7..bb3d7b8c88 100644 --- a/native/shuffle/src/partitioners/mod.rs +++ b/native/shuffle/src/partitioners/mod.rs @@ -15,11 +15,14 @@ // specific language governing permissions and limitations // under the License. +mod immediate_partition; mod multi_partition; mod partitioned_batch_iterator; +pub(crate) mod scratch; mod single_partition; mod traits; +pub(crate) use immediate_partition::ImmediateShufflePartitioner; pub(crate) use multi_partition::MultiPartitionShuffleRepartitioner; pub(crate) use partitioned_batch_iterator::PartitionedBatchIterator; pub(crate) use single_partition::SinglePartitionShufflePartitioner; diff --git a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 655bee3511..8a5007142c 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -19,10 +19,11 @@ use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::partitioned_batch_iterator::{ PartitionedBatchIterator, PartitionedBatchesProducer, }; +use crate::partitioners::scratch::ScratchSpace; use crate::partitioners::ShufflePartitioner; use crate::writers::{BufBatchWriter, PartitionWriter}; -use crate::{comet_partitioning, CometPartitioning, CompressionCodec, ShuffleBlockWriter}; -use arrow::array::{ArrayRef, RecordBatch}; +use crate::{CometPartitioning, CompressionCodec, ShuffleBlockWriter}; +use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; use datafusion::common::utils::proxy::VecAllocExt; use datafusion::common::DataFusionError; @@ 
-30,7 +31,6 @@ use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::physical_plan::metrics::Time; use datafusion_comet_common::tracing::{with_trace, with_trace_async}; -use datafusion_comet_spark_expr::murmur3::create_murmur3_hashes; use itertools::Itertools; use std::fmt; use std::fmt::{Debug, Formatter}; @@ -39,71 +39,6 @@ use std::io::{BufReader, BufWriter, Seek, Write}; use std::sync::Arc; use tokio::time::Instant; -/// Reusable scratch buffers for computing row-to-partition assignments. -#[derive(Default)] -struct ScratchSpace { - /// Hashes for each row in the current batch. - hashes_buf: Vec, - /// Partition ids for each row in the current batch. - partition_ids: Vec, - /// The row indices of the rows in each partition. This array is conceptually divided into - /// partitions, where each partition contains the row indices of the rows in that partition. - /// The length of this array is the same as the number of rows in the batch. - partition_row_indices: Vec, - /// The start indices of partitions in partition_row_indices. partition_starts[K] and - /// partition_starts[K + 1] are the start and end indices of partition K in partition_row_indices. - /// The length of this array is 1 + the number of partitions. - partition_starts: Vec, -} - -impl ScratchSpace { - fn map_partition_ids_to_starts_and_indices( - &mut self, - num_output_partitions: usize, - num_rows: usize, - ) { - let partition_ids = &mut self.partition_ids[..num_rows]; - - // count each partition size, while leaving the last extra element as 0 - let partition_counters = &mut self.partition_starts; - partition_counters.resize(num_output_partitions + 1, 0); - partition_counters.fill(0); - partition_ids - .iter() - .for_each(|partition_id| partition_counters[*partition_id as usize] += 1); - - // accumulate partition counters into partition ends - // e.g. 
partition counter: [1, 3, 2, 1, 0] => [1, 4, 6, 7, 7] - let partition_ends = partition_counters; - let mut accum = 0; - partition_ends.iter_mut().for_each(|v| { - *v += accum; - accum = *v; - }); - - // calculate partition row indices and partition starts - // e.g. partition ids: [3, 1, 1, 1, 2, 2, 0] will produce the following partition_row_indices - // and partition_starts arrays: - // - // partition_row_indices: [6, 1, 2, 3, 4, 5, 0] - // partition_starts: [0, 1, 4, 6, 7] - // - // partition_starts conceptually splits partition_row_indices into smaller slices. - // Each slice partition_row_indices[partition_starts[K]..partition_starts[K + 1]] contains the - // row indices of the input batch that are partitioned into partition K. For example, - // first partition 0 has one row index [6], partition 1 has row indices [1, 2, 3], etc. - let partition_row_indices = &mut self.partition_row_indices; - partition_row_indices.resize(num_rows, 0); - for (index, partition_id) in partition_ids.iter().enumerate().rev() { - partition_ends[*partition_id as usize] -= 1; - let end = partition_ends[*partition_id as usize]; - partition_row_indices[end as usize] = index as u32; - } - - // after calculating, partition ends become partition starts - } -} - /// A partitioner that uses a hash function to partition data into multiple partitions pub(crate) struct MultiPartitionShuffleRepartitioner { output_data_file: String, @@ -148,22 +83,7 @@ impl MultiPartitionShuffleRepartitioner { "Use SinglePartitionShufflePartitioner for 1 output partition." ); - // Vectors in the scratch space will be filled with valid values before being used, this - // initialization code is simply initializing the vectors to the desired size. - // The initial values are not used. - let scratch = ScratchSpace { - hashes_buf: match partitioning { - // Allocate hashes_buf for hash and round robin partitioning. - // Round robin hashes all columns to achieve even, deterministic distribution. 
- CometPartitioning::Hash(_, _) | CometPartitioning::RoundRobin(_, _) => { - vec![0; batch_size] - } - _ => vec![], - }, - partition_ids: vec![0; batch_size], - partition_row_indices: vec![0; batch_size], - partition_starts: vec![0; num_output_partitions + 1], - }; + let scratch = ScratchSpace::new(&partitioning, batch_size, num_output_partitions); let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec.clone())?; @@ -217,178 +137,20 @@ impl MultiPartitionShuffleRepartitioner { // number of rows those are written to output data file. self.metrics.baseline.record_output(input.num_rows()); - match &self.partitioning { - CometPartitioning::Hash(exprs, num_output_partitions) => { - let mut scratch = std::mem::take(&mut self.scratch); - let (partition_starts, partition_row_indices): (&Vec, &Vec) = { - let mut timer = self.metrics.repart_time.timer(); - - // Evaluate partition expressions to get rows to apply partitioning scheme. - let arrays = exprs - .iter() - .map(|expr| expr.evaluate(&input)?.into_array(input.num_rows())) - .collect::>>()?; - - let num_rows = arrays[0].len(); - - // Use identical seed as Spark hash partitioning. - let hashes_buf = &mut scratch.hashes_buf[..num_rows]; - hashes_buf.fill(42_u32); - - // Generate partition ids for every row. - { - // Hash arrays and compute partition ids based on number of partitions. - let partition_ids = &mut scratch.partition_ids[..num_rows]; - create_murmur3_hashes(&arrays, hashes_buf)? - .iter() - .enumerate() - .for_each(|(idx, hash)| { - partition_ids[idx] = - comet_partitioning::pmod(*hash, *num_output_partitions) as u32; - }); - } - - // We now have partition ids for every input row, map that to partition starts - // and partition indices to eventually right these rows to partition buffers. 
- scratch - .map_partition_ids_to_starts_and_indices(*num_output_partitions, num_rows); - - timer.stop(); - Ok::<(&Vec, &Vec), DataFusionError>(( - &scratch.partition_starts, - &scratch.partition_row_indices, - )) - }?; - - self.buffer_partitioned_batch_may_spill( - input, - partition_row_indices, - partition_starts, - ) - .await?; - self.scratch = scratch; - } - CometPartitioning::RangePartitioning( - lex_ordering, - num_output_partitions, - row_converter, - bounds, - ) => { - let mut scratch = std::mem::take(&mut self.scratch); - let (partition_starts, partition_row_indices): (&Vec, &Vec) = { - let mut timer = self.metrics.repart_time.timer(); - - // Evaluate partition expressions for values to apply partitioning scheme on. - let arrays = lex_ordering - .iter() - .map(|expr| expr.expr.evaluate(&input)?.into_array(input.num_rows())) - .collect::>>()?; - - let num_rows = arrays[0].len(); - - // Generate partition ids for every row, first by converting the partition - // arrays to Rows, and then doing binary search for each Row against the - // bounds Rows. - { - let row_batch = row_converter.convert_columns(arrays.as_slice())?; - let partition_ids = &mut scratch.partition_ids[..num_rows]; - - row_batch.iter().enumerate().for_each(|(row_idx, row)| { - partition_ids[row_idx] = bounds - .as_slice() - .partition_point(|bound| bound.row() <= row) - as u32 - }); - } - - // We now have partition ids for every input row, map that to partition starts - // and partition indices to eventually right these rows to partition buffers. 
- scratch - .map_partition_ids_to_starts_and_indices(*num_output_partitions, num_rows); - - timer.stop(); - Ok::<(&Vec, &Vec), DataFusionError>(( - &scratch.partition_starts, - &scratch.partition_row_indices, - )) - }?; - - self.buffer_partitioned_batch_may_spill( - input, - partition_row_indices, - partition_starts, - ) - .await?; - self.scratch = scratch; - } - CometPartitioning::RoundRobin(num_output_partitions, max_hash_columns) => { - // Comet implements "round robin" as hash partitioning on columns. - // This achieves the same goal as Spark's round robin (even distribution - // without semantic grouping) while being deterministic for fault tolerance. - // - // Note: This produces different partition assignments than Spark's round robin, - // which sorts by UnsafeRow binary representation before assigning partitions. - // However, both approaches provide even distribution and determinism. - let mut scratch = std::mem::take(&mut self.scratch); - let (partition_starts, partition_row_indices): (&Vec, &Vec) = { - let mut timer = self.metrics.repart_time.timer(); - - let num_rows = input.num_rows(); - - // Collect columns for hashing, respecting max_hash_columns limit - // max_hash_columns of 0 means no limit (hash all columns) - // Negative values are normalized to 0 in the planner - let num_columns_to_hash = if *max_hash_columns == 0 { - input.num_columns() - } else { - (*max_hash_columns).min(input.num_columns()) - }; - let columns_to_hash: Vec = (0..num_columns_to_hash) - .map(|i| Arc::clone(input.column(i))) - .collect(); - - // Use identical seed as Spark hash partitioning. 
- let hashes_buf = &mut scratch.hashes_buf[..num_rows]; - hashes_buf.fill(42_u32); - - // Compute hash for selected columns - create_murmur3_hashes(&columns_to_hash, hashes_buf)?; - - // Assign partition IDs based on hash (same as hash partitioning) - let partition_ids = &mut scratch.partition_ids[..num_rows]; - hashes_buf.iter().enumerate().for_each(|(idx, hash)| { - partition_ids[idx] = - comet_partitioning::pmod(*hash, *num_output_partitions) as u32; - }); - - // We now have partition ids for every input row, map that to partition starts - // and partition indices to eventually write these rows to partition buffers. - scratch - .map_partition_ids_to_starts_and_indices(*num_output_partitions, num_rows); - - timer.stop(); - Ok::<(&Vec, &Vec), DataFusionError>(( - &scratch.partition_starts, - &scratch.partition_row_indices, - )) - }?; - - self.buffer_partitioned_batch_may_spill( - input, - partition_row_indices, - partition_starts, - ) - .await?; - self.scratch = scratch; - } - other => { - // this should be unreachable as long as the validation logic - // in the constructor is kept up-to-date - return Err(DataFusionError::NotImplemented(format!( - "Unsupported shuffle partitioning scheme {other:?}" - ))); - } + let mut scratch = std::mem::take(&mut self.scratch); + { + let mut timer = self.metrics.repart_time.timer(); + scratch.compute_partition_ids(&self.partitioning, &input)?; + timer.stop(); } + + self.buffer_partitioned_batch_may_spill( + input, + &scratch.partition_row_indices, + &scratch.partition_starts, + ) + .await?; + self.scratch = scratch; Ok(()) } diff --git a/native/shuffle/src/partitioners/scratch.rs b/native/shuffle/src/partitioners/scratch.rs new file mode 100644 index 0000000000..af0ede558f --- /dev/null +++ b/native/shuffle/src/partitioners/scratch.rs @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Shared scratch buffers and partition ID computation for shuffle partitioners.
+
+use crate::{comet_partitioning, CometPartitioning};
+use arrow::array::{ArrayRef, RecordBatch};
+use datafusion::common::DataFusionError;
+use datafusion_comet_spark_expr::murmur3::create_murmur3_hashes;
+use std::sync::Arc;
+
+/// Reusable scratch buffers for computing row-to-partition assignments.
+#[derive(Default)]
+pub(crate) struct ScratchSpace {
+    /// Hashes for each row in the current batch.
+    pub(crate) hashes_buf: Vec<u32>,
+    /// Partition ids for each row in the current batch.
+    pub(crate) partition_ids: Vec<u32>,
+    /// The row indices of the rows in each partition. This array is conceptually divided into
+    /// partitions, where each partition contains the row indices of the rows in that partition.
+    /// The length of this array is the same as the number of rows in the batch.
+    pub(crate) partition_row_indices: Vec<u32>,
+    /// The start indices of partitions in partition_row_indices. partition_starts[K] and
+    /// partition_starts[K + 1] are the start and end indices of partition K in partition_row_indices.
+    /// The length of this array is 1 + the number of partitions.
+    pub(crate) partition_starts: Vec<u32>,
+}
+
+impl ScratchSpace {
+    /// Create a new ScratchSpace with pre-allocated buffers for the given partitioning scheme.
+    pub(crate) fn new(
+        partitioning: &CometPartitioning,
+        batch_size: usize,
+        num_output_partitions: usize,
+    ) -> Self {
+        Self {
+            hashes_buf: match partitioning {
+                CometPartitioning::Hash(_, _) | CometPartitioning::RoundRobin(_, _) => {
+                    vec![0; batch_size]
+                }
+                _ => vec![],
+            },
+            partition_ids: vec![0; batch_size],
+            partition_row_indices: vec![0; batch_size],
+            partition_starts: vec![0; num_output_partitions + 1],
+        }
+    }
+
+    pub(crate) fn map_partition_ids_to_starts_and_indices(
+        &mut self,
+        num_output_partitions: usize,
+        num_rows: usize,
+    ) {
+        let partition_ids = &mut self.partition_ids[..num_rows];
+
+        // count each partition size, while leaving the last extra element as 0
+        let partition_counters = &mut self.partition_starts;
+        partition_counters.resize(num_output_partitions + 1, 0);
+        partition_counters.fill(0);
+        partition_ids
+            .iter()
+            .for_each(|partition_id| partition_counters[*partition_id as usize] += 1);
+
+        // accumulate partition counters into partition ends
+        // e.g. partition counter: [1, 3, 2, 1, 0] => [1, 4, 6, 7, 7]
+        let partition_ends = partition_counters;
+        let mut accum = 0;
+        partition_ends.iter_mut().for_each(|v| {
+            *v += accum;
+            accum = *v;
+        });
+
+        // calculate partition row indices and partition starts
+        // e.g. partition ids: [3, 1, 1, 1, 2, 2, 0] will produce the following partition_row_indices
+        // and partition_starts arrays:
+        //
+        // partition_row_indices: [6, 1, 2, 3, 4, 5, 0]
+        // partition_starts: [0, 1, 4, 6, 7]
+        //
+        // partition_starts conceptually splits partition_row_indices into smaller slices.
+        // Each slice partition_row_indices[partition_starts[K]..partition_starts[K + 1]] contains the
+        // row indices of the input batch that are partitioned into partition K.
For example, + // first partition 0 has one row index [6], partition 1 has row indices [1, 2, 3], etc. + let partition_row_indices = &mut self.partition_row_indices; + partition_row_indices.resize(num_rows, 0); + for (index, partition_id) in partition_ids.iter().enumerate().rev() { + partition_ends[*partition_id as usize] -= 1; + let end = partition_ends[*partition_id as usize]; + partition_row_indices[end as usize] = index as u32; + } + + // after calculating, partition ends become partition starts + } + + /// Compute partition IDs for the given batch and populate `partition_starts` and + /// `partition_row_indices`. Returns the number of rows processed. + pub(crate) fn compute_partition_ids( + &mut self, + partitioning: &CometPartitioning, + input: &RecordBatch, + ) -> datafusion::common::Result<()> { + match partitioning { + CometPartitioning::Hash(exprs, num_partitions) => { + let arrays = exprs + .iter() + .map(|expr| expr.evaluate(input)?.into_array(input.num_rows())) + .collect::>>()?; + let num_rows = arrays[0].len(); + let hashes_buf = &mut self.hashes_buf[..num_rows]; + hashes_buf.fill(42_u32); + let partition_ids = &mut self.partition_ids[..num_rows]; + create_murmur3_hashes(&arrays, hashes_buf)? 
+ .iter() + .enumerate() + .for_each(|(idx, hash)| { + partition_ids[idx] = + comet_partitioning::pmod(*hash, *num_partitions) as u32; + }); + self.map_partition_ids_to_starts_and_indices(*num_partitions, num_rows); + } + CometPartitioning::RangePartitioning( + lex_ordering, + num_partitions, + row_converter, + bounds, + ) => { + let arrays = lex_ordering + .iter() + .map(|expr| expr.expr.evaluate(input)?.into_array(input.num_rows())) + .collect::>>()?; + let num_rows = arrays[0].len(); + let row_batch = row_converter.convert_columns(arrays.as_slice())?; + let partition_ids = &mut self.partition_ids[..num_rows]; + row_batch.iter().enumerate().for_each(|(row_idx, row)| { + partition_ids[row_idx] = bounds + .as_slice() + .partition_point(|bound| bound.row() <= row) + as u32 + }); + self.map_partition_ids_to_starts_and_indices(*num_partitions, num_rows); + } + CometPartitioning::RoundRobin(num_partitions, max_hash_columns) => { + let num_rows = input.num_rows(); + let num_columns_to_hash = if *max_hash_columns == 0 { + input.num_columns() + } else { + (*max_hash_columns).min(input.num_columns()) + }; + let columns_to_hash: Vec = (0..num_columns_to_hash) + .map(|i| Arc::clone(input.column(i))) + .collect(); + let hashes_buf = &mut self.hashes_buf[..num_rows]; + hashes_buf.fill(42_u32); + create_murmur3_hashes(&columns_to_hash, hashes_buf)?; + let partition_ids = &mut self.partition_ids[..num_rows]; + hashes_buf.iter().enumerate().for_each(|(idx, hash)| { + partition_ids[idx] = comet_partitioning::pmod(*hash, *num_partitions) as u32; + }); + self.map_partition_ids_to_starts_and_indices(*num_partitions, num_rows); + } + other => { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported shuffle partitioning scheme {other:?}" + ))); + } + } + Ok(()) + } +} diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index e649aaac69..9067777e3c 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ 
b/native/shuffle/src/shuffle_writer.rs @@ -19,9 +19,10 @@ use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::{ - MultiPartitionShuffleRepartitioner, ShufflePartitioner, SinglePartitionShufflePartitioner, + ImmediateShufflePartitioner, MultiPartitionShuffleRepartitioner, ShufflePartitioner, + SinglePartitionShufflePartitioner, }; -use crate::{CometPartitioning, CompressionCodec}; +use crate::{CometPartitioning, CompressionCodec, ShuffleMode}; use async_trait::async_trait; use datafusion::common::exec_datafusion_err; use datafusion::physical_expr::{EquivalenceProperties, Partitioning}; @@ -68,6 +69,8 @@ pub struct ShuffleWriterExec { tracing_enabled: bool, /// Size of the write buffer in bytes write_buffer_size: usize, + /// Which shuffle implementation to use for multi-partition shuffles + shuffle_mode: ShuffleMode, } impl ShuffleWriterExec { @@ -81,6 +84,7 @@ impl ShuffleWriterExec { output_index_file: String, tracing_enabled: bool, write_buffer_size: usize, + shuffle_mode: ShuffleMode, ) -> Result { let cache = PlanProperties::new( EquivalenceProperties::new(Arc::clone(&input.schema())), @@ -99,6 +103,7 @@ impl ShuffleWriterExec { codec, tracing_enabled, write_buffer_size, + shuffle_mode, }) } } @@ -163,6 +168,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.output_index_file.clone(), self.tracing_enabled, self.write_buffer_size, + self.shuffle_mode, )?)), _ => panic!("ShuffleWriterExec wrong number of children"), } @@ -190,6 +196,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.codec.clone(), self.tracing_enabled, self.write_buffer_size, + self.shuffle_mode, ) .map_err(|e| ArrowError::ExternalError(Box::new(e))), ) @@ -210,6 +217,7 @@ async fn external_shuffle( codec: CompressionCodec, tracing_enabled: bool, write_buffer_size: usize, + shuffle_mode: ShuffleMode, ) -> Result { with_trace_async("external_shuffle", tracing_enabled, || async { let schema = input.schema(); @@ -226,6 +234,20 @@ async fn external_shuffle( write_buffer_size, 
)?) } + _ if shuffle_mode == ShuffleMode::Immediate => { + Box::new(ImmediateShufflePartitioner::try_new( + output_data_file, + output_index_file, + Arc::clone(&schema), + partitioning, + metrics, + context.runtime_env(), + context.session_config().batch_size(), + codec, + tracing_enabled, + write_buffer_size, + )?) + } _ => Box::new(MultiPartitionShuffleRepartitioner::try_new( partition, output_data_file, @@ -466,6 +488,7 @@ mod test { "/tmp/index.out".to_string(), false, 1024 * 1024, // write_buffer_size: 1MB default + ShuffleMode::Default, ) .unwrap(); @@ -525,6 +548,7 @@ mod test { index_file.clone(), false, 1024 * 1024, + ShuffleMode::Default, ) .unwrap(); diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala index 3fc222bd19..d53d5c965c 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala @@ -193,6 +193,12 @@ class CometNativeShuffleWriter[K, V]( shuffleWriterBuilder.setWriteBufferSize( CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().max(Int.MaxValue).toInt) + val shuffleMode = CometConf.COMET_NATIVE_SHUFFLE_MODE.get() match { + case "immediate" => OperatorOuterClass.ShuffleMode.ImmediateShuffle + case _ => OperatorOuterClass.ShuffleMode.DefaultShuffle + } + shuffleWriterBuilder.setShuffleMode(shuffleMode) + outputPartitioning match { case p if isSinglePartitioning(p) => val partitioning = PartitioningOuterClass.SinglePartition.newBuilder() From 4e6c02638e74f2e3a4219a940740d6ae185df54f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 13:16:42 -0600 Subject: [PATCH 10/18] perf: optimize immediate shuffle with single-take-then-slice Replace per-partition take calls (P*C = 3200 for 200 partitions, 16 columns) with a single take per 
column to reorder the entire batch, then zero-copy RecordBatch::slice per partition. Also remove BufReader wrapper in shuffle_write copy loop to enable kernel zero-copy on Linux. Add --shuffle-mode flag to shuffle_bench binary. Benchmark (10M rows, 200 partitions, hash on cols 0,3, zstd): default: 4.804s avg (2.08M rows/s) immediate: 4.185s avg (2.39M rows/s) -- 13% faster --- native/shuffle/src/bin/shuffle_bench.rs | 13 ++++++- .../src/partitioners/immediate_partition.rs | 37 +++++++++++-------- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index ec1227a2ee..89be7c3bb8 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -46,7 +46,7 @@ use datafusion::physical_plan::common::collect; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_comet_shuffle::{ - read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleWriterExec, + read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleMode, ShuffleWriterExec, }; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use std::fs; @@ -115,6 +115,10 @@ struct Args { /// Limit rows processed per iteration (0 = no limit) #[arg(long, default_value_t = 0)] limit: usize, + + /// Shuffle mode: default or immediate + #[arg(long, default_value = "default")] + shuffle_mode: String, } fn main() { @@ -143,6 +147,7 @@ fn main() { println!("Partitions: {}", args.partitions); println!("Codec: {:?}", codec); println!("Hash columns: {:?}", hash_col_indices); + println!("Shuffle mode: {}", args.shuffle_mode); if let Some(mem_limit) = args.memory_limit { println!("Memory limit: {}", format_bytes(mem_limit)); } @@ -351,6 +356,11 @@ fn run_shuffle_write( parquet_plan }; + let shuffle_mode = match args.shuffle_mode.as_str() { + "immediate" => ShuffleMode::Immediate, + _ => ShuffleMode::Default, + }; 
+ let exec = ShuffleWriterExec::try_new( input, partitioning, @@ -359,6 +369,7 @@ fn run_shuffle_write( index_file.to_string(), false, args.write_buffer_size, + shuffle_mode, ) .expect("Failed to create ShuffleWriterExec"); diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index 98c20c75f9..65fd746f0b 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -37,7 +37,7 @@ use datafusion_comet_common::tracing::{with_trace, with_trace_async}; use std::fmt; use std::fmt::{Debug, Formatter}; use std::fs::{File, OpenOptions}; -use std::io::{BufReader, BufWriter, Seek, Write}; +use std::io::{BufWriter, Seek, Write}; use std::sync::Arc; use tokio::time::Instant; @@ -152,6 +152,22 @@ impl ImmediateShufflePartitioner { timer.stop(); } + // Single take per column to reorder entire batch by partition assignment, + // then zero-copy slice per partition. This replaces P*C take calls with just C. 
+ let num_rows = input.num_rows(); + let all_indices = UInt32Array::from_iter_values( + scratch.partition_row_indices[..num_rows].iter().copied(), + ); + let sorted_columns: Vec = input + .columns() + .iter() + .map(|col| { + take(col, &all_indices, None) + .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) + }) + .collect::>>()?; + let sorted_batch = RecordBatch::try_new(input.schema(), sorted_columns)?; + for partition_id in 0..num_output_partitions { let start = scratch.partition_starts[partition_id] as usize; let end = scratch.partition_starts[partition_id + 1] as usize; @@ -159,18 +175,7 @@ impl ImmediateShufflePartitioner { continue; } - let indices = UInt32Array::from_iter_values( - scratch.partition_row_indices[start..end].iter().copied(), - ); - let columns: Vec = input - .columns() - .iter() - .map(|col| { - take(col, &indices, None) - .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) - }) - .collect::>>()?; - let partition_batch = RecordBatch::try_new(input.schema(), columns)?; + let partition_batch = sorted_batch.slice(start, end - start); self.ensure_partition_writer(partition_id)?; let pw = self.partition_writers[partition_id].as_mut().unwrap(); @@ -231,11 +236,13 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { for (partition_id, pw_slot) in self.partition_writers.iter_mut().enumerate() { offsets[partition_id] = output_data.stream_position()?; - if let Some(pw) = pw_slot { + if let Some(mut pw) = pw_slot.take() { pw.writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; - let mut spill_file = BufReader::new(File::open(pw.temp_file.path())?); + // Reopen for read without BufReader to allow kernel zero-copy + // (copy_file_range/sendfile) on supported platforms + let mut spill_file = File::open(pw.temp_file.path())?; let mut write_timer = self.metrics.write_time.timer(); std::io::copy(&mut spill_file, &mut output_data)?; write_timer.stop(); From 97b0fe065a3a23e68b6d24e8bb8b5cc7599a4fee Mon Sep 17 
00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 13:17:52 -0600 Subject: [PATCH 11/18] feat: make immediate shuffle mode the default --- common/src/main/scala/org/apache/comet/CometConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index 617f9131eb..3b50ccad1a 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -546,7 +546,7 @@ object CometConf extends ShimCometConf { "The 'immediate' mode is experimental.") .stringConf .checkValues(Set("default", "immediate")) - .createWithDefault("default") + .createWithDefault("immediate") val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( "spark.comet.shuffle.preferDictionary.ratio") From 7ccda152eeefe936d6347bd0500676ff6aa58623 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 13:21:10 -0600 Subject: [PATCH 12/18] refactor: rename Default shuffle mode to Buffered --- common/src/main/scala/org/apache/comet/CometConf.scala | 7 +++---- native/core/src/execution/planner.rs | 4 ++-- native/proto/src/proto/operator.proto | 2 +- native/shuffle/benches/shuffle_writer.rs | 2 +- native/shuffle/src/bin/shuffle_bench.rs | 8 ++++---- native/shuffle/src/lib.rs | 4 ++-- native/shuffle/src/shuffle_writer.rs | 4 ++-- .../execution/shuffle/CometNativeShuffleWriter.scala | 4 ++-- 8 files changed, 17 insertions(+), 18 deletions(-) diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index 3b50ccad1a..908c5638c1 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -539,13 +539,12 @@ object CometConf extends ShimCometConf { .category(CATEGORY_SHUFFLE) .doc( "Selects which native shuffle implementation to use for multi-partition shuffles. 
" + - "'default' buffers input batches and tracks per-partition row indices, writing all " + + "'buffered' buffers input batches and tracks per-partition row indices, writing all " + "partitions at the end with memory-pressure-driven spilling. " + "'immediate' repartitions each incoming batch using take and writes per-partition " + - "data directly to individual files, avoiding in-memory buffering of input batches. " + - "The 'immediate' mode is experimental.") + "data directly to individual files, avoiding in-memory buffering of input batches.") .stringConf - .checkValues(Set("default", "immediate")) + .checkValues(Set("buffered", "immediate")) .createWithDefault("immediate") val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 7680fd37a1..7b9bf5a3e5 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1354,8 +1354,8 @@ impl PhysicalPlanner { let write_buffer_size = writer.write_buffer_size as usize; let shuffle_mode = match writer.shuffle_mode.try_into() { - Ok(SparkShuffleMode::ImmediateShuffle) => ShuffleMode::Immediate, - _ => ShuffleMode::Default, + Ok(SparkShuffleMode::BufferedShuffle) => ShuffleMode::Buffered, + _ => ShuffleMode::Immediate, }; let shuffle_writer = Arc::new(ShuffleWriterExec::try_new( Arc::clone(&child.native_plan), diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 0ce5ae2b34..7f6d947315 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -286,7 +286,7 @@ enum CompressionCodec { // Selects which shuffle implementation to use for multi-partition shuffles. 
enum ShuffleMode { - DefaultShuffle = 0; + BufferedShuffle = 0; ImmediateShuffle = 1; } diff --git a/native/shuffle/benches/shuffle_writer.rs b/native/shuffle/benches/shuffle_writer.rs index 0b05190aac..fcb2e592f1 100644 --- a/native/shuffle/benches/shuffle_writer.rs +++ b/native/shuffle/benches/shuffle_writer.rs @@ -153,7 +153,7 @@ fn create_shuffle_writer_exec( "/tmp/index.out".to_string(), false, 1024 * 1024, - ShuffleMode::Default, + ShuffleMode::Buffered, ) .unwrap() } diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 89be7c3bb8..4dba0d8c41 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -116,8 +116,8 @@ struct Args { #[arg(long, default_value_t = 0)] limit: usize, - /// Shuffle mode: default or immediate - #[arg(long, default_value = "default")] + /// Shuffle mode: buffered or immediate + #[arg(long, default_value = "immediate")] shuffle_mode: String, } @@ -357,8 +357,8 @@ fn run_shuffle_write( }; let shuffle_mode = match args.shuffle_mode.as_str() { - "immediate" => ShuffleMode::Immediate, - _ => ShuffleMode::Default, + "buffered" => ShuffleMode::Buffered, + _ => ShuffleMode::Immediate, }; let exec = ShuffleWriterExec::try_new( diff --git a/native/shuffle/src/lib.rs b/native/shuffle/src/lib.rs index 4f85e2844b..fb961752f6 100644 --- a/native/shuffle/src/lib.rs +++ b/native/shuffle/src/lib.rs @@ -31,9 +31,9 @@ pub use writers::{CompressionCodec, ShuffleBlockWriter}; /// Selects which shuffle implementation to use for multi-partition shuffles. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ShuffleMode { - /// Default: buffer input batches, track per-partition row indices, and write + /// Buffered: buffer input batches, track per-partition row indices, and write /// all partitions at the end with memory-pressure-driven spilling. 
- Default, + Buffered, /// Experimental: repartition each incoming batch immediately using `take` and /// write per-partition data directly to individual spill files, avoiding /// in-memory buffering of input batches. diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index 9067777e3c..bf7dc33e20 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -488,7 +488,7 @@ mod test { "/tmp/index.out".to_string(), false, 1024 * 1024, // write_buffer_size: 1MB default - ShuffleMode::Default, + ShuffleMode::Buffered, ) .unwrap(); @@ -548,7 +548,7 @@ mod test { index_file.clone(), false, 1024 * 1024, - ShuffleMode::Default, + ShuffleMode::Buffered, ) .unwrap(); diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala index d53d5c965c..13c34bd10c 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala @@ -194,8 +194,8 @@ class CometNativeShuffleWriter[K, V]( CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().max(Int.MaxValue).toInt) val shuffleMode = CometConf.COMET_NATIVE_SHUFFLE_MODE.get() match { - case "immediate" => OperatorOuterClass.ShuffleMode.ImmediateShuffle - case _ => OperatorOuterClass.ShuffleMode.DefaultShuffle + case "buffered" => OperatorOuterClass.ShuffleMode.BufferedShuffle + case _ => OperatorOuterClass.ShuffleMode.ImmediateShuffle } shuffleWriterBuilder.setShuffleMode(shuffleMode) From fb426cfe08e93aa119a34067c5b5a4bbf0d7e3dc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 13:56:48 -0600 Subject: [PATCH 13/18] perf: replace per-partition temp files with in-memory buffers Replace per-partition temp files in ImmediateShufflePartitioner with Vec buffers holding 
compressed IPC blocks. Eliminates 200 temp file creates/writes/reads per task and the double-write overhead that caused 2x slowdown on clusters with concurrent tasks. Add --concurrent-tasks flag to shuffle_bench for realistic executor simulation. Benchmark (10M rows, 200 partitions, hash on cols 0,3, zstd): Single task: buffered 4.78s vs immediate 3.97s (-17%) 8 concurrent tasks: buffered 6.75s vs immediate 5.94s (-12%) --- native/shuffle/src/bin/shuffle_bench.rs | 176 ++++++++++++++++-- .../src/partitioners/immediate_partition.rs | 83 ++++----- native/shuffle/src/shuffle_writer.rs | 1 - .../shuffle/src/writers/buf_batch_writer.rs | 7 + 4 files changed, 197 insertions(+), 70 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 4dba0d8c41..67dbb9fcc9 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -119,6 +119,11 @@ struct Args { /// Shuffle mode: buffered or immediate #[arg(long, default_value = "immediate")] shuffle_mode: String, + + /// Number of concurrent shuffle tasks to simulate executor parallelism. + /// Each task reads the same input and writes to its own output files. 
+ #[arg(long, default_value_t = 1)] + concurrent_tasks: usize, } fn main() { @@ -151,6 +156,9 @@ fn main() { if let Some(mem_limit) = args.memory_limit { println!("Memory limit: {}", format_bytes(mem_limit)); } + if args.concurrent_tasks > 1 { + println!("Concurrent: {} tasks", args.concurrent_tasks); + } println!( "Iterations: {} (warmup: {})", args.iterations, args.warmup @@ -170,15 +178,25 @@ fn main() { format!("iter {}/{}", i - args.warmup + 1, args.iterations) }; - let write_elapsed = run_shuffle_write( - &args.input, - &schema, - &codec, - &hash_col_indices, - &args, - data_file.to_str().unwrap(), - index_file.to_str().unwrap(), - ); + let write_elapsed = if args.concurrent_tasks > 1 { + run_concurrent_shuffle_writes( + &args.input, + &schema, + &codec, + &hash_col_indices, + &args, + ) + } else { + run_shuffle_write( + &args.input, + &schema, + &codec, + &hash_col_indices, + &args, + data_file.to_str().unwrap(), + index_file.to_str().unwrap(), + ) + }; let data_size = fs::metadata(&data_file).map(|m| m.len()).unwrap_or(0); if !is_warmup { @@ -187,9 +205,11 @@ fn main() { } print!(" [{label}] write: {:.3}s", write_elapsed); - print!(" output: {}", format_bytes(data_size as usize)); + if args.concurrent_tasks <= 1 { + print!(" output: {}", format_bytes(data_size as usize)); + } - if args.read_back { + if args.read_back && args.concurrent_tasks <= 1 { let read_elapsed = run_shuffle_read( data_file.to_str().unwrap(), index_file.to_str().unwrap(), @@ -208,8 +228,8 @@ fn main() { println!("=== Results ==="); let avg_write = write_times.iter().sum::() / write_times.len() as f64; - let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; - let write_throughput_rows = total_rows as f64 / avg_write; + let write_throughput_rows = + (total_rows as f64 * args.concurrent_tasks as f64) / avg_write; println!("Write:"); println!(" avg time: {:.3}s", avg_write); @@ -222,15 +242,22 @@ fn main() { println!(" min/max: {:.3}s / {:.3}s", min, max); } 
println!( - " throughput: {} rows/s", - format_number(write_throughput_rows as usize) - ); - println!( - " output size: {}", - format_bytes(avg_data_size as usize) + " throughput: {} rows/s (total across {} tasks)", + format_number(write_throughput_rows as usize), + args.concurrent_tasks ); + if args.concurrent_tasks <= 1 { + let avg_data_size = + data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; + println!( + " output size: {}", + format_bytes(avg_data_size as usize) + ); + } if !read_times.is_empty() { + let avg_data_size = + data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; let avg_read = read_times.iter().sum::() / read_times.len() as f64; let read_throughput_bytes = avg_data_size as f64 / avg_read; @@ -381,6 +408,117 @@ fn run_shuffle_write( }) } +/// Run N concurrent shuffle writes to simulate executor parallelism. +/// Returns wall-clock time for all tasks to complete. +fn run_concurrent_shuffle_writes( + input_path: &PathBuf, + schema: &SchemaRef, + codec: &CompressionCodec, + hash_col_indices: &[usize], + args: &Args, +) -> f64 { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let start = Instant::now(); + + let mut handles = Vec::with_capacity(args.concurrent_tasks); + for task_id in 0..args.concurrent_tasks { + let task_dir = args.output_dir.join(format!("task_{task_id}")); + fs::create_dir_all(&task_dir).expect("Failed to create task output directory"); + let data_file = task_dir.join("data.out").to_str().unwrap().to_string(); + let index_file = task_dir.join("index.out").to_str().unwrap().to_string(); + + let input_path = input_path.clone(); + let schema = Arc::clone(schema); + let codec = codec.clone(); + let hash_col_indices = hash_col_indices.to_vec(); + let partitioning_scheme = args.partitioning.clone(); + let num_partitions = args.partitions; + let batch_size = args.batch_size; + let memory_limit = args.memory_limit; + let write_buffer_size = args.write_buffer_size; + let shuffle_mode_str = 
args.shuffle_mode.clone(); + let limit = args.limit; + + let handle = tokio::spawn(async move { + let config = SessionConfig::new().with_batch_size(batch_size); + let mut runtime_builder = RuntimeEnvBuilder::new(); + if let Some(mem_limit) = memory_limit { + runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); + } + let runtime_env = Arc::new(runtime_builder.build().unwrap()); + let ctx = SessionContext::new_with_config_rt(config, runtime_env); + + let path_str = input_path.to_str().unwrap().to_string(); + let mut df = ctx + .read_parquet(&path_str, ParquetReadOptions::default()) + .await + .expect("Failed to create Parquet scan"); + if limit > 0 { + df = df.limit(0, Some(limit)).unwrap(); + } + + let parquet_plan = df + .create_physical_plan() + .await + .expect("Failed to create physical plan"); + + let input: Arc = if parquet_plan + .properties() + .output_partitioning() + .partition_count() + > 1 + { + Arc::new(CoalescePartitionsExec::new(parquet_plan)) + } else { + parquet_plan + }; + + let partitioning = build_partitioning( + &partitioning_scheme, + num_partitions, + &hash_col_indices, + &schema, + ); + + let shuffle_mode = match shuffle_mode_str.as_str() { + "buffered" => ShuffleMode::Buffered, + _ => ShuffleMode::Immediate, + }; + + let exec = ShuffleWriterExec::try_new( + input, + partitioning, + codec, + data_file, + index_file, + false, + write_buffer_size, + shuffle_mode, + ) + .expect("Failed to create ShuffleWriterExec"); + + let task_ctx = ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + collect(stream).await.unwrap(); + }); + handles.push(handle); + } + + for handle in handles { + handle.await.expect("Task panicked"); + } + + // Clean up task output directories + for task_id in 0..args.concurrent_tasks { + let task_dir = args.output_dir.join(format!("task_{task_id}")); + let _ = fs::remove_dir_all(&task_dir); + } + + start.elapsed().as_secs_f64() + }) +} + fn run_shuffle_read(data_file: &str, index_file: &str, 
num_partitions: usize) -> f64 { let start = Instant::now(); diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index 65fd746f0b..f32ce5d6a8 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -16,11 +16,15 @@ // under the License. //! An immediate-mode shuffle partitioner that repartitions incoming batches and writes -//! directly to per-partition spill files. Unlike `MultiPartitionShuffleRepartitioner`, -//! this implementation does not buffer input batches in memory — it uses Arrow `take` -//! to extract per-partition slices and writes them immediately through per-partition -//! `BufBatchWriter`s. At `shuffle_write` time, the per-partition files are concatenated -//! into the final shuffle data file and index. +//! per-partition data to in-memory buffers containing compressed IPC blocks. Unlike +//! `MultiPartitionShuffleRepartitioner`, this implementation does not buffer raw Arrow +//! `RecordBatch` objects — it uses Arrow `take` to extract per-partition slices and +//! serializes them immediately through per-partition `BufBatchWriter`s into `Vec` +//! buffers. At `shuffle_write` time, the buffers are concatenated into the final +//! shuffle data file and index. +//! +//! Because the buffers hold compressed IPC (typically 5-20% of raw data size), the +//! memory footprint is much lower than holding raw Arrow batches. 
use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::scratch::ScratchSpace; @@ -31,32 +35,28 @@ use arrow::array::{ArrayRef, RecordBatch, UInt32Array}; use arrow::compute::take; use arrow::datatypes::SchemaRef; use datafusion::common::DataFusionError; -use datafusion::execution::disk_manager::RefCountedTempFile; -use datafusion::execution::runtime_env::RuntimeEnv; use datafusion_comet_common::tracing::{with_trace, with_trace_async}; use std::fmt; use std::fmt::{Debug, Formatter}; use std::fs::{File, OpenOptions}; -use std::io::{BufWriter, Seek, Write}; -use std::sync::Arc; +use std::io::{BufWriter, Cursor, Seek, Write}; use tokio::time::Instant; -struct PartitionFileWriter { - temp_file: RefCountedTempFile, - writer: BufBatchWriter, +/// Per-partition in-memory buffer holding compressed IPC blocks. +struct PartitionBuffer { + writer: BufBatchWriter>>, } /// An immediate-mode shuffle partitioner. Each incoming batch is repartitioned using -/// `arrow::compute::take` and the per-partition slices are written directly to -/// per-partition temporary files via `BufBatchWriter`. No input batches are retained -/// in memory beyond the current one. +/// `arrow::compute::take` and the per-partition slices are serialized immediately into +/// per-partition `Vec` buffers (compressed IPC). No input `RecordBatch` objects are +/// retained in memory beyond the current one. 
pub(crate) struct ImmediateShufflePartitioner { output_data_file: String, output_index_file: String, - partition_writers: Vec>, + partition_buffers: Vec>, shuffle_block_writer: ShuffleBlockWriter, partitioning: CometPartitioning, - runtime: Arc, metrics: ShufflePartitionerMetrics, scratch: ScratchSpace, batch_size: usize, @@ -72,7 +72,6 @@ impl ImmediateShufflePartitioner { schema: SchemaRef, partitioning: CometPartitioning, metrics: ShufflePartitionerMetrics, - runtime: Arc, batch_size: usize, codec: CompressionCodec, tracing_enabled: bool, @@ -86,15 +85,14 @@ impl ImmediateShufflePartitioner { let scratch = ScratchSpace::new(&partitioning, batch_size, num_output_partitions); let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec)?; - let partition_writers = (0..num_output_partitions).map(|_| None).collect(); + let partition_buffers = (0..num_output_partitions).map(|_| None).collect(); Ok(Self { output_data_file, output_index_file, - partition_writers, + partition_buffers, shuffle_block_writer, partitioning, - runtime, metrics, scratch, batch_size, @@ -103,29 +101,16 @@ impl ImmediateShufflePartitioner { }) } - fn ensure_partition_writer(&mut self, partition_id: usize) -> datafusion::common::Result<()> { - if self.partition_writers[partition_id].is_none() { - let temp_file = self - .runtime - .disk_manager - .create_tmp_file(&format!("immediate_shuffle_partition_{partition_id}"))?; - let file = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(temp_file.path()) - .map_err(|e| { - DataFusionError::Execution(format!("Error creating partition file: {e}")) - })?; + fn ensure_partition_buffer(&mut self, partition_id: usize) { + if self.partition_buffers[partition_id].is_none() { let writer = BufBatchWriter::new( self.shuffle_block_writer.clone(), - file, + Cursor::new(Vec::new()), self.write_buffer_size, self.batch_size, ); - self.partition_writers[partition_id] = Some(PartitionFileWriter { temp_file, writer }); + 
self.partition_buffers[partition_id] = Some(PartitionBuffer { writer }); } - Ok(()) } fn partitioning_batch(&mut self, input: RecordBatch) -> datafusion::common::Result<()> { @@ -177,9 +162,9 @@ impl ImmediateShufflePartitioner { let partition_batch = sorted_batch.slice(start, end - start); - self.ensure_partition_writer(partition_id)?; - let pw = self.partition_writers[partition_id].as_mut().unwrap(); - pw.writer.write( + self.ensure_partition_buffer(partition_id); + let pb = self.partition_buffers[partition_id].as_mut().unwrap(); + pb.writer.write( &partition_batch, &self.metrics.encode_time, &self.metrics.write_time, @@ -221,7 +206,7 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { with_trace("immediate_shuffle_write", self.tracing_enabled, || { let start_time = Instant::now(); - let num_output_partitions = self.partition_writers.len(); + let num_output_partitions = self.partition_buffers.len(); let mut offsets = vec![0u64; num_output_partitions + 1]; let output_data = OpenOptions::new() @@ -233,18 +218,16 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { let mut output_data = BufWriter::new(output_data); - for (partition_id, pw_slot) in self.partition_writers.iter_mut().enumerate() { + for (partition_id, pb_slot) in self.partition_buffers.iter_mut().enumerate() { offsets[partition_id] = output_data.stream_position()?; - if let Some(mut pw) = pw_slot.take() { - pw.writer + if let Some(mut pb) = pb_slot.take() { + pb.writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; - // Reopen for read without BufReader to allow kernel zero-copy - // (copy_file_range/sendfile) on supported platforms - let mut spill_file = File::open(pw.temp_file.path())?; + let buf = pb.writer.into_writer().into_inner(); let mut write_timer = self.metrics.write_time.timer(); - std::io::copy(&mut spill_file, &mut output_data)?; + output_data.write_all(&buf)?; write_timer.stop(); } } @@ -279,7 +262,7 @@ impl ShufflePartitioner for 
ImmediateShufflePartitioner { impl Debug for ImmediateShufflePartitioner { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.debug_struct("ImmediateShufflePartitioner") - .field("num_partitions", &self.partition_writers.len()) + .field("num_partitions", &self.partition_buffers.len()) .finish() } } diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index bf7dc33e20..1079f4fc12 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -241,7 +241,6 @@ async fn external_shuffle( Arc::clone(&schema), partitioning, metrics, - context.runtime_env(), context.session_config().batch_size(), codec, tracing_enabled, diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index cfddb46539..939af362b9 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -135,6 +135,13 @@ impl, W: Write> BufBatchWriter { } } +impl, W: Write> BufBatchWriter { + /// Consume this BufBatchWriter and return the underlying writer. + pub(crate) fn into_writer(self) -> W { + self.writer + } +} + impl, W: Write + Seek> BufBatchWriter { pub(crate) fn writer_stream_position(&mut self) -> datafusion::common::Result { self.writer.stream_position().map_err(Into::into) From 705130d9094ce8b5f4c710cd2d9ac4dcbc30ba7f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 14:05:27 -0600 Subject: [PATCH 14/18] feat: add memory accounting and spilling to immediate shuffle Register MemoryReservation with DataFusion's memory pool. After each batch, track in-memory buffer growth. When try_grow fails (memory pressure), spill all partition buffers to per-partition temp files, clear the Vec buffers, and free the reservation. In shuffle_write, copy spill files first then remaining in-memory data. 
Benchmark (10M rows, 200 partitions, zstd): No limit: 3.98s (2.51M rows/s) 50MB limit: 4.33s (2.31M rows/s) -- 9% spilling overhead --- native/shuffle/src/bin/shuffle_bench.rs | 17 +- .../src/partitioners/immediate_partition.rs | 155 ++++++++++++++++-- native/shuffle/src/shuffle_writer.rs | 2 + .../shuffle/src/writers/buf_batch_writer.rs | 16 ++ 4 files changed, 163 insertions(+), 27 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 67dbb9fcc9..079a890a19 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -179,13 +179,7 @@ fn main() { }; let write_elapsed = if args.concurrent_tasks > 1 { - run_concurrent_shuffle_writes( - &args.input, - &schema, - &codec, - &hash_col_indices, - &args, - ) + run_concurrent_shuffle_writes(&args.input, &schema, &codec, &hash_col_indices, &args) } else { run_shuffle_write( &args.input, @@ -228,8 +222,7 @@ fn main() { println!("=== Results ==="); let avg_write = write_times.iter().sum::() / write_times.len() as f64; - let write_throughput_rows = - (total_rows as f64 * args.concurrent_tasks as f64) / avg_write; + let write_throughput_rows = (total_rows as f64 * args.concurrent_tasks as f64) / avg_write; println!("Write:"); println!(" avg time: {:.3}s", avg_write); @@ -247,8 +240,7 @@ fn main() { args.concurrent_tasks ); if args.concurrent_tasks <= 1 { - let avg_data_size = - data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; + let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; println!( " output size: {}", format_bytes(avg_data_size as usize) @@ -256,8 +248,7 @@ fn main() { } if !read_times.is_empty() { - let avg_data_size = - data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; + let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; let avg_read = read_times.iter().sum::() / read_times.len() as f64; let read_throughput_bytes = avg_data_size 
as f64 / avg_read; diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index f32ce5d6a8..59393391d7 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -20,11 +20,9 @@ //! `MultiPartitionShuffleRepartitioner`, this implementation does not buffer raw Arrow //! `RecordBatch` objects — it uses Arrow `take` to extract per-partition slices and //! serializes them immediately through per-partition `BufBatchWriter`s into `Vec` -//! buffers. At `shuffle_write` time, the buffers are concatenated into the final -//! shuffle data file and index. -//! -//! Because the buffers hold compressed IPC (typically 5-20% of raw data size), the -//! memory footprint is much lower than holding raw Arrow batches. +//! buffers. Under memory pressure, buffers are spilled to per-partition temp files. +//! At `shuffle_write` time, spill files (if any) and remaining in-memory buffers are +//! concatenated into the final shuffle data file and index. use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::scratch::ScratchSpace; @@ -35,31 +33,46 @@ use arrow::array::{ArrayRef, RecordBatch, UInt32Array}; use arrow::compute::take; use arrow::datatypes::SchemaRef; use datafusion::common::DataFusionError; +use datafusion::execution::disk_manager::RefCountedTempFile; +use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation}; +use datafusion::execution::runtime_env::RuntimeEnv; use datafusion_comet_common::tracing::{with_trace, with_trace_async}; use std::fmt; use std::fmt::{Debug, Formatter}; use std::fs::{File, OpenOptions}; use std::io::{BufWriter, Cursor, Seek, Write}; +use std::sync::Arc; use tokio::time::Instant; -/// Per-partition in-memory buffer holding compressed IPC blocks. +/// Per-partition state: in-memory buffer for compressed IPC, plus an optional +/// spill file from prior memory-pressure events. 
struct PartitionBuffer { writer: BufBatchWriter>>, + /// Spill file accumulating data from prior spill events. Created lazily + /// on first spill. Subsequent spills append to the same file. + spill_file: Option, } -/// An immediate-mode shuffle partitioner. Each incoming batch is repartitioned using -/// `arrow::compute::take` and the per-partition slices are serialized immediately into -/// per-partition `Vec` buffers (compressed IPC). No input `RecordBatch` objects are -/// retained in memory beyond the current one. +struct SpillFile { + _temp_file: RefCountedTempFile, + file: File, +} + +/// An immediate-mode shuffle partitioner with memory accounting and spilling. +/// Each incoming batch is repartitioned using `arrow::compute::take` and the +/// per-partition slices are serialized into per-partition `Vec` buffers +/// (compressed IPC). Under memory pressure, buffers are spilled to temp files. pub(crate) struct ImmediateShufflePartitioner { output_data_file: String, output_index_file: String, partition_buffers: Vec>, shuffle_block_writer: ShuffleBlockWriter, partitioning: CometPartitioning, + runtime: Arc, metrics: ShufflePartitionerMetrics, scratch: ScratchSpace, batch_size: usize, + reservation: MemoryReservation, tracing_enabled: bool, write_buffer_size: usize, } @@ -67,11 +80,13 @@ pub(crate) struct ImmediateShufflePartitioner { impl ImmediateShufflePartitioner { #[allow(clippy::too_many_arguments)] pub(crate) fn try_new( + partition: usize, output_data_file: String, output_index_file: String, schema: SchemaRef, partitioning: CometPartitioning, metrics: ShufflePartitionerMetrics, + runtime: Arc, batch_size: usize, codec: CompressionCodec, tracing_enabled: bool, @@ -87,15 +102,21 @@ impl ImmediateShufflePartitioner { let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec)?; let partition_buffers = (0..num_output_partitions).map(|_| None).collect(); + let reservation = 
MemoryConsumer::new(format!("ImmediateShufflePartitioner[{partition}]")) + .with_can_spill(true) + .register(&runtime.memory_pool); + Ok(Self { output_data_file, output_index_file, partition_buffers, shuffle_block_writer, partitioning, + runtime, metrics, scratch, batch_size, + reservation, tracing_enabled, write_buffer_size, }) @@ -109,8 +130,92 @@ impl ImmediateShufflePartitioner { self.write_buffer_size, self.batch_size, ); - self.partition_buffers[partition_id] = Some(PartitionBuffer { writer }); + self.partition_buffers[partition_id] = Some(PartitionBuffer { + writer, + spill_file: None, + }); + } + } + + /// Spill all in-memory partition buffers to disk. + fn spill(&mut self) -> datafusion::common::Result<()> { + let total_buffered: usize = self + .partition_buffers + .iter() + .filter_map(|pb| pb.as_ref()) + .map(|pb| pb.writer.inner_buffer_len()) + .sum(); + + if total_buffered == 0 { + return Ok(()); + } + + log::info!( + "ImmediateShufflePartitioner spilling {} to disk ({} time(s) so far)", + total_buffered, + self.metrics.spill_count.value() + ); + + let mut spilled_bytes = 0usize; + + for pb in self.partition_buffers.iter_mut().flatten() { + // Flush the BufBatchWriter's coalescer into the Cursor> + pb.writer + .flush(&self.metrics.encode_time, &self.metrics.write_time)?; + + let buf = pb.writer.inner_mut().get_ref(); + if buf.is_empty() { + continue; + } + + // Ensure spill file exists + if pb.spill_file.is_none() { + let temp_file = self + .runtime + .disk_manager + .create_tmp_file("immediate_shuffle_spill")?; + let file = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(temp_file.path()) + .map_err(|e| { + DataFusionError::Execution(format!("Error creating spill file: {e}")) + })?; + pb.spill_file = Some(SpillFile { + _temp_file: temp_file, + file, + }); + } + + // Append buffer contents to spill file + let spill = pb.spill_file.as_mut().unwrap(); + let mut write_timer = self.metrics.write_time.timer(); + 
spill.file.write_all(buf)?; + write_timer.stop(); + spilled_bytes += buf.len(); + + // Reset the in-memory buffer (keep allocated capacity for reuse) + let cursor = pb.writer.inner_mut(); + cursor.get_mut().clear(); + cursor.set_position(0); } + + self.reservation.free(); + self.metrics.spill_count.add(1); + self.metrics.spilled_bytes.add(spilled_bytes); + + Ok(()) + } + + /// Returns the total size of in-memory buffers across all partitions. + /// Includes both the BufBatchWriter staging buffer and the Cursor> output. + fn total_buffer_size(&self) -> usize { + self.partition_buffers + .iter() + .filter_map(|pb| pb.as_ref()) + .map(|pb| pb.writer.inner_buffer_len() + pb.writer.inner_ref().get_ref().len()) + .sum() } fn partitioning_batch(&mut self, input: RecordBatch) -> datafusion::common::Result<()> { @@ -153,6 +258,8 @@ impl ImmediateShufflePartitioner { .collect::>>()?; let sorted_batch = RecordBatch::try_new(input.schema(), sorted_columns)?; + let size_before = self.total_buffer_size(); + for partition_id in 0..num_output_partitions { let start = scratch.partition_starts[partition_id] as usize; let end = scratch.partition_starts[partition_id + 1] as usize; @@ -171,6 +278,12 @@ impl ImmediateShufflePartitioner { )?; } + let size_after = self.total_buffer_size(); + let mem_growth = size_after.saturating_sub(size_before); + if mem_growth > 0 && self.reservation.try_grow(mem_growth).is_err() { + self.spill()?; + } + self.scratch = scratch; Ok(()) } @@ -222,13 +335,26 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { offsets[partition_id] = output_data.stream_position()?; if let Some(mut pb) = pb_slot.take() { + // Copy spill file contents first (from prior spill events) + if let Some(mut spill) = pb.spill_file.take() { + spill.file.flush()?; + let spill_path = spill._temp_file.path().to_path_buf(); + let mut spill_reader = File::open(&spill_path)?; + let mut write_timer = self.metrics.write_time.timer(); + std::io::copy(&mut spill_reader, &mut 
output_data)?; + write_timer.stop(); + } + + // Flush and write remaining in-memory buffer pb.writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; let buf = pb.writer.into_writer().into_inner(); - let mut write_timer = self.metrics.write_time.timer(); - output_data.write_all(&buf)?; - write_timer.stop(); + if !buf.is_empty() { + let mut write_timer = self.metrics.write_time.timer(); + output_data.write_all(&buf)?; + write_timer.stop(); + } } } @@ -263,6 +389,7 @@ impl Debug for ImmediateShufflePartitioner { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.debug_struct("ImmediateShufflePartitioner") .field("num_partitions", &self.partition_buffers.len()) + .field("memory_used", &self.reservation.size()) .finish() } } diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index 1079f4fc12..67314b4961 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -236,11 +236,13 @@ async fn external_shuffle( } _ if shuffle_mode == ShuffleMode::Immediate => { Box::new(ImmediateShufflePartitioner::try_new( + partition, output_data_file, output_index_file, Arc::clone(&schema), partitioning, metrics, + context.runtime_env(), context.session_config().batch_size(), codec, tracing_enabled, diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index 939af362b9..407df4ee9d 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -140,6 +140,22 @@ impl, W: Write> BufBatchWriter { pub(crate) fn into_writer(self) -> W { self.writer } + + /// Returns a reference to the underlying writer. + pub(crate) fn inner_ref(&self) -> &W { + &self.writer + } + + /// Returns a mutable reference to the underlying writer. 
+ pub(crate) fn inner_mut(&mut self) -> &mut W { + &mut self.writer + } + + /// Returns the total buffered size: the internal byte buffer plus + /// the BufBatchWriter's own serialization buffer. + pub(crate) fn inner_buffer_len(&self) -> usize { + self.buffer.len() + } } impl, W: Write + Seek> BufBatchWriter { From 862d01e0bcf9c5b758137dfc0afac9466558e315 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 14:12:50 -0600 Subject: [PATCH 15/18] refactor: encapsulate buffer access and extract shared index writer - Replace leaky inner_ref/inner_mut/inner_buffer_len methods on BufBatchWriter with encapsulated Cursor>-specific methods: buffered_output_size, reset_output_buffer, output_bytes - Extract write_index_file utility, replacing duplicate code in all three partitioners - Rename _temp_file to temp_file (RAII guard, not unused) - Remove unnecessary comments --- .../src/partitioners/immediate_partition.rs | 73 ++++--------------- .../src/partitioners/multi_partition.rs | 9 +-- .../src/partitioners/single_partition.rs | 14 +--- .../shuffle/src/writers/buf_batch_writer.rs | 26 ++++--- native/shuffle/src/writers/mod.rs | 17 +++++ 5 files changed, 50 insertions(+), 89 deletions(-) diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index 59393391d7..44a42e3fa4 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -15,19 +15,14 @@ // specific language governing permissions and limitations // under the License. -//! An immediate-mode shuffle partitioner that repartitions incoming batches and writes -//! per-partition data to in-memory buffers containing compressed IPC blocks. Unlike -//! `MultiPartitionShuffleRepartitioner`, this implementation does not buffer raw Arrow -//! `RecordBatch` objects — it uses Arrow `take` to extract per-partition slices and -//! 
serializes them immediately through per-partition `BufBatchWriter`s into `Vec` -//! buffers. Under memory pressure, buffers are spilled to per-partition temp files. -//! At `shuffle_write` time, spill files (if any) and remaining in-memory buffers are -//! concatenated into the final shuffle data file and index. +//! Immediate-mode shuffle partitioner: repartitions each incoming batch via Arrow `take`, +//! serializes per-partition slices into in-memory compressed IPC buffers, and spills to +//! disk under memory pressure. use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::scratch::ScratchSpace; use crate::partitioners::ShufflePartitioner; -use crate::writers::BufBatchWriter; +use crate::writers::{write_index_file, BufBatchWriter}; use crate::{CometPartitioning, CompressionCodec, ShuffleBlockWriter}; use arrow::array::{ArrayRef, RecordBatch, UInt32Array}; use arrow::compute::take; @@ -44,24 +39,17 @@ use std::io::{BufWriter, Cursor, Seek, Write}; use std::sync::Arc; use tokio::time::Instant; -/// Per-partition state: in-memory buffer for compressed IPC, plus an optional -/// spill file from prior memory-pressure events. struct PartitionBuffer { writer: BufBatchWriter>>, - /// Spill file accumulating data from prior spill events. Created lazily - /// on first spill. Subsequent spills append to the same file. spill_file: Option, } struct SpillFile { - _temp_file: RefCountedTempFile, + /// Kept alive to prevent temp file deletion (RAII guard). + temp_file: RefCountedTempFile, file: File, } -/// An immediate-mode shuffle partitioner with memory accounting and spilling. -/// Each incoming batch is repartitioned using `arrow::compute::take` and the -/// per-partition slices are serialized into per-partition `Vec` buffers -/// (compressed IPC). Under memory pressure, buffers are spilled to temp files. 
pub(crate) struct ImmediateShufflePartitioner { output_data_file: String, output_index_file: String, @@ -137,38 +125,27 @@ impl ImmediateShufflePartitioner { } } - /// Spill all in-memory partition buffers to disk. fn spill(&mut self) -> datafusion::common::Result<()> { - let total_buffered: usize = self - .partition_buffers - .iter() - .filter_map(|pb| pb.as_ref()) - .map(|pb| pb.writer.inner_buffer_len()) - .sum(); - - if total_buffered == 0 { + if self.partition_buffers.iter().all(|pb| pb.is_none()) { return Ok(()); } log::info!( - "ImmediateShufflePartitioner spilling {} to disk ({} time(s) so far)", - total_buffered, + "ImmediateShufflePartitioner spilling to disk ({} time(s) so far)", self.metrics.spill_count.value() ); let mut spilled_bytes = 0usize; for pb in self.partition_buffers.iter_mut().flatten() { - // Flush the BufBatchWriter's coalescer into the Cursor> pb.writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; - let buf = pb.writer.inner_mut().get_ref(); + let buf = pb.writer.output_bytes(); if buf.is_empty() { continue; } - // Ensure spill file exists if pb.spill_file.is_none() { let temp_file = self .runtime @@ -182,23 +159,16 @@ impl ImmediateShufflePartitioner { .map_err(|e| { DataFusionError::Execution(format!("Error creating spill file: {e}")) })?; - pb.spill_file = Some(SpillFile { - _temp_file: temp_file, - file, - }); + pb.spill_file = Some(SpillFile { temp_file, file }); } - // Append buffer contents to spill file let spill = pb.spill_file.as_mut().unwrap(); let mut write_timer = self.metrics.write_time.timer(); spill.file.write_all(buf)?; write_timer.stop(); spilled_bytes += buf.len(); - // Reset the in-memory buffer (keep allocated capacity for reuse) - let cursor = pb.writer.inner_mut(); - cursor.get_mut().clear(); - cursor.set_position(0); + pb.writer.reset_output_buffer(); } self.reservation.free(); @@ -208,13 +178,11 @@ impl ImmediateShufflePartitioner { Ok(()) } - /// Returns the total size of in-memory buffers across 
all partitions. - /// Includes both the BufBatchWriter staging buffer and the Cursor> output. fn total_buffer_size(&self) -> usize { self.partition_buffers .iter() .filter_map(|pb| pb.as_ref()) - .map(|pb| pb.writer.inner_buffer_len() + pb.writer.inner_ref().get_ref().len()) + .map(|pb| pb.writer.buffered_output_size()) .sum() } @@ -278,8 +246,7 @@ impl ImmediateShufflePartitioner { )?; } - let size_after = self.total_buffer_size(); - let mem_growth = size_after.saturating_sub(size_before); + let mem_growth = self.total_buffer_size().saturating_sub(size_before); if mem_growth > 0 && self.reservation.try_grow(mem_growth).is_err() { self.spill()?; } @@ -335,17 +302,14 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { offsets[partition_id] = output_data.stream_position()?; if let Some(mut pb) = pb_slot.take() { - // Copy spill file contents first (from prior spill events) if let Some(mut spill) = pb.spill_file.take() { spill.file.flush()?; - let spill_path = spill._temp_file.path().to_path_buf(); - let mut spill_reader = File::open(&spill_path)?; + let mut spill_reader = File::open(spill.temp_file.path())?; let mut write_timer = self.metrics.write_time.timer(); std::io::copy(&mut spill_reader, &mut output_data)?; write_timer.stop(); } - // Flush and write remaining in-memory buffer pb.writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; @@ -365,14 +329,7 @@ impl ShufflePartitioner for ImmediateShufflePartitioner { offsets[num_output_partitions] = output_data.stream_position()?; let mut write_timer = self.metrics.write_time.timer(); - let mut output_index = - BufWriter::new(File::create(&self.output_index_file).map_err(|e| { - DataFusionError::Execution(format!("shuffle write error: {e:?}")) - })?); - for offset in offsets { - output_index.write_all(&(offset as i64).to_le_bytes()[..])?; - } - output_index.flush()?; + write_index_file(&self.output_index_file, &offsets)?; write_timer.stop(); self.metrics diff --git 
a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 8a5007142c..ccc3ed225e 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -371,14 +371,7 @@ impl ShufflePartitioner for MultiPartitionShuffleRepartitioner { offsets[num_output_partitions] = output_data.stream_position()?; let mut write_timer = self.metrics.write_time.timer(); - let mut output_index = - BufWriter::new(File::create(index_file).map_err(|e| { - DataFusionError::Execution(format!("shuffle write error: {e:?}")) - })?); - for offset in offsets { - output_index.write_all(&(offset as i64).to_le_bytes()[..])?; - } - output_index.flush()?; + crate::writers::write_index_file(&index_file, &offsets)?; write_timer.stop(); self.metrics diff --git a/native/shuffle/src/partitioners/single_partition.rs b/native/shuffle/src/partitioners/single_partition.rs index 5801ef613b..778c1d9416 100644 --- a/native/shuffle/src/partitioners/single_partition.rs +++ b/native/shuffle/src/partitioners/single_partition.rs @@ -23,7 +23,6 @@ use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; use datafusion::common::DataFusionError; use std::fs::{File, OpenOptions}; -use std::io::{BufWriter, Write}; use tokio::time::Instant; /// A partitioner that writes all shuffle data to a single file and a single index file @@ -169,19 +168,8 @@ impl ShufflePartitioner for SinglePartitionShufflePartitioner { self.output_data_writer .flush(&self.metrics.encode_time, &self.metrics.write_time)?; - // Write index file. 
It should only contain 2 entries: 0 and the total number of bytes written - let index_file = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(self.output_index_path.clone()) - .map_err(|e| DataFusionError::Execution(format!("shuffle write error: {e:?}")))?; - let mut index_buf_writer = BufWriter::new(index_file); let data_file_length = self.output_data_writer.writer_stream_position()?; - for offset in [0, data_file_length] { - index_buf_writer.write_all(&(offset as i64).to_le_bytes()[..])?; - } - index_buf_writer.flush()?; + crate::writers::write_index_file(&self.output_index_path, &[0, data_file_length])?; self.metrics .baseline diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index 407df4ee9d..84f658240d 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -140,21 +140,27 @@ impl, W: Write> BufBatchWriter { pub(crate) fn into_writer(self) -> W { self.writer } +} - /// Returns a reference to the underlying writer. - pub(crate) fn inner_ref(&self) -> &W { - &self.writer +impl> BufBatchWriter>> { + /// Returns the total bytes buffered: the staging buffer (pre-flush to writer) + /// plus the accumulated output in the Cursor's Vec. + pub(crate) fn buffered_output_size(&self) -> usize { + self.buffer.len() + self.writer.get_ref().len() } - /// Returns a mutable reference to the underlying writer. - pub(crate) fn inner_mut(&mut self) -> &mut W { - &mut self.writer + /// Reset the output buffer, keeping allocated capacity for reuse. + /// Returns the number of bytes that were in the output buffer. + pub(crate) fn reset_output_buffer(&mut self) -> usize { + let len = self.writer.get_ref().len(); + self.writer.get_mut().clear(); + self.writer.set_position(0); + len } - /// Returns the total buffered size: the internal byte buffer plus - /// the BufBatchWriter's own serialization buffer. 
- pub(crate) fn inner_buffer_len(&self) -> usize { - self.buffer.len() + /// Returns a reference to the accumulated output bytes. + pub(crate) fn output_bytes(&self) -> &[u8] { + self.writer.get_ref() } } diff --git a/native/shuffle/src/writers/mod.rs b/native/shuffle/src/writers/mod.rs index 75caf9f3a3..8de3c7ec9d 100644 --- a/native/shuffle/src/writers/mod.rs +++ b/native/shuffle/src/writers/mod.rs @@ -24,3 +24,20 @@ pub(crate) use buf_batch_writer::BufBatchWriter; pub(crate) use checksum::Checksum; pub use shuffle_block_writer::{CompressionCodec, ShuffleBlockWriter}; pub(crate) use spill::PartitionWriter; + +use datafusion::common::DataFusionError; +use std::fs::File; +use std::io::{BufWriter, Write}; + +/// Write shuffle index file: an array of i64 little-endian byte offsets. +pub(crate) fn write_index_file(path: &str, offsets: &[u64]) -> datafusion::common::Result<()> { + let mut writer = BufWriter::new( + File::create(path) + .map_err(|e| DataFusionError::Execution(format!("shuffle write error: {e:?}")))?, + ); + for &offset in offsets { + writer.write_all(&(offset as i64).to_le_bytes()[..])?; + } + writer.flush()?; + Ok(()) +} From e7d38daaef77de89cd23db306fd8e78c9e93fd6a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 15:14:51 -0600 Subject: [PATCH 16/18] fix: use per-partition take instead of full-batch reorder The take-then-slice approach (single take to reorder entire batch, then zero-copy slice per partition) caused the BatchCoalescer to copy all data again via copy_rows, resulting in 2x data movement. When combined with concat_batches (attempted optimization), it produced 66 GiB output instead of 396 MiB due to missing coalescing. Switch to per-partition take with shared UInt32Array indices (sliced per partition to avoid allocation). Each partition's small batch (~40 rows) goes through BufBatchWriter's BatchCoalescer which handles proper IPC block coalescing and compression. 
Benchmark (10M rows, 200 partitions, zstd): buffered: 4.84s (2.07M rows/s) immediate: 4.38s (2.28M rows/s) -- 10% faster --- .../src/partitioners/immediate_partition.rs | 52 +++++++++---------- .../shuffle/src/writers/buf_batch_writer.rs | 10 ++-- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index 44a42e3fa4..f69e5addc1 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! Immediate-mode shuffle partitioner: repartitions each incoming batch via Arrow `take`, -//! serializes per-partition slices into in-memory compressed IPC buffers, and spills to -//! disk under memory pressure. +//! Immediate-mode shuffle partitioner: repartitions each incoming batch via per-partition +//! Arrow `take`, serializes through per-partition `BufBatchWriter`s into in-memory +//! compressed IPC buffers, and spills to disk under memory pressure. 
use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::scratch::ScratchSpace; @@ -126,10 +126,6 @@ impl ImmediateShufflePartitioner { } fn spill(&mut self) -> datafusion::common::Result<()> { - if self.partition_buffers.iter().all(|pb| pb.is_none()) { - return Ok(()); - } - log::info!( "ImmediateShufflePartitioner spilling to disk ({} time(s) so far)", self.metrics.spill_count.value() @@ -178,14 +174,6 @@ impl ImmediateShufflePartitioner { Ok(()) } - fn total_buffer_size(&self) -> usize { - self.partition_buffers - .iter() - .filter_map(|pb| pb.as_ref()) - .map(|pb| pb.writer.buffered_output_size()) - .sum() - } - fn partitioning_batch(&mut self, input: RecordBatch) -> datafusion::common::Result<()> { if input.num_rows() == 0 { return Ok(()); @@ -210,21 +198,14 @@ impl ImmediateShufflePartitioner { timer.stop(); } - // Single take per column to reorder entire batch by partition assignment, - // then zero-copy slice per partition. This replaces P*C take calls with just C. + // Build a single UInt32Array from partition_row_indices and slice per partition + // to avoid per-partition allocation. Per-partition take produces small batches + // (~40 rows) that the BufBatchWriter's BatchCoalescer accumulates into batch_size + // chunks before serializing to compressed IPC. 
let num_rows = input.num_rows(); let all_indices = UInt32Array::from_iter_values( scratch.partition_row_indices[..num_rows].iter().copied(), ); - let sorted_columns: Vec = input - .columns() - .iter() - .map(|col| { - take(col, &all_indices, None) - .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) - }) - .collect::>>()?; - let sorted_batch = RecordBatch::try_new(input.schema(), sorted_columns)?; let size_before = self.total_buffer_size(); @@ -235,7 +216,16 @@ impl ImmediateShufflePartitioner { continue; } - let partition_batch = sorted_batch.slice(start, end - start); + let indices = all_indices.slice(start, end - start); + let columns: Vec = input + .columns() + .iter() + .map(|col| { + take(col, &indices, None) + .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) + }) + .collect::>>()?; + let partition_batch = RecordBatch::try_new(input.schema(), columns)?; self.ensure_partition_buffer(partition_id); let pb = self.partition_buffers[partition_id].as_mut().unwrap(); @@ -254,6 +244,14 @@ impl ImmediateShufflePartitioner { self.scratch = scratch; Ok(()) } + + fn total_buffer_size(&self) -> usize { + self.partition_buffers + .iter() + .filter_map(|pb| pb.as_ref()) + .map(|pb| pb.writer.buffered_output_size()) + .sum() + } } #[async_trait::async_trait] diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index 84f658240d..0b10396ead 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -143,22 +143,18 @@ impl, W: Write> BufBatchWriter { } impl> BufBatchWriter>> { - /// Returns the total bytes buffered: the staging buffer (pre-flush to writer) - /// plus the accumulated output in the Cursor's Vec. + /// Total bytes buffered: staging buffer + accumulated compressed IPC output. 
pub(crate) fn buffered_output_size(&self) -> usize { self.buffer.len() + self.writer.get_ref().len() } /// Reset the output buffer, keeping allocated capacity for reuse. - /// Returns the number of bytes that were in the output buffer. - pub(crate) fn reset_output_buffer(&mut self) -> usize { - let len = self.writer.get_ref().len(); + pub(crate) fn reset_output_buffer(&mut self) { self.writer.get_mut().clear(); self.writer.set_position(0); - len } - /// Returns a reference to the accumulated output bytes. + /// Returns a reference to the accumulated compressed IPC output. pub(crate) fn output_bytes(&self) -> &[u8] { self.writer.get_ref() } From 86bae923cc5d45503bd8fd1f0fae356fb692c038 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 15:53:58 -0600 Subject: [PATCH 17/18] revert: default shuffle mode back to buffered --- common/src/main/scala/org/apache/comet/CometConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index 908c5638c1..3f8d6bd221 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -545,7 +545,7 @@ object CometConf extends ShimCometConf { "data directly to individual files, avoiding in-memory buffering of input batches.") .stringConf .checkValues(Set("buffered", "immediate")) - .createWithDefault("immediate") + .createWithDefault("buffered") val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( "spark.comet.shuffle.preferDictionary.ratio") From 3aade30844a57cd065054d34677dd440646b60c3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 29 Mar 2026 14:39:05 -0600 Subject: [PATCH 18/18] perf: single take() + zero-copy slice() instead of per-partition take() --- .../src/partitioners/immediate_partition.rs | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git 
a/native/shuffle/src/partitioners/immediate_partition.rs b/native/shuffle/src/partitioners/immediate_partition.rs index f69e5addc1..1bad02155f 100644 --- a/native/shuffle/src/partitioners/immediate_partition.rs +++ b/native/shuffle/src/partitioners/immediate_partition.rs @@ -198,15 +198,24 @@ impl ImmediateShufflePartitioner { timer.stop(); } - // Build a single UInt32Array from partition_row_indices and slice per partition - // to avoid per-partition allocation. Per-partition take produces small batches - // (~40 rows) that the BufBatchWriter's BatchCoalescer accumulates into batch_size - // chunks before serializing to compressed IPC. + // Reorder the entire batch once by partition using a single take(), then + // use zero-copy slice() per partition. This replaces N per-partition take() + // calls (each allocating new arrays) with 1 take() + N free slice() calls. let num_rows = input.num_rows(); let all_indices = UInt32Array::from_iter_values( scratch.partition_row_indices[..num_rows].iter().copied(), ); + let reordered_columns: Vec = input + .columns() + .iter() + .map(|col| { + take(col, &all_indices, None) + .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) + }) + .collect::>>()?; + let reordered = RecordBatch::try_new(input.schema(), reordered_columns)?; + let size_before = self.total_buffer_size(); for partition_id in 0..num_output_partitions { @@ -216,16 +225,7 @@ impl ImmediateShufflePartitioner { continue; } - let indices = all_indices.slice(start, end - start); - let columns: Vec = input - .columns() - .iter() - .map(|col| { - take(col, &indices, None) - .map_err(|e| DataFusionError::ArrowError(Box::from(e), None)) - }) - .collect::>>()?; - let partition_batch = RecordBatch::try_new(input.schema(), columns)?; + let partition_batch = reordered.slice(start, end - start); self.ensure_partition_buffer(partition_id); let pb = self.partition_buffers[partition_id].as_mut().unwrap();