Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions native/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions native/shuffle/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ publish = false
arrow = { workspace = true }
async-trait = { workspace = true }
bytes = { workspace = true }
crc32c = "0.6.8"
crc32fast = "1.3.2"
datafusion = { workspace = true }
datafusion-comet-common = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions native/shuffle/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pub mod ipc;
pub(crate) mod metrics;
pub(crate) mod partitioners;
mod shuffle_writer;
mod spark_crc32c_hasher;
pub mod spark_unsafe;
pub(crate) mod writers;

Expand Down
55 changes: 55 additions & 0 deletions native/shuffle/src/spark_crc32c_hasher.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
//! Provide a CRC-32C implementor of [Hasher].
use std::hash::Hasher;

use crc32c::crc32c_append;

/// Implementor of [Hasher] for CRC-32C (the Castagnoli polynomial).
///
/// Note that CRC-32C produces a 32-bit hash (as [u32]),
/// but the trait requires that the output value be [u64]; [Hasher::finish]
/// zero-extends the 32-bit checksum into the low half of the returned [u64].
///
/// This implementation is necessary because the existing [Hasher] implementation does not support
/// [Clone].
#[derive(Default, Clone)]
pub struct SparkCrc32cHasher {
    // Running CRC-32C state. `Default` starts it at 0, the conventional
    // initial value for a fresh CRC computation.
    checksum: u32,
}

impl SparkCrc32cHasher {
    /// Create the [Hasher] pre-loaded with a particular checksum.
    ///
    /// This allows resuming a CRC-32C computation from a previously
    /// [`finalize`](Self::finalize)d value.
    ///
    /// Use the [Default::default()] constructor for a clean start.
    pub fn new(initial: u32) -> Self {
        Self { checksum: initial }
    }

    /// Return the current checksum as the native 32-bit CRC-32C value.
    ///
    /// Unlike [Hasher::finish] this does not widen to [u64]. It does not
    /// consume or reset the hasher, so more bytes may be written afterwards.
    #[must_use]
    pub fn finalize(&self) -> u32 {
        self.checksum
    }
}

impl Hasher for SparkCrc32cHasher {
    // Widen the 32-bit checksum into the u64 the trait requires; the upper
    // 32 bits are always zero.
    fn finish(&self) -> u64 {
        self.checksum as u64
    }

    // Fold `bytes` into the running checksum. `crc32c_append` takes the CRC
    // of everything written so far, so repeated calls are equivalent to a
    // single call over the concatenated input.
    fn write(&mut self, bytes: &[u8]) {
        self.checksum = crc32c_append(self.checksum, bytes);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    const TEST_STRING: &[u8] =
        b"This is a very long string which is used to test the CRC-32-Castagnoli function.";
    const CHECKSUM: u32 = 0x20_CB_1E_59;

    #[test]
    fn can_hash() {
        let mut hasher = SparkCrc32cHasher::default();
        hasher.write(TEST_STRING);
        assert_eq!(hasher.finish(), CHECKSUM as u64);
        // `finalize` must agree with `finish`, modulo the u64 widening.
        assert_eq!(hasher.finalize(), CHECKSUM);
    }

    #[test]
    fn can_resume_from_initial_checksum() {
        // Hashing in two chunks — resuming via `new(initial)` — must produce
        // the same value as hashing the whole input in one shot.
        let (head, tail) = TEST_STRING.split_at(TEST_STRING.len() / 2);
        let mut first = SparkCrc32cHasher::default();
        first.write(head);
        let mut second = SparkCrc32cHasher::new(first.finalize());
        second.write(tail);
        assert_eq!(second.finalize(), CHECKSUM);
    }
}
78 changes: 74 additions & 4 deletions native/shuffle/src/writers/checksum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,32 @@
// under the License.

use bytes::Buf;
use crc32fast::Hasher;
use datafusion_comet_jni_bridge::errors::{CometError, CometResult};
use simd_adler32::Adler32;
use std::hash::Hasher;
use std::io::{Cursor, SeekFrom};
use std::default::Default;
use crate::spark_crc32c_hasher::SparkCrc32cHasher;

/// Checksum algorithms for writing IPC bytes.
#[derive(Clone)]
pub(crate) enum Checksum {
/// CRC32 checksum algorithm.
CRC32(Hasher),
CRC32(crc32fast::Hasher),
/// Adler32 checksum algorithm.
Adler32(Adler32),
/// CRC32C checksum algorithm.
CRC32C(SparkCrc32cHasher),
}

impl Checksum {
pub(crate) fn try_new(algo: i32, initial_opt: Option<u32>) -> CometResult<Self> {
match algo {
0 => {
let hasher = if let Some(initial) = initial_opt {
Hasher::new_with_initial(initial)
crc32fast::Hasher::new_with_initial(initial)
} else {
Hasher::new()
crc32fast::Hasher::new()
};
Ok(Checksum::CRC32(hasher))
}
Expand All @@ -51,6 +55,14 @@ impl Checksum {
};
Ok(Checksum::Adler32(hasher))
}
2 => {
let hasher = if let Some(initial) = initial_opt {
SparkCrc32cHasher::new(initial)
} else {
Default::default()
};
Ok(Checksum::CRC32C(hasher))
}
_ => Err(CometError::Internal(
"Unsupported checksum algorithm".to_string(),
)),
Expand All @@ -69,13 +81,71 @@ impl Checksum {
hasher.write(cursor.chunk());
Ok(())
}
Checksum::CRC32C(hasher) => {
std::io::Seek::seek(cursor, SeekFrom::Start(0))?;
hasher.write(cursor.chunk());
Ok(())
}
}
}

pub(crate) fn finalize(self) -> u32 {
match self {
Checksum::CRC32(hasher) => hasher.finalize(),
Checksum::Adler32(hasher) => hasher.finish(),
Checksum::CRC32C(hasher) => hasher.finalize(),
}
}
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Build a `Checksum` for `algo`, feed it `message` through `update`,
    /// and return the finalized 32-bit checksum.
    fn checksum_of(algo: i32, message: &[u8]) -> u32 {
        let mut checksum = Checksum::try_new(algo, None).unwrap();
        let mut vector: Vec<u8> = message.to_vec();
        let mut buff = Cursor::new(&mut vector);
        checksum.update(&mut buff).unwrap();
        checksum.finalize()
    }

    #[test]
    fn test_crc32() {
        // Standard CRC-32 check value for the ASCII digits "123456789".
        assert_eq!(checksum_of(0, b"123456789"), 0xcbf43926u32);
    }

    #[test]
    fn test_adler32() {
        // Standard Adler-32 check value for the ASCII digits "123456789".
        assert_eq!(checksum_of(1, b"123456789"), 0x091e01deu32);
    }

    #[test]
    fn test_crc32c() {
        // Standard CRC-32C (Castagnoli) check value for "123456789".
        assert_eq!(checksum_of(2, b"123456789"), 0xe3069283u32);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public abstract class SpillWriter {

protected byte[][] dataTypes;

// 0: CRC32, 1: Adler32. Spark uses Adler32 by default.
// 0: CRC32, 1: Adler32, or 2: CRC32C. Spark uses Adler32 by default.
protected int checksumAlgo = 1;
protected long checksum = -1;

Expand Down Expand Up @@ -98,6 +98,8 @@ protected void setChecksumAlgo(String checksumAlgo) {
this.checksumAlgo = 0;
} else if (algo.equals("adler32")) {
this.checksumAlgo = 1;
} else if (algo.equals("crc32c")) {
this.checksumAlgo = 2;
} else {
throw new UnsupportedOperationException(
"Unsupported shuffle checksum algorithm: " + checksumAlgo);
Expand Down