@@ -35,23 +35,6 @@ use rand_distr::{Normal, Pareto};
3535use std:: fmt:: Write ;
3636use std:: sync:: Arc ;
3737
38- /// Payload profile for the benchmark `utf8` column.
39- ///
40- /// The small profile preserves the existing `hi0`..`hi3` baseline. Medium and
41- /// large profiles keep the same low cardinality but scale each value's byte
42- /// width so string aggregation can expose the cost of copying larger payloads.
43- #[ derive( Clone , Copy , Debug ) ]
44- pub enum Utf8PayloadProfile {
45- /// 3-byte baseline values such as `hi0`.
46- Small ,
47- /// 64-byte payloads that are large enough to make copying noticeable
48- /// without dominating the benchmark with allocator churn.
49- Medium ,
50- /// 1024-byte payloads that amplify both CPU and memory pressure in
51- /// grouped `string_agg` workloads.
52- Large ,
53- }
54-
5538/// create an in-memory table given the partition len, array len, and batch size,
5639/// and the result table will be of array_len in total, and then partitioned, and batched.
5740#[ expect( clippy:: allow_attributes) ] // some issue where expect(dead_code) doesn't fire properly
@@ -61,32 +44,9 @@ pub fn create_table_provider(
6144 array_len : usize ,
6245 batch_size : usize ,
6346) -> Result < Arc < MemTable > > {
64- create_table_provider_with_payload (
65- partitions_len,
66- array_len,
67- batch_size,
68- Utf8PayloadProfile :: Small ,
69- )
70- }
71-
72- /// Create an in-memory table with a configurable `utf8` payload size.
73- #[ expect( clippy:: allow_attributes) ] // some issue where expect(dead_code) doesn't fire properly
74- #[ allow( dead_code) ]
75- pub fn create_table_provider_with_payload (
76- partitions_len : usize ,
77- array_len : usize ,
78- batch_size : usize ,
79- utf8_payload_profile : Utf8PayloadProfile ,
80- ) -> Result < Arc < MemTable > > {
81- let _ = Utf8PayloadProfile :: all ( ) ;
8247 let schema = Arc :: new ( create_schema ( ) ) ;
83- let partitions = create_record_batches (
84- & schema,
85- array_len,
86- partitions_len,
87- batch_size,
88- utf8_payload_profile,
89- ) ;
48+ let partitions =
49+ create_record_batches ( & schema, array_len, partitions_len, batch_size) ;
9050 // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
9151 MemTable :: try_new ( schema, partitions) . map ( Arc :: new)
9252}
@@ -131,14 +91,12 @@ fn create_record_batch(
13191 rng : & mut StdRng ,
13292 batch_size : usize ,
13393 batch_index : usize ,
134- payloads : & [ String ; 4 ] ,
13594) -> RecordBatch {
13695 // Randomly choose from 4 distinct key values; a higher number increases sparseness.
13796 let key_suffixes = [ 0 , 1 , 2 , 3 ] ;
138- let keys = StringArray :: from_iter_values ( ( 0 ..batch_size) . map ( |_| {
139- let suffix = * key_suffixes. choose ( rng) . unwrap ( ) ;
140- payloads[ suffix] . as_str ( )
141- } ) ) ;
97+ let keys = StringArray :: from_iter_values (
98+ ( 0 ..batch_size) . map ( |_| format ! ( "hi{}" , key_suffixes. choose( rng) . unwrap( ) ) ) ,
99+ ) ;
142100
143101 let values = create_data ( rng, batch_size, 0.5 ) ;
144102
@@ -188,12 +146,10 @@ pub fn create_record_batches(
188146 array_len : usize ,
189147 partitions_len : usize ,
190148 batch_size : usize ,
191- utf8_payload_profile : Utf8PayloadProfile ,
192149) -> Vec < Vec < RecordBatch > > {
193150 let mut rng = StdRng :: seed_from_u64 ( 42 ) ;
194151 let mut partitions = Vec :: with_capacity ( partitions_len) ;
195152 let batches_per_partition = array_len / batch_size / partitions_len;
196- let payloads = utf8_payload_profile. payloads ( ) ;
197153
198154 for _ in 0 ..partitions_len {
199155 let mut batches = Vec :: with_capacity ( batches_per_partition) ;
@@ -203,37 +159,13 @@ pub fn create_record_batches(
203159 & mut rng,
204160 batch_size,
205161 batch_index,
206- & payloads,
207162 ) ) ;
208163 }
209164 partitions. push ( batches) ;
210165 }
211166 partitions
212167}
213168
214- impl Utf8PayloadProfile {
215- fn all ( ) -> [ Self ; 3 ] {
216- [ Self :: Small , Self :: Medium , Self :: Large ]
217- }
218-
219- fn payloads ( self ) -> [ String ; 4 ] {
220- std:: array:: from_fn ( |idx| match self {
221- Self :: Small => format ! ( "hi{idx}" ) ,
222- Self :: Medium => payload_string ( "mid" , idx, 64 ) ,
223- Self :: Large => payload_string ( "large" , idx, 1024 ) ,
224- } )
225- }
226- }
227-
228- fn payload_string ( prefix : & str , suffix : usize , target_len : usize ) -> String {
229- let mut value = format ! ( "{prefix}{suffix}_" ) ;
230- value. extend ( std:: iter:: repeat_n (
231- ( b'a' + suffix as u8 ) as char ,
232- target_len - value. len ( ) ,
233- ) ) ;
234- value
235- }
236-
237169/// An enum that wraps either a regular StringBuilder or a GenericByteViewBuilder
238170/// so that both can be used interchangeably.
239171enum TraceIdBuilder {
0 commit comments