Skip to content

Commit 75b9af5

Browse files
feat[array]: dict array allows signed codes (#6336)
To support the current `take` compute function allowing signed indices we allow the dict array to also store signed codes`. This is a backwards compatible change. We could add another array take that is only used for compute/? --------- Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent 7057a13 commit 75b9af5

8 files changed

Lines changed: 53 additions & 16 deletions

File tree

vortex-array/src/arrays/dict/arbitrary.rs

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,26 @@ impl ArbitraryDictArray {
4646

4747
// Choose a random PType at least as wide as the minimum
4848
let valid_ptypes: &[PType] = match min_codes_ptype {
49-
PType::U8 => &[PType::U8, PType::U16, PType::U32, PType::U64],
50-
PType::U16 => &[PType::U16, PType::U32, PType::U64],
51-
PType::U32 => &[PType::U32, PType::U64],
52-
PType::U64 => &[PType::U64],
49+
PType::U8 => &[
50+
PType::U8,
51+
PType::U16,
52+
PType::U32,
53+
PType::U64,
54+
PType::I8,
55+
PType::I16,
56+
PType::I32,
57+
PType::I64,
58+
],
59+
PType::U16 => &[
60+
PType::U16,
61+
PType::U32,
62+
PType::U64,
63+
PType::I16,
64+
PType::I32,
65+
PType::I64,
66+
],
67+
PType::U32 => &[PType::U32, PType::U64, PType::I32, PType::I64],
68+
PType::U64 => &[PType::U64, PType::I64],
5369
_ => unreachable!(),
5470
};
5571
let codes_ptype = *u.choose(valid_ptypes)?;
@@ -61,6 +77,10 @@ impl ArbitraryDictArray {
6177
PType::U16 => random_codes::<u16>(u, codes_len, values_len, codes_nullable)?,
6278
PType::U32 => random_codes::<u32>(u, codes_len, values_len, codes_nullable)?,
6379
PType::U64 => random_codes::<u64>(u, codes_len, values_len, codes_nullable)?,
80+
PType::I8 => random_codes::<i8>(u, codes_len, values_len, codes_nullable)?,
81+
PType::I16 => random_codes::<i16>(u, codes_len, values_len, codes_nullable)?,
82+
PType::I32 => random_codes::<i32>(u, codes_len, values_len, codes_nullable)?,
83+
PType::I64 => random_codes::<i64>(u, codes_len, values_len, codes_nullable)?,
6484
_ => unreachable!(),
6585
};
6686

@@ -71,7 +91,7 @@ impl ArbitraryDictArray {
7191
}
7292
}
7393

74-
/// Generate random codes for a DictArray with a specific unsigned integer type.
94+
/// Generate random codes for a DictArray with a specific integer type.
7595
fn random_codes<T>(
7696
u: &mut Unstructured,
7797
len: usize,

vortex-array/src/arrays/dict/array.rs

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -103,18 +103,18 @@ impl DictArray {
103103

104104
/// Build a new `DictArray` from its components, `codes` and `values`.
105105
///
106-
/// The codes must be unsigned integers, and may be nullable. Values can be any type, and
107-
/// may also be nullable. This mirrors the nullability of the Arrow `DictionaryArray`.
106+
/// The codes must be integers, and may be nullable. Values can be any
107+
/// type, and may also be nullable. This mirrors the nullability of the Arrow `DictionaryArray`.
108108
///
109109
/// # Errors
110110
///
111-
/// The `codes` **must** be unsigned integers, and the maximum code must be less than the length
111+
/// The `codes` **must** be integers, and the maximum code must be less than the length
112112
/// of the `values` array. Otherwise, this constructor returns an error.
113113
///
114114
/// It is an error to provide a nullable `codes` with non-nullable `values`.
115115
pub fn try_new(codes: ArrayRef, values: ArrayRef) -> VortexResult<Self> {
116-
if !codes.dtype().is_unsigned_int() {
117-
vortex_bail!(MismatchedTypes: "unsigned int", codes.dtype());
116+
if !codes.dtype().is_int() {
117+
vortex_bail!(MismatchedTypes: "int", codes.dtype());
118118
}
119119

120120
Ok(unsafe { Self::new_unchecked(codes, values) })
@@ -194,7 +194,11 @@ impl DictArray {
194194
match codes_validity.bit_buffer() {
195195
AllOr::All => {
196196
match_each_integer_ptype!(codes_primitive.ptype(), |P| {
197-
#[allow(clippy::cast_possible_truncation)]
197+
#[allow(
198+
clippy::cast_possible_truncation,
199+
clippy::cast_sign_loss,
200+
reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index"
201+
)]
198202
for &code in codes_primitive.as_slice::<P>().iter() {
199203
values_vec[code as usize] = referenced_value;
200204
}
@@ -205,7 +209,11 @@ impl DictArray {
205209
match_each_integer_ptype!(codes_primitive.ptype(), |P| {
206210
let codes = codes_primitive.as_slice::<P>();
207211

208-
#[allow(clippy::cast_possible_truncation)]
212+
#[allow(
213+
clippy::cast_possible_truncation,
214+
clippy::cast_sign_loss,
215+
reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index"
216+
)]
209217
buf.set_indices().for_each(|idx| {
210218
values_vec[codes[idx] as usize] = referenced_value;
211219
})

vortex-array/src/builders/dict/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ pub fn dict_encoder(array: &dyn Array, constraints: &DictConstraints) -> Box<dyn
5858
dict_builder
5959
}
6060

61+
/// Encode an array as a `DictArray` subject to the given constraints.
62+
///
63+
/// Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for external compatibility.
6164
pub fn dict_encode_with_constraints(
6265
array: &dyn Array,
6366
constraints: &DictConstraints,

vortex-btrblocks/src/compressor/float/dictionary.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
//! Float-specific dictionary encoding implementation.
5+
//!
6+
//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for external compatibility.
57
68
use vortex_array::IntoArray;
79
use vortex_array::arrays::DictArray;

vortex-btrblocks/src/compressor/integer/dictionary.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
//! Dictionary compressor that reuses the unique values in the `IntegerStats`.
5+
//!
6+
//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for external compatibility.
57
68
use vortex_array::IntoArray;
79
use vortex_array::arrays::DictArray;

vortex-cuda/src/kernel/arrays/dict.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ use vortex_dtype::DType;
2121
use vortex_dtype::NativeDecimalType;
2222
use vortex_dtype::NativePType;
2323
use vortex_dtype::match_each_decimal_value_type;
24+
use vortex_dtype::match_each_integer_ptype;
2425
use vortex_dtype::match_each_native_simd_ptype;
25-
use vortex_dtype::match_each_unsigned_integer_ptype;
2626
use vortex_error::VortexExpect;
2727
use vortex_error::VortexResult;
2828
use vortex_error::vortex_bail;
@@ -75,7 +75,7 @@ async fn execute_dict_prim(dict: DictArray, ctx: &mut CudaExecutionCtx) -> Vorte
7575

7676
// Dispatch based on both value type and code type
7777
match_each_native_simd_ptype!(values_ptype, |V| {
78-
match_each_unsigned_integer_ptype!(codes_ptype, |I| {
78+
match_each_integer_ptype!(codes_ptype, |I| {
7979
execute_dict_prim_typed::<V, I>(values_prim, codes_prim, ctx).await
8080
})
8181
})
@@ -165,7 +165,7 @@ async fn execute_dict_decimal(
165165
let decimal_type = values_decimal.values_type();
166166

167167
match_each_decimal_value_type!(decimal_type, |V| {
168-
match_each_unsigned_integer_ptype!(codes_ptype, |C| {
168+
match_each_integer_ptype!(codes_ptype, |C| {
169169
execute_dict_decimal_typed::<V, C>(values_decimal, codes_prim, dtype, ctx).await
170170
})
171171
})

vortex-layout/src/layouts/dict/reader.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ impl LayoutReader for DictReader {
223223
let (values, codes) = try_join!(values_eval.map_err(VortexError::from), codes_eval)?;
224224

225225
// SAFETY: Layout was validated at write time.
226-
// * The codes dtype is guaranteed to be an unsigned integer type from the layout
226+
// * The codes dtype is guaranteed to be an integer type from the layout
227227
// * The codes child reader ensures the correct dtype.
228228
// * The layout stores `all_values_referenced` and if this is malicious then it must
229229
// only affect correctness not memory safety.

vortex-layout/src/layouts/dict/writer.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ pub struct DictLayoutConstraints {
6666
/// The codes dtype is determined upfront from this constraint:
6767
/// - [`PType::U8`] when max_len <= 255
6868
/// - [`PType::U16`] when max_len > 255
69+
///
70+
/// Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for external compatibility.
6971
pub max_len: u16,
7072
}
7173

0 commit comments

Comments
 (0)