From c1e771321ed6465125d8c97a9b7cae35738381a8 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Tue, 2 Jun 2026 21:02:50 +0000 Subject: [PATCH 1/5] DX-116527 Update CHR to work with unicode --- cpp/src/gandiva/precompiled/string_ops.cc | 62 +++++++++---- .../gandiva/precompiled/string_ops_test.cc | 93 ++++++++++++++----- 2 files changed, 115 insertions(+), 40 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 035d3c8c62e1..61187ec471bc 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1387,29 +1387,27 @@ gdv_int32 ascii_utf8(const char* data, gdv_int32 data_len) { return static_cast(static_cast(data[0])); } -// Returns the ASCII character having the binary equivalent to A. -// If A is larger than 256 the result is equivalent to chr(A % 256). +// Returns the UTF-8 encoding of the Unicode code point A. +// Raises an error if A is not a valid code point, i.e. it is negative, greater +// than 0x10FFFF, or falls within the UTF-16 surrogate range 0xD800-0xDFFF. FORCE_INLINE -const char* chr_int32(gdv_int64 context, gdv_int32 in, gdv_int32* out_len) { - in = in % 256; - *out_len = 1; - - char* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, *out_len)); - if (ret == nullptr) { - gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); +const char* chr_int64(gdv_int64 context, gdv_int64 in, gdv_int32* out_len) { + if (in < 0 || in > 0x10FFFF || (in >= 0xD800 && in <= 0xDFFF)) { + gdv_fn_context_set_error_msg( + context, "Input is not a valid Unicode code point in the range 0 to 0x10FFFF"); *out_len = 0; return ""; } - ret[0] = char(in); - return ret; -} -// Returns the ASCII character having the binary equivalent to A. -// If A is larger than 256 the result is equivalent to chr(A % 256). -FORCE_INLINE -const char* chr_int64(gdv_int64 context, gdv_int64 in, gdv_int32* out_len) { - in = in % 256; - *out_len = 1; + if (in <= 0x7F) { + *out_len = 1; + } else if (in <= 0x7FF) { + *out_len = 2; + } else if (in <= 0xFFFF) { + *out_len = 3; + } else { + *out_len = 4; + } char* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, *out_len)); if (ret == nullptr) { @@ -1417,10 +1415,36 @@ const char* chr_int64(gdv_int64 context, gdv_int64 in, gdv_int32* out_len) { *out_len = 0; return ""; } - ret[0] = char(in); + + switch (*out_len) { + case 1: + ret[0] = static_cast(in); + break; + case 2: + ret[0] = static_cast(0xC0 | (in >> 6)); + ret[1] = static_cast(0x80 | (in & 0x3F)); + break; + case 3: + ret[0] = static_cast(0xE0 | (in >> 12)); + ret[1] = static_cast(0x80 | ((in >> 6) & 0x3F)); + ret[2] = static_cast(0x80 | (in & 0x3F)); + break; + case 4: + ret[0] = static_cast(0xF0 | (in >> 18)); + ret[1] = static_cast(0x80 | ((in >> 12) & 0x3F)); + ret[2] = static_cast(0x80 | ((in >> 6) & 0x3F)); + ret[3] = static_cast(0x80 | (in & 0x3F)); + break; + } return ret; } +// Returns the UTF-8 encoding of the Unicode code point A. See chr_int64. +FORCE_INLINE +const char* chr_int32(gdv_int64 context, gdv_int32 in, gdv_int32* out_len) { + return chr_int64(context, in, out_len); +} + FORCE_INLINE const char* convert_fromUTF8_binary(gdv_int64 context, const char* bin_in, gdv_int32 len, gdv_int32* out_len) { diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index d57eb437530c..fe8b375c4070 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -56,11 +56,12 @@ TEST(TestStringOps, TestAscii) { } TEST(TestStringOps, TestChrBigInt) { - // CHR + // CHR returns the UTF-8 encoding of the given Unicode code point. gandiva::ExecutionContext ctx; uint64_t ctx_ptr = reinterpret_cast(&ctx); int32_t out_len = 0; + // 1-byte ASCII code points. auto out = chr_int32(ctx_ptr, 88, &out_len); EXPECT_EQ(std::string(out, out_len), "X"); @@ -70,26 +71,11 @@ TEST(TestStringOps, TestChrBigInt) { out = chr_int32(ctx_ptr, 49, &out_len); EXPECT_EQ(std::string(out, out_len), "1"); - out = chr_int64(ctx_ptr, 84, &out_len); - EXPECT_EQ(std::string(out, out_len), "T"); - - out = chr_int32(ctx_ptr, 340, &out_len); - EXPECT_EQ(std::string(out, out_len), "T"); - - out = chr_int64(ctx_ptr, 256, &out_len); - EXPECT_EQ(std::strcmp(out, "\0"), 0); - out = chr_int32(ctx_ptr, 33, &out_len); EXPECT_EQ(std::string(out, out_len), "!"); - out = chr_int64(ctx_ptr, 46, &out_len); - EXPECT_EQ(std::string(out, out_len), "."); - - out = chr_int32(ctx_ptr, 63, &out_len); - EXPECT_EQ(std::string(out, out_len), "?"); - out = chr_int64(ctx_ptr, 0, &out_len); - EXPECT_EQ(std::strcmp(out, "\0"), 0); + EXPECT_EQ(std::string(out, out_len), std::string("\0", 1)); out = chr_int32(ctx_ptr, -158, &out_len); EXPECT_EQ(std::string(out, out_len), "b"); @@ -119,13 +105,78 @@ TEST(TestStringOps, TestChrBigInt) { out = chr_int64(ctx_ptr, 8, &out_len); EXPECT_EQ(std::string(out, out_len), "\b"); - // DEVICE CONTROL 3 (DC3) - out = chr_int32(ctx_ptr, 19, &out_len); - EXPECT_EQ(std::string(out, out_len), "\x13"); - // ESCAPE (ESC) out = chr_int64(ctx_ptr, 27, &out_len); EXPECT_EQ(std::string(out, out_len), "\x1B"); + + // Highest 1-byte code point (U+007F, DELETE). + out = chr_int32(ctx_ptr, 0x7F, &out_len); + EXPECT_EQ(std::string(out, out_len), "\x7F"); + + // 2-byte code points. + // Lowest 2-byte code point (U+0080). + out = chr_int32(ctx_ptr, 0x80, &out_len); + EXPECT_EQ(std::string(out, out_len), "\xC2\x80"); + + // Γ­ (U+00ED) + out = chr_int32(ctx_ptr, 237, &out_len); + EXPECT_EQ(std::string(out, out_len), "\xC3\xAD"); + + // Highest 2-byte code point (U+07FF). + out = chr_int64(ctx_ptr, 0x7FF, &out_len); + EXPECT_EQ(std::string(out, out_len), "\xDF\xBF"); + + // 3-byte code points. + // Lowest 3-byte code point (U+0800). + out = chr_int32(ctx_ptr, 0x800, &out_len); + EXPECT_EQ(std::string(out, out_len), "\xE0\xA0\x80"); + + // € (U+20AC) + out = chr_int32(ctx_ptr, 8364, &out_len); + EXPECT_EQ(std::string(out, out_len), "\xE2\x82\xAC"); + + // ζ—₯ (U+65E5) + out = chr_int64(ctx_ptr, 26085, &out_len); + EXPECT_EQ(std::string(out, out_len), "\xE6\x97\xA5"); + + // Highest 3-byte code point (U+FFFF). + out = chr_int32(ctx_ptr, 0xFFFF, &out_len); + EXPECT_EQ(std::string(out, out_len), "\xEF\xBF\xBF"); + + // 4-byte code points. + // Lowest 4-byte code point (U+10000). + out = chr_int64(ctx_ptr, 0x10000, &out_len); + EXPECT_EQ(std::string(out, out_len), "\xF0\x90\x80\x80"); + + // πŸ˜€ (U+1F600) + out = chr_int64(ctx_ptr, 0x1F600, &out_len); + EXPECT_EQ(std::string(out, out_len), "\xF0\x9F\x98\x80"); + + // Highest valid code point (U+10FFFF). + out = chr_int64(ctx_ptr, 0x10FFFF, &out_len); + EXPECT_EQ(std::string(out, out_len), "\xF4\x8F\xBF\xBF"); + + EXPECT_FALSE(ctx.has_error()); + + // Invalid code points raise an error. + chr_int64(ctx_ptr, -1, &out_len); + EXPECT_EQ(out_len, 0); + EXPECT_TRUE(ctx.get_error().find("not a valid Unicode code point") != std::string::npos) + << ctx.get_error(); + ctx.Reset(); + + chr_int64(ctx_ptr, 0x110000, &out_len); + EXPECT_EQ(out_len, 0); + EXPECT_TRUE(ctx.get_error().find("not a valid Unicode code point") != std::string::npos) + << ctx.get_error(); + ctx.Reset(); + + // UTF-16 surrogate range is not a valid code point. + chr_int32(ctx_ptr, 0xD800, &out_len); + EXPECT_EQ(out_len, 0); + EXPECT_TRUE(ctx.get_error().find("not a valid Unicode code point") != std::string::npos) + << ctx.get_error(); + ctx.Reset(); } TEST(TestStringOps, TestBeginsEnds) { From 2e928dceab9ff7601f2aabb6e109618441b6723d Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Tue, 2 Jun 2026 22:02:59 +0000 Subject: [PATCH 2/5] Update error message for readability. --- cpp/src/gandiva/precompiled/string_ops.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 61187ec471bc..cc3e82a7e6e5 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1394,7 +1394,7 @@ FORCE_INLINE const char* chr_int64(gdv_int64 context, gdv_int64 in, gdv_int32* out_len) { if (in < 0 || in > 0x10FFFF || (in >= 0xD800 && in <= 0xDFFF)) { gdv_fn_context_set_error_msg( - context, "Input is not a valid Unicode code point in the range 0 to 0x10FFFF"); + context, "Input is not a valid Unicode code point."); *out_len = 0; return ""; } From a8a6713b81ea820b6a18aeded25b50162511679a Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Tue, 9 Jun 2026 18:54:37 +0000 Subject: [PATCH 3/5] Update test --- .../gandiva/precompiled/string_ops_test.cc | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index fe8b375c4070..2e64a0dd7469 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -77,30 +77,6 @@ TEST(TestStringOps, TestChrBigInt) { out = chr_int64(ctx_ptr, 0, &out_len); EXPECT_EQ(std::string(out, out_len), std::string("\0", 1)); - out = chr_int32(ctx_ptr, -158, &out_len); - EXPECT_EQ(std::string(out, out_len), "b"); - - out = chr_int64(ctx_ptr, -5, &out_len); - EXPECT_EQ(std::string(out, out_len), "\xFB"); - - out = chr_int32(ctx_ptr, -340, &out_len); - EXPECT_EQ(std::string(out, out_len), "\xAC"); - - out = chr_int64(ctx_ptr, -66, &out_len); - EXPECT_EQ(std::string(out, out_len), "\xBE"); - - // € - out = chr_int32(ctx_ptr, 128, &out_len); - EXPECT_EQ(std::string(out, out_len), "\x80"); - - // Ε“ - out = chr_int64(ctx_ptr, 156, &out_len); - EXPECT_EQ(std::string(out, out_len), "\x9C"); - - // ΓΏ - out = chr_int32(ctx_ptr, 255, &out_len); - EXPECT_EQ(std::string(out, out_len), "\xFF"); - // BACKSPACE out = chr_int64(ctx_ptr, 8, &out_len); EXPECT_EQ(std::string(out, out_len), "\b"); From c6b6611ae286cac816582f1f4e84ce223d169568 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Tue, 9 Jun 2026 19:17:55 +0000 Subject: [PATCH 4/5] Test updates and error message --- cpp/src/gandiva/precompiled/string_ops.cc | 9 +++- .../gandiva/precompiled/string_ops_test.cc | 3 +- cpp/src/gandiva/tests/projector_test.cc | 44 ++++++++++++++++--- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index cc3e82a7e6e5..79f4b6a5c4d4 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1393,8 +1393,13 @@ gdv_int32 ascii_utf8(const char* data, gdv_int32 data_len) { FORCE_INLINE const char* chr_int64(gdv_int64 context, gdv_int64 in, gdv_int32* out_len) { if (in < 0 || in > 0x10FFFF || (in >= 0xD800 && in <= 0xDFFF)) { - gdv_fn_context_set_error_msg( - context, "Input is not a valid Unicode code point."); + const char* fmt = + "Input %lld is not a valid Unicode code point in the range 0 to 1114111"; + int size = static_cast(strlen(fmt)) + 32; + char* error = reinterpret_cast(malloc(size)); + snprintf(error, size, fmt, static_cast(in)); + gdv_fn_context_set_error_msg(context, error); + free(error); *out_len = 0; return ""; } diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 2e64a0dd7469..cec946d37c31 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -134,11 +134,12 @@ TEST(TestStringOps, TestChrBigInt) { EXPECT_FALSE(ctx.has_error()); - // Invalid code points raise an error. + // Invalid code points raise an error that includes the offending value. chr_int64(ctx_ptr, -1, &out_len); EXPECT_EQ(out_len, 0); EXPECT_TRUE(ctx.get_error().find("not a valid Unicode code point") != std::string::npos) << ctx.get_error(); + EXPECT_TRUE(ctx.get_error().find("-1") != std::string::npos) << ctx.get_error(); ctx.Reset(); chr_int64(ctx_ptr, 0x110000, &out_len); diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index 268cb55a6422..aba3d80c9f15 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -1317,13 +1317,16 @@ TEST_F(TestProjector, TestChr) { auto status = Projector::Make(schema, {chr_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); - // Create a row-batch with some sample data + // Create a row-batch with code points spanning 1- to 4-byte UTF-8 encodings. + // 65 -> "A", 237 -> "Γ­" (U+00ED), 8364 -> "€" (U+20AC), 26085 -> "ζ—₯" (U+65E5), + // 128512 -> "πŸ˜€" (U+1F600). int num_records = 5; - auto array0 = - MakeArrowArrayInt64({65, 84, 255, 340, -5}, {true, true, true, true, true}); - // expected output + auto array0 = MakeArrowArrayInt64({65, 237, 8364, 26085, 128512}, + {true, true, true, true, true}); + // expected UTF-8 output auto exp_chr = - MakeArrowArrayUtf8({"A", "T", "\xFF", "T", "\xFB"}, {true, true, true, true, true}); + MakeArrowArrayUtf8({"A", "\xC3\xAD", "\xE2\x82\xAC", "\xE6\x97\xA5", "\xF0\x9F\x98\x80"}, + {true, true, true, true, true}); // prepare input record batch auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0}); @@ -1337,6 +1340,37 @@ TEST_F(TestProjector, TestChr) { EXPECT_ARROW_ARRAY_EQUALS(exp_chr, outputs.at(0)); } +TEST_F(TestProjector, TestChrInvalidCodePoint) { + // schema for input fields + auto field0 = field("f0", int64()); + auto schema = arrow::schema({field0}); + + // output fields + auto field_chr = field("chr", arrow::utf8()); + + // Build expression + auto chr_expr = TreeExprBuilder::MakeExpression("chr", {field0}, field_chr); + + std::shared_ptr projector; + auto status = Projector::Make(schema, {chr_expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // A code point outside the valid Unicode range (here, a negative value) must + // fail evaluation rather than wrap around. + int num_records = 1; + auto array0 = MakeArrowArrayInt64({-5}, {true}); + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.message().find("not a valid Unicode code point"), std::string::npos) + << status.message(); + // The message should include the offending value for debuggability. + EXPECT_NE(status.message().find("-5"), std::string::npos) << status.message(); +} + TEST_F(TestProjector, TestBase64) { // schema for input fields auto field0 = field("f0", arrow::binary()); From b0435db2bc9945a055ddb481c8ae737503e413f1 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Thu, 11 Jun 2026 23:28:59 +0000 Subject: [PATCH 5/5] Lint --- cpp/src/gandiva/tests/projector_test.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index aba3d80c9f15..fc284160a944 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -1321,12 +1321,12 @@ TEST_F(TestProjector, TestChr) { // 65 -> "A", 237 -> "Γ­" (U+00ED), 8364 -> "€" (U+20AC), 26085 -> "ζ—₯" (U+65E5), // 128512 -> "πŸ˜€" (U+1F600). int num_records = 5; - auto array0 = MakeArrowArrayInt64({65, 237, 8364, 26085, 128512}, - {true, true, true, true, true}); + auto array0 = + MakeArrowArrayInt64({65, 237, 8364, 26085, 128512}, {true, true, true, true, true}); // expected UTF-8 output - auto exp_chr = - MakeArrowArrayUtf8({"A", "\xC3\xAD", "\xE2\x82\xAC", "\xE6\x97\xA5", "\xF0\x9F\x98\x80"}, - {true, true, true, true, true}); + auto exp_chr = MakeArrowArrayUtf8( + {"A", "\xC3\xAD", "\xE2\x82\xAC", "\xE6\x97\xA5", "\xF0\x9F\x98\x80"}, + {true, true, true, true, true}); // prepare input record batch auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});