Skip to content

Commit db1f5a8

Browse files
committed
refactor: ASCII fast path + build polish
1 parent 819878c commit db1f5a8

7 files changed

Lines changed: 208 additions & 29 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ If you experience network issues when downloading dependencies, you can customiz
151151
- `ICEBERG_AVRO_GIT_URL`: Apache Avro git repository URL
152152
- `ICEBERG_NANOARROW_URL`: Nanoarrow tarball URL
153153
- `ICEBERG_CROARING_URL`: CRoaring tarball URL
154+
- `ICEBERG_UTF8PROC_URL`: utf8proc tarball URL
154155
- `ICEBERG_NLOHMANN_JSON_URL`: nlohmann-json tarball URL
155156
- `ICEBERG_SPDLOG_URL`: spdlog tarball URL
156157
- `ICEBERG_CPR_URL`: cpr tarball URL

cmake_modules/IcebergThirdpartyToolchain.cmake

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,11 @@ endfunction()
7272
# ICEBERG_AVRO_GIT_URL - Apache Avro git repository URL
7373
# ICEBERG_NANOARROW_URL - Nanoarrow tarball URL
7474
# ICEBERG_CROARING_URL - CRoaring tarball URL
75+
# ICEBERG_UTF8PROC_URL - utf8proc tarball URL
7576
# ICEBERG_NLOHMANN_JSON_URL - nlohmann-json tarball URL
7677
# ICEBERG_SPDLOG_URL - spdlog tarball URL
7778
# ICEBERG_CPR_URL - cpr tarball URL
79+
# ICEBERG_SQLPP23_URL - sqlpp23 tarball URL
7880
#
7981
# Example usage:
8082
# export ICEBERG_ARROW_URL="https://your-mirror.com/apache-arrow-24.0.0.tar.gz"
@@ -109,6 +111,20 @@ else()
109111
)
110112
endif()
111113

114+
set(ICEBERG_UTF8PROC_BUILD_VERSION "2.10.0")
115+
set(ICEBERG_UTF8PROC_BUILD_SHA256_CHECKSUM
116+
"276a37dc4d1dd24d7896826a579f4439d1e5fe33603add786bb083cab802e23e")
117+
118+
if(DEFINED ENV{ICEBERG_UTF8PROC_URL})
119+
set(UTF8PROC_SOURCE_URL "$ENV{ICEBERG_UTF8PROC_URL}")
120+
else()
121+
# Use the release asset (stable bytes, matching subprojects/utf8proc.wrap) rather
122+
# than the auto-generated tag archive, whose contents GitHub does not guarantee.
123+
set(UTF8PROC_SOURCE_URL
124+
"https://github.com/JuliaStrings/utf8proc/releases/download/v${ICEBERG_UTF8PROC_BUILD_VERSION}/utf8proc-${ICEBERG_UTF8PROC_BUILD_VERSION}.tar.gz"
125+
)
126+
endif()
127+
112128
# ----------------------------------------------------------------------
113129
# FetchContent
114130

@@ -427,20 +443,19 @@ endfunction()
427443
function(resolve_utf8proc_dependency)
428444
prepare_fetchcontent()
429445

430-
if(DEFINED ENV{ICEBERG_UTF8PROC_URL})
431-
set(UTF8PROC_URL "$ENV{ICEBERG_UTF8PROC_URL}")
432-
else()
433-
set(UTF8PROC_URL
434-
"https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.10.0.tar.gz")
435-
endif()
446+
# The vendored build needs no install rules; without this, CMake < 3.28 (where
447+
# FetchContent has no EXCLUDE_FROM_ALL) would install utf8proc's headers and
448+
# pkg-config file into the iceberg install prefix.
449+
set(UTF8PROC_INSTALL OFF)
436450

437451
fetchcontent_declare(utf8proc
438452
${FC_DECLARE_COMMON_OPTIONS}
439-
URL ${UTF8PROC_URL}
440-
FIND_PACKAGE_ARGS
441-
NAMES
442-
utf8proc
443-
CONFIG)
453+
URL ${UTF8PROC_SOURCE_URL}
454+
URL_HASH "SHA256=${ICEBERG_UTF8PROC_BUILD_SHA256_CHECKSUM}"
455+
FIND_PACKAGE_ARGS
456+
NAMES
457+
utf8proc
458+
CONFIG)
444459
fetchcontent_makeavailable(utf8proc)
445460

446461
if(utf8proc_SOURCE_DIR)

mkdocs/docs/getting-started.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ If you experience network issues when downloading dependencies, you can override
143143
| `ICEBERG_AVRO_GIT_URL` | Apache Avro git repository |
144144
| `ICEBERG_NANOARROW_URL` | Nanoarrow tarball |
145145
| `ICEBERG_CROARING_URL` | CRoaring tarball |
146+
| `ICEBERG_UTF8PROC_URL` | utf8proc tarball |
146147
| `ICEBERG_NLOHMANN_JSON_URL` | nlohmann-json tarball |
147148
| `ICEBERG_CPR_URL` | cpr tarball |
148149

src/iceberg/meson.build

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,15 +189,26 @@ croaring_dep = dependency('croaring', static: croaring_needs_static)
189189
nanoarrow_dep = dependency('nanoarrow')
190190
nlohmann_json_dep = dependency('nlohmann_json')
191191
spdlog_dep = dependency('spdlog')
192+
# utf8proc's header declares its functions __declspec(dllimport) on Windows unless
193+
# UTF8PROC_STATIC is defined, and the wrap does not propagate that define to consumers.
194+
# Define it whenever utf8proc is linked statically, so the header's declarations match
195+
# how it is linked. Harmless on other platforms.
196+
utf8proc_needs_static = get_option('default_library') == 'static'
197+
utf8proc_dep = dependency('libutf8proc', static: utf8proc_needs_static)
198+
if utf8proc_needs_static
199+
utf8proc_dep = declare_dependency(
200+
compile_args: ['-DUTF8PROC_STATIC'],
201+
dependencies: utf8proc_dep,
202+
)
203+
endif
192204
zlib_dep = dependency('zlib')
193-
utf8proc_dep = dependency('libutf8proc')
194205

195206
iceberg_deps = [
196207
nanoarrow_dep,
197208
nlohmann_json_dep,
198209
spdlog_dep,
199-
zlib_dep,
200210
utf8proc_dep,
211+
zlib_dep,
201212
]
202213

203214
iceberg_lib = library(

src/iceberg/test/string_util_test.cc

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,31 @@ TEST(StringUtilsTest, ToLowerUnicode) {
5757
// "日本語" has no case mapping and is returned verbatim.
5858
ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),
5959
"\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E");
60-
// Invalid UTF-8 (a lone 0xFF byte) is returned unchanged rather than erroring.
60+
// ASCII prefix before the first non-ASCII byte takes the fast path; the rest goes
61+
// through utf8proc. "ABÉ" -> "abé".
62+
ASSERT_EQ(StringUtils::ToLower("AB\xC3\x89"), "ab\xC3\xA9");
63+
// An invalid UTF-8 byte (a lone 0xFF) passes through unchanged rather than erroring.
6164
ASSERT_EQ(StringUtils::ToLower("\xFF"), "\xFF");
65+
// An invalid byte only passes through itself; the valid code points around it are
66+
// still lower-cased ("AB" 0xFF "CÉ" -> "ab" 0xFF "cé").
67+
ASSERT_EQ(StringUtils::ToLower("AB\xFF"
68+
"C\xC3\x89"),
69+
"ab\xFF"
70+
"c\xC3\xA9");
71+
// The invalid byte can abut a multi-byte code point with no ASCII between them; 0xFF
72+
// passes through and the adjacent "É" still lower-cases to "é" (0xFF "É" -> 0xFF "é").
73+
ASSERT_EQ(StringUtils::ToLower("\xFF\xC3\x89"), "\xFF\xC3\xA9");
74+
// A truncated multi-byte sequence (0xC3 with no continuation byte) passes through
75+
// without consuming the bytes after it.
76+
ASSERT_EQ(StringUtils::ToLower("\xC3"
77+
"AB"),
78+
"\xC3"
79+
"ab");
80+
// A stray continuation byte (0x80) behaves the same way.
81+
ASSERT_EQ(StringUtils::ToLower("A\x80"
82+
"B"),
83+
"a\x80"
84+
"b");
6285
}
6386

6487
// ToUpper is intentionally ASCII-only; non-ASCII (multibyte UTF-8) bytes pass through.
@@ -84,6 +107,23 @@ TEST(StringUtilsTest, EqualsIgnoreCase) {
84107
"e"));
85108
// Different letters still differ ("café" vs "cafe").
86109
ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe"));
110+
// Fallback correctness: an ASCII operand can equal a non-ASCII one once lower-cased,
111+
// even though their raw byte lengths differ. "İ" (U+0130 = 0xC4 0xB0, two bytes)
112+
// lower-cases to one-byte "i", so it must compare equal to "i" and "I".
113+
ASSERT_TRUE(StringUtils::EqualsIgnoreCase("i", "\xC4\xB0"));
114+
ASSERT_TRUE(StringUtils::EqualsIgnoreCase("\xC4\xB0", "I"));
115+
// The non-ASCII byte can appear after a matching ASCII prefix ("abi" vs "abİ").
116+
ASSERT_TRUE(StringUtils::EqualsIgnoreCase("abi", "ab\xC4\xB0"));
117+
// Pure-ASCII operands that share a prefix but differ in length are not equal.
118+
ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "ab"));
119+
// Operands containing invalid UTF-8 are still compared case-insensitively on their
120+
// valid parts; the invalid bytes themselves compare verbatim.
121+
ASSERT_TRUE(
122+
StringUtils::EqualsIgnoreCase("AB\xFF"
123+
"C",
124+
"ab\xFF"
125+
"c"));
126+
ASSERT_FALSE(StringUtils::EqualsIgnoreCase("\xFF", "\xFE"));
87127
}
88128

89129
TEST(StringUtilsTest, StartsWithIgnoreCase) {
@@ -105,6 +145,61 @@ TEST(StringUtilsTest, StartsWithIgnoreCase) {
105145
StringUtils::StartsWithIgnoreCase("CAF\xC3\x89"
106146
"bar",
107147
"caf\xC3\xA9"));
148+
// Invalid UTF-8 bytes compare verbatim in the prefix as well.
149+
ASSERT_TRUE(
150+
StringUtils::StartsWithIgnoreCase("AB\xFF"
151+
"x",
152+
"ab\xFF"));
153+
ASSERT_FALSE(StringUtils::StartsWithIgnoreCase("ab\xFE", "ab\xFF"));
154+
}
155+
156+
// The ASCII fast paths in EqualsIgnoreCase / StartsWithIgnoreCase must agree with their
157+
// documented ToLower-based semantics for every input, including length-changing case
158+
// mappings and invalid UTF-8. Rather than enumerate cases by hand, exhaustively compare
159+
// both functions against the ToLower oracle over all short strings built from a small
160+
// alphabet that straddles those boundaries. This is the mechanical form of the #760
161+
// regression, where a fast path disagreed with ToLower on a length-changing mapping.
162+
TEST(StringUtilsTest, IgnoreCaseAgreesWithToLowerOracle) {
163+
// Atoms mix ASCII (upper/lower, including the lowercase targets of the multi-byte
164+
// mappings) with a 2-byte code point that lower-cases to one byte ("İ" U+0130 -> "i"),
165+
// a 3-byte one that also shrinks to one byte ("K" U+212A -> "k"), an ordinary 2-byte
166+
// cased letter ("É"), and an invalid UTF-8 byte.
167+
const std::vector<std::string> atoms = {
168+
"a", "I", "i", "k", "\xC4\xB0", "\xE2\x84\xAA", "\xC3\x89", "\xFF"};
169+
170+
// Build every string of 0..3 atoms, one generation (length) at a time.
171+
std::vector<std::string> inputs = {""};
172+
size_t generation_begin = 0;
173+
for (int len = 0; len < 3; ++len) {
174+
const size_t generation_end = inputs.size();
175+
for (size_t i = generation_begin; i < generation_end; ++i) {
176+
for (const auto& atom : atoms) {
177+
inputs.push_back(inputs[i] + atom);
178+
}
179+
}
180+
generation_begin = generation_end;
181+
}
182+
183+
// Precompute the oracle so the O(n^2) comparison below does not re-lower each string.
184+
std::vector<std::string> lowered;
185+
lowered.reserve(inputs.size());
186+
for (const auto& s : inputs) {
187+
lowered.push_back(StringUtils::ToLower(s));
188+
}
189+
190+
for (size_t i = 0; i < inputs.size(); ++i) {
191+
for (size_t j = 0; j < inputs.size(); ++j) {
192+
EXPECT_EQ(StringUtils::EqualsIgnoreCase(inputs[i], inputs[j]),
193+
lowered[i] == lowered[j])
194+
<< "EqualsIgnoreCase disagreed for a=" << testing::PrintToString(inputs[i])
195+
<< " b=" << testing::PrintToString(inputs[j]);
196+
EXPECT_EQ(StringUtils::StartsWithIgnoreCase(inputs[i], inputs[j]),
197+
lowered[i].starts_with(lowered[j]))
198+
<< "StartsWithIgnoreCase disagreed for str="
199+
<< testing::PrintToString(inputs[i])
200+
<< " prefix=" << testing::PrintToString(inputs[j]);
201+
}
202+
}
108203
}
109204

110205
} // namespace iceberg

src/iceberg/util/string_util.cc

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,19 +31,32 @@ std::string StringUtils::ToLower(std::string_view str) {
3131
std::string result;
3232
result.reserve(str.size());
3333

34+
// Lower-case ASCII bytes directly; hand non-ASCII bytes to utf8proc. The common inputs
35+
// (modes, UUIDs, header/property names, enum-like strings) are pure ASCII and never
36+
// touch utf8proc. utf8proc has no string-level helper, so each non-ASCII code point is
37+
// decoded, mapped with utf8proc_tolower (simple 1:1 mapping, not casefolding), and
38+
// re-encoded.
3439
const auto* data = reinterpret_cast<const utf8proc_uint8_t*>(str.data());
3540
const auto size = static_cast<utf8proc_ssize_t>(str.size());
3641
utf8proc_ssize_t offset = 0;
3742
while (offset < size) {
43+
// An ASCII byte is a complete 1-byte code point (never a UTF-8 continuation byte),
44+
// and utf8proc_tolower agrees with ToLowerAscii on it, so handle it without utf8proc.
45+
if (IsAsciiByte(str[offset])) {
46+
result.push_back(ToLowerAscii(str[offset]));
47+
++offset;
48+
continue;
49+
}
3850
utf8proc_int32_t code_point = 0;
3951
utf8proc_ssize_t consumed =
4052
utf8proc_iterate(data + offset, size - offset, &code_point);
4153
if (consumed < 0) {
42-
// Invalid UTF-8: return the input unchanged rather than erroring.
43-
return std::string(str);
54+
// Invalid UTF-8: pass the offending byte through unchanged and resume decoding at
55+
// the next byte, so the valid code points around it are still lower-cased.
56+
result.push_back(str[offset]);
57+
++offset;
58+
continue;
4459
}
45-
// utf8proc has no string-level lower-case helper, so map and re-encode each code
46-
// point individually. utf8proc_tolower is a simple 1:1 mapping (not casefolding).
4760
const utf8proc_int32_t lowered = utf8proc_tolower(code_point);
4861
std::array<utf8proc_uint8_t, 4> encoded{};
4962
const utf8proc_ssize_t written = utf8proc_encode_char(lowered, encoded.data());

src/iceberg/util/string_util.h

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <algorithm>
2323
#include <cerrno>
2424
#include <charconv>
25+
#include <optional>
2526
#include <ranges>
2627
#include <string>
2728
#include <string_view>
@@ -49,7 +50,11 @@ class ICEBERG_EXPORT StringUtils {
4950
/// above) maps to U+0069 ("i") here, but to U+0069 U+0307 ("i" + combining dot above)
5051
/// in Java. For ASCII and the large majority of letters the two agree.
5152
///
52-
/// Invalid UTF-8 input is returned unchanged.
53+
/// Pure-ASCII input takes a byte-wise fast path; utf8proc is only invoked when a
54+
/// non-ASCII byte (>= 0x80) is present. The function is total: it never fails, and
55+
/// input need not be valid UTF-8. A byte that does not begin a valid UTF-8 sequence
56+
/// is copied through unchanged and decoding resumes at the next byte, so the valid
57+
/// code points around it are still lower-cased.
5358
/// See https://github.com/apache/iceberg-cpp/issues/613.
5459
static std::string ToLower(std::string_view str);
5560

@@ -66,19 +71,30 @@ class ICEBERG_EXPORT StringUtils {
6671
std::ranges::to<std::string>();
6772
}
6873

69-
/// \brief Case-insensitive equality; compares the ToLower forms of both operands and
70-
/// therefore inherits ToLower's Unicode simple-mapping behavior.
74+
/// \brief Case-insensitive equality using Unicode simple (1:1) case mapping.
75+
///
76+
/// Equal when the ToLower forms of both operands are equal, so folding follows
77+
/// ToLower's rules (e.g. "İ" (U+0130) folds to "i"). Defined for any byte sequence:
78+
/// ToLower passes invalid UTF-8 bytes through unchanged, so they compare verbatim.
7179
static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) {
72-
return ToLower(lhs) == ToLower(rhs);
80+
const std::optional<bool> fast = AsciiEqualsIgnoreCase(lhs, rhs);
81+
return fast.has_value() ? *fast : (ToLower(lhs) == ToLower(rhs));
7382
}
7483

75-
/// \brief Case-insensitive prefix test, comparing the ToLower forms of both inputs.
84+
/// \brief Case-insensitive prefix test using Unicode simple (1:1) case mapping.
7685
///
77-
/// Inherits ToLower's Unicode simple-mapping behavior. The whole strings are
78-
/// lower-cased rather than byte-slicing str to prefix.size(), because ToLower can
79-
/// change a string's byte length (e.g. "İ" (U+0130) is two bytes but maps to "i"),
80-
/// so a byte slice could split a code point or reject a valid match.
86+
/// True when the ToLower form of str starts with the ToLower form of prefix, so folding
87+
/// follows ToLower's rules (e.g. "İ" (U+0130) folds to "i"). Defined for any byte
88+
/// sequence: ToLower passes invalid UTF-8 bytes through unchanged, so they compare
89+
/// verbatim.
8190
static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) {
91+
if (prefix.size() <= str.size()) {
92+
const std::optional<bool> fast =
93+
AsciiEqualsIgnoreCase(str.substr(0, prefix.size()), prefix);
94+
if (fast.has_value()) {
95+
return *fast;
96+
}
97+
}
8298
return ToLower(str).starts_with(ToLower(prefix));
8399
}
84100

@@ -150,11 +166,38 @@ class ICEBERG_EXPORT StringUtils {
150166
}
151167

152168
private:
153-
// Avoids std::toupper, which is locale-dependent and has undefined behavior for
154-
// negative char values.
169+
// ASCII-only case mappings. These avoid std::toupper/std::tolower, which are
170+
// locale-dependent and have undefined behavior for negative char values.
155171
static constexpr char ToUpperAscii(char c) noexcept {
156172
return (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c;
157173
}
174+
static constexpr char ToLowerAscii(char c) noexcept {
175+
return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
176+
}
177+
178+
// True if c is a 7-bit ASCII byte (< 0x80). The cast is required because char may be
179+
// signed, which would make bytes >= 0x80 compare as negative.
180+
static constexpr bool IsAsciiByte(char c) noexcept {
181+
return (static_cast<unsigned char>(c) & 0x80) == 0;
182+
}
183+
184+
// Case-insensitive equality decided in a single byte-wise pass, without allocating.
185+
// Returns nullopt once a byte of either operand is non-ASCII, because folding can then
186+
// be non-ASCII and length-changing (e.g. "İ" (U+0130) -> "i"), which only ToLower
187+
// knows.
188+
static std::optional<bool> AsciiEqualsIgnoreCase(std::string_view a,
189+
std::string_view b) {
190+
const size_t n = std::min(a.size(), b.size());
191+
for (size_t i = 0; i < n; ++i) {
192+
if (!IsAsciiByte(a[i]) || !IsAsciiByte(b[i])) {
193+
return std::nullopt;
194+
}
195+
if (ToLowerAscii(a[i]) != ToLowerAscii(b[i])) {
196+
return false;
197+
}
198+
}
199+
return a.size() == b.size();
200+
}
158201
};
159202

160203
/// \brief Transparent hash function that supports std::string_view as lookup key

0 commit comments

Comments
 (0)