From f42e2dac535f2ef86788288518f8ffef08bdc429 Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Thu, 18 Jun 2026 21:35:54 -0400 Subject: [PATCH 1/6] feat(string_util): Unicode-aware ToLower via utf8proc Replace the ASCII-only ToLower with utf8proc simple case mapping so case-insensitive name handling matches Iceberg Java's toLowerCase(Locale.ROOT). ToUpper stays ASCII-only since it is not used for name matching. EqualsIgnoreCase now compares lowercased forms. Wire utf8proc into both the CMake (vendored/system) and Meson builds. See https://github.com/apache/iceberg-cpp/issues/613. --- .../IcebergThirdpartyToolchain.cmake | 57 +++++++++++++++++++ src/iceberg/CMakeLists.txt | 8 ++- src/iceberg/meson.build | 9 ++- src/iceberg/test/string_util_test.cc | 45 ++++++++++----- src/iceberg/util/string_util.cc | 31 ++++++++++ src/iceberg/util/string_util.h | 33 +++++------ subprojects/utf8proc.wrap | 30 ++++++++++ 7 files changed, 177 insertions(+), 36 deletions(-) create mode 100644 subprojects/utf8proc.wrap diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake index 8e10fd8ec..982390332 100644 --- a/cmake_modules/IcebergThirdpartyToolchain.cmake +++ b/cmake_modules/IcebergThirdpartyToolchain.cmake @@ -421,6 +421,62 @@ function(resolve_croaring_dependency) PARENT_SCOPE) endfunction() +# ---------------------------------------------------------------------- +# utf8proc + +function(resolve_utf8proc_dependency) + prepare_fetchcontent() + + if(DEFINED ENV{ICEBERG_UTF8PROC_URL}) + set(UTF8PROC_URL "$ENV{ICEBERG_UTF8PROC_URL}") + else() + set(UTF8PROC_URL + "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.10.0.tar.gz") + endif() + + fetchcontent_declare(utf8proc + ${FC_DECLARE_COMMON_OPTIONS} + URL ${UTF8PROC_URL} + FIND_PACKAGE_ARGS + NAMES + utf8proc + CONFIG) + fetchcontent_makeavailable(utf8proc) + + if(utf8proc_SOURCE_DIR) + if(NOT TARGET utf8proc::utf8proc) + add_library(utf8proc::utf8proc INTERFACE IMPORTED) + target_link_libraries(utf8proc::utf8proc INTERFACE utf8proc) + target_include_directories(utf8proc::utf8proc INTERFACE ${utf8proc_SOURCE_DIR}) + endif() + + set(UTF8PROC_VENDORED TRUE) + # utf8proc's CMake puts a raw build-tree path in INTERFACE_INCLUDE_DIRECTORIES, which + # install(EXPORT) rejects. Wrap it in BUILD_INTERFACE so the export is valid; utf8proc + # is a private dependency, so installed consumers never need its headers. + set_target_properties(utf8proc + PROPERTIES OUTPUT_NAME "iceberg_vendored_utf8proc" + POSITION_INDEPENDENT_CODE ON + INTERFACE_INCLUDE_DIRECTORIES + "$") + install(TARGETS utf8proc + EXPORT iceberg_targets + RUNTIME DESTINATION "${ICEBERG_INSTALL_BINDIR}" + ARCHIVE DESTINATION "${ICEBERG_INSTALL_LIBDIR}" + LIBRARY DESTINATION "${ICEBERG_INSTALL_LIBDIR}") + else() + set(UTF8PROC_VENDORED FALSE) + list(APPEND ICEBERG_SYSTEM_DEPENDENCIES utf8proc) + endif() + + set(ICEBERG_SYSTEM_DEPENDENCIES + ${ICEBERG_SYSTEM_DEPENDENCIES} + PARENT_SCOPE) + set(UTF8PROC_VENDORED + ${UTF8PROC_VENDORED} + PARENT_SCOPE) +endfunction() + # ---------------------------------------------------------------------- # nlohmann-json @@ -719,6 +775,7 @@ endfunction() resolve_zlib_dependency() resolve_nanoarrow_dependency() resolve_croaring_dependency() +resolve_utf8proc_dependency() resolve_nlohmann_json_dependency() resolve_spdlog_dependency() diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 04a9322a1..a14c52729 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -145,23 +145,27 @@ list(APPEND "$,nanoarrow::nanoarrow_static,$,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>" nlohmann_json::nlohmann_json spdlog::spdlog + utf8proc::utf8proc ZLIB::ZLIB) list(APPEND ICEBERG_SHARED_BUILD_INTERFACE_LIBS "$,nanoarrow::nanoarrow_static,$,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>" nlohmann_json::nlohmann_json spdlog::spdlog + utf8proc::utf8proc ZLIB::ZLIB) list(APPEND ICEBERG_STATIC_INSTALL_INTERFACE_LIBS "$,iceberg::nanoarrow_static,$,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>" "$,iceberg::nlohmann_json,$,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>" - "$,iceberg::spdlog,spdlog::spdlog>") + "$,iceberg::spdlog,spdlog::spdlog>" + "$,iceberg::utf8proc,utf8proc::utf8proc>") list(APPEND ICEBERG_SHARED_INSTALL_INTERFACE_LIBS "$,iceberg::nanoarrow_static,$,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>" "$,iceberg::nlohmann_json,$,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>" - "$,iceberg::spdlog,spdlog::spdlog>") + "$,iceberg::spdlog,spdlog::spdlog>" + "$,iceberg::utf8proc,utf8proc::utf8proc>") add_iceberg_lib(iceberg SOURCES diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index f0b103828..f69ce36c0 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -190,8 +190,15 @@ nanoarrow_dep = dependency('nanoarrow') nlohmann_json_dep = dependency('nlohmann_json') spdlog_dep = dependency('spdlog') zlib_dep = dependency('zlib') +utf8proc_dep = dependency('libutf8proc') -iceberg_deps = [nanoarrow_dep, nlohmann_json_dep, spdlog_dep, zlib_dep] +iceberg_deps = [ + nanoarrow_dep, + nlohmann_json_dep, + spdlog_dep, + zlib_dep, + utf8proc_dep, +] iceberg_lib = library( 'iceberg', diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc index a3fd03760..3d4422f42 100644 --- a/src/iceberg/test/string_util_test.cc +++ b/src/iceberg/test/string_util_test.cc @@ -41,19 +41,30 @@ TEST(StringUtilsTest, ToUpper) { ASSERT_EQ(StringUtils::ToUpper("123"), "123"); } -// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative when stored -// in a signed char. Only ASCII letters are converted; multibyte bytes pass through -// unchanged. The non-ASCII strings are written as explicit UTF-8 byte escapes so the test -// does not depend on the source-file encoding. See -// https://github.com/apache/iceberg-cpp/issues/613. -TEST(StringUtilsTest, NonAsciiPassThrough) { - // "Naïve" -> "naïve" (ï = U+00EF = 0xC3 0xAF; only the ASCII letters change). - ASSERT_EQ(StringUtils::ToLower("Na\xC3\xAFve"), "na\xC3\xAFve"); - // "café" -> "CAFé" (é = U+00E9 = 0xC3 0xA9 stays unchanged). - ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9"); - // "日本語" (0xE6 0x97 0xA5 0xE6 0x9C 0xAC 0xE8 0xAA 0x9E) is returned verbatim. +// Non-ASCII strings are written as explicit UTF-8 byte escapes so the test does not +// depend on the source-file encoding. An escape is split before a following hex digit +// (e.g. "...\x9E" "E") so the \x does not absorb it. +// See https://github.com/apache/iceberg-cpp/issues/613. +TEST(StringUtilsTest, ToLowerUnicode) { + // "CAFÉ" -> "café" (É U+00C9 = 0xC3 0x89 -> é U+00E9 = 0xC3 0xA9). + ASSERT_EQ(StringUtils::ToLower("CAF\xC3\x89"), "caf\xC3\xA9"); + // "GROẞE" -> "große": capital sharp S (ẞ U+1E9E) lower-cases to ß (U+00DF), not "ss" + // as casefolding would produce. + ASSERT_EQ(StringUtils::ToLower("GRO\xE1\xBA\x9E" + "E"), + "gro\xC3\x9F" + "e"); + // "日本語" has no case mapping and is returned verbatim. ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"); + // Invalid UTF-8 (a lone 0xFF byte) is returned unchanged rather than erroring. + ASSERT_EQ(StringUtils::ToLower("\xFF"), "\xFF"); +} + +// ToUpper is intentionally ASCII-only; non-ASCII (multibyte UTF-8) bytes pass through. +TEST(StringUtilsTest, ToUpperAsciiOnly) { + // "café" -> "CAFé" (é stays unchanged). + ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9"); ASSERT_EQ(StringUtils::ToUpper("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"); } @@ -63,9 +74,15 @@ TEST(StringUtilsTest, EqualsIgnoreCase) { ASSERT_TRUE(StringUtils::EqualsIgnoreCase("", "")); ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abcd")); ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abd")); - // ASCII case is folded; non-ASCII bytes are compared as-is. ("Café" vs "café") - ASSERT_TRUE(StringUtils::EqualsIgnoreCase("Caf\xC3\xA9", "caf\xC3\xA9")); - // "café" vs "cafe": the multibyte é differs from ASCII 'e'. + // Unicode-aware: "CAFÉ" matches "café". + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("CAF\xC3\x89", "caf\xC3\xA9")); + // "GROẞE" matches "große" under lowercasing (ẞ -> ß). + ASSERT_TRUE( + StringUtils::EqualsIgnoreCase("GRO\xE1\xBA\x9E" + "E", + "gro\xC3\x9F" + "e")); + // Different letters still differ ("café" vs "cafe"). ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe")); } diff --git a/src/iceberg/util/string_util.cc b/src/iceberg/util/string_util.cc index 0454a62b5..00b938a07 100644 --- a/src/iceberg/util/string_util.cc +++ b/src/iceberg/util/string_util.cc @@ -19,10 +19,41 @@ #include "iceberg/util/string_util.h" +#include + +#include + #include "iceberg/util/macros.h" namespace iceberg { +std::string StringUtils::ToLower(std::string_view str) { + std::string result; + result.reserve(str.size()); + + const auto* data = reinterpret_cast(str.data()); + const auto size = static_cast(str.size()); + utf8proc_ssize_t offset = 0; + while (offset < size) { + utf8proc_int32_t code_point = 0; + utf8proc_ssize_t consumed = + utf8proc_iterate(data + offset, size - offset, &code_point); + if (consumed < 0) { + // Invalid UTF-8: return the input unchanged rather than erroring. + return std::string(str); + } + // utf8proc has no string-level lower-case helper, so map and re-encode each code + // point individually. utf8proc_tolower is a simple 1:1 mapping (not casefolding). + const utf8proc_int32_t lowered = utf8proc_tolower(code_point); + std::array encoded{}; + const utf8proc_ssize_t written = utf8proc_encode_char(lowered, encoded.data()); + result.append(reinterpret_cast(encoded.data()), + static_cast(written)); + offset += consumed; + } + return result; +} + Result> StringUtils::HexStringToBytes(std::string_view hex) { if (hex.size() % 2 != 0) [[unlikely]] { return InvalidArgument("Hex string must have even length, got: {}", hex.size()); diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 01b6087b8..afb28bf2a 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -20,7 +20,6 @@ #pragma once #include -#include #include #include #include @@ -41,22 +40,24 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); }; class ICEBERG_EXPORT StringUtils { public: - // NOTE: These convert ASCII letters only; all other bytes, including non-ASCII - // (multibyte UTF-8) bytes, are passed through unchanged. - // See https://github.com/apache/iceberg-cpp/issues/613. - static std::string ToLower(std::string_view str) { - return str | std::ranges::views::transform(ToLowerAscii) | - std::ranges::to(); - } - + /// \brief Lower-case a UTF-8 string using Unicode simple case mapping. + /// + /// Mirrors Iceberg Java's case-insensitive handling, which lower-cases names with + /// toLowerCase(Locale.ROOT). Invalid UTF-8 input is returned unchanged. + /// See https://github.com/apache/iceberg-cpp/issues/613. + static std::string ToLower(std::string_view str); + + /// \brief Upper-case ASCII letters; non-ASCII (multibyte UTF-8) bytes pass through + /// unchanged. + /// + /// Unlike ToLower this is ASCII-only, since upper-casing is not used for name matching. static std::string ToUpper(std::string_view str) { return str | std::ranges::views::transform(ToUpperAscii) | std::ranges::to(); } static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) { - return std::ranges::equal( - lhs, rhs, [](char lc, char rc) { return ToLowerAscii(lc) == ToLowerAscii(rc); }); + return ToLower(lhs) == ToLower(rhs); } static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) { @@ -134,14 +135,8 @@ class ICEBERG_EXPORT StringUtils { } private: - // ASCII-only case conversion using explicit range checks rather than - // std::tolower/std::toupper. This is independent of the current C locale and never - // touches non-ASCII (high-bit) bytes, so multibyte UTF-8 sequences are preserved. It - // also sidesteps the undefined behavior of passing a negative char to . - static constexpr char ToLowerAscii(char c) noexcept { - return (c >= 'A' && c <= 'Z') ? static_cast(c - 'A' + 'a') : c; - } - + // Avoids std::toupper, which is locale-dependent and has undefined behavior for + // negative char values. static constexpr char ToUpperAscii(char c) noexcept { return (c >= 'a' && c <= 'z') ? static_cast(c - 'a' + 'A') : c; } diff --git a/subprojects/utf8proc.wrap b/subprojects/utf8proc.wrap new file mode 100644 index 000000000..9b33b3bea --- /dev/null +++ b/subprojects/utf8proc.wrap @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[wrap-file] +directory = utf8proc-2.10.0 +source_url = https://github.com/JuliaStrings/utf8proc/releases/download/v2.10.0/utf8proc-2.10.0.tar.gz +source_filename = utf8proc-2.10.0.tar.gz +source_hash = 276a37dc4d1dd24d7896826a579f4439d1e5fe33603add786bb083cab802e23e +patch_filename = utf8proc_2.10.0-1_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/utf8proc_2.10.0-1/get_patch +patch_hash = be16c4514603e922f9636045699fe1a6f844d340b9b7c14b809e47253b06a844 +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/utf8proc_2.10.0-1/utf8proc-2.10.0.tar.gz +wrapdb_version = 2.10.0-1 + +[provide] +libutf8proc = utf8proc_dep From b8639d6801e2570a6a3ba9d05e53fd21797218dc Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Thu, 18 Jun 2026 23:51:09 -0400 Subject: [PATCH 2/6] Add license info to LICENSE --- LICENSE | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/LICENSE b/LICENSE index 374b7fc58..8d8d5ff4e 100644 --- a/LICENSE +++ b/LICENSE @@ -228,3 +228,95 @@ Home page: https://arrow.apache.org/ License: https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- + +This product bundles utf8proc, which is available under the MIT License: + +utf8proc is a software package originally developed by Jan Behrens and the rest +of the Public Software Group, now maintained by the Julia-language developers. +All new work on the utf8proc library is licensed under the MIT "expat" license: + +Copyright (c) 2014-2021 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas +Fonseca, and other contributors listed in the git history. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +The original utf8proc is licensed under the same MIT "expat" license: + +Copyright (c) 2009, 2013 Public Software Group e. V., Berlin, Germany + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +utf8proc also contains data derived from the Unicode data files. The following +license applies to that data: + +COPYRIGHT AND PERMISSION NOTICE + +Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed under +the Terms of Use in http://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of the Unicode data files and any associated documentation (the "Data +Files") or Unicode software and any associated documentation (the +"Software") to deal in the Data Files or Software without restriction, +including without limitation the rights to use, copy, modify, merge, +publish, distribute, and/or sell copies of the Data Files or Software, and +to permit persons to whom the Data Files or Software are furnished to do +so, provided that (a) the above copyright notice(s) and this permission +notice appear with all copies of the Data Files or Software, (b) both the +above copyright notice(s) and this permission notice appear in associated +documentation, and (c) there is clear notice in each modified Data File or +in the Software as well as in the documentation associated with the Data +File(s) or Software that the data or software has been modified. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF +THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS +INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR +CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF +USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder shall +not be used in advertising or otherwise to promote the sale, use or other +dealings in these Data Files or Software without prior written +authorization of the copyright holder. + +Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be +registered in some jurisdictions. All other trademarks and registered +trademarks mentioned herein are the property of their respective owners. + +-------------------------------------------------------------------------------- From 9ffebc36be071484b38cde0575793ab411a957ab Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Thu, 25 Jun 2026 20:51:47 -0400 Subject: [PATCH 3/6] docs(string_util): clarify case-mapping semantics in ToLower/ToUpper ToLower: note it uses Unicode simple (1:1) case mapping and document where it diverges from Java's full toLowerCase(Locale.ROOT) (e.g. U+0130). ToUpper: spell out the ASCII-only behavior and why no Unicode variant is provided. Also document EqualsIgnoreCase inheriting ToLower's mapping. Addresses API review comments on #760. --- src/iceberg/util/string_util.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index afb28bf2a..0c637803a 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -40,22 +40,34 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); }; class ICEBERG_EXPORT StringUtils { public: - /// \brief Lower-case a UTF-8 string using Unicode simple case mapping. + /// \brief Lower-case a UTF-8 string using Unicode simple (1:1) case mapping. /// - /// Mirrors Iceberg Java's case-insensitive handling, which lower-cases names with - /// toLowerCase(Locale.ROOT). Invalid UTF-8 input is returned unchanged. + /// Intended for case-insensitive name matching, similar to Iceberg Java's + /// toLowerCase(Locale.ROOT). The mapping is locale-independent, matching the intent + /// of Locale.ROOT. It uses simple (1:1) case mapping rather than Java's full case + /// mapping, so results differ for a few code points; e.g. U+0130 (capital I with dot + /// above) maps to U+0069 ("i") here, but to U+0069 U+0307 ("i" + combining dot above) + /// in Java. For ASCII and the large majority of letters the two agree. + /// + /// Invalid UTF-8 input is returned unchanged. /// See https://github.com/apache/iceberg-cpp/issues/613. static std::string ToLower(std::string_view str); - /// \brief Upper-case ASCII letters; non-ASCII (multibyte UTF-8) bytes pass through - /// unchanged. + /// \brief Upper-case the ASCII letters (a-z) in a string; all other bytes, including + /// multi-byte UTF-8 sequences, are left unchanged. /// - /// Unlike ToLower this is ASCII-only, since upper-casing is not used for name matching. + /// Deliberately ASCII-only and, unlike ToLower, not Unicode-aware. It is only used to + /// normalize ASCII enum/codec strings (e.g. "gzip" -> "GZIP", "all" -> "ALL") for + /// case-insensitive comparison. A Unicode upper-case is intentionally not provided: + /// simple case mapping would be wrong for some letters (e.g. "ß" (U+00DF) would stay + /// unchanged instead of becoming "SS"), and no caller needs it. static std::string ToUpper(std::string_view str) { return str | std::ranges::views::transform(ToUpperAscii) | std::ranges::to(); } + /// \brief Case-insensitive equality; compares the ToLower forms of both operands and + /// therefore inherits ToLower's Unicode simple-mapping behavior. static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) { return ToLower(lhs) == ToLower(rhs); } From 46165c6a46ad2c06f6a101df0761f618ee32c3b8 Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Sat, 27 Jun 2026 11:27:35 -0400 Subject: [PATCH 4/6] test(string_util): add failing regression test for StartsWithIgnoreCase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The byte-slice in StartsWithIgnoreCase (str.substr(0, prefix.size()) before lowercasing) is wrong when ToLower changes byte length: "İ" (U+0130) is two bytes but lower-cases to "i", so "İx" should match prefix "i" but does not. This test pins that behavior; it fails against the current implementation and is fixed by the following commit. Relates to #760. --- src/iceberg/test/string_util_test.cc | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc index 3d4422f42..fb16a80d4 100644 --- a/src/iceberg/test/string_util_test.cc +++ b/src/iceberg/test/string_util_test.cc @@ -86,4 +86,25 @@ TEST(StringUtilsTest, EqualsIgnoreCase) { ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe")); } +TEST(StringUtilsTest, StartsWithIgnoreCase) { + ASSERT_TRUE(StringUtils::StartsWithIgnoreCase("AbCdef", "abc")); + ASSERT_TRUE(StringUtils::StartsWithIgnoreCase("abc", "ABC")); + ASSERT_FALSE(StringUtils::StartsWithIgnoreCase("abc", "abd")); + // Empty prefix always matches; a prefix longer than the string does not. + ASSERT_TRUE(StringUtils::StartsWithIgnoreCase("abc", "")); + ASSERT_FALSE(StringUtils::StartsWithIgnoreCase("ab", "abcd")); + // Regression (#760): lower-casing can change byte length, so the prefix must not be + // matched by byte-slicing. "İ" (U+0130 = 0xC4 0xB0) lower-cases to "i", so "İx" + // starts with "i" ... + ASSERT_TRUE(StringUtils::StartsWithIgnoreCase("\xC4\xB0x", "i")); + // ... and "i" starts with "İ" (both lower-case to "i"), which the old byte-length + // guard wrongly rejected. + ASSERT_TRUE(StringUtils::StartsWithIgnoreCase("i", "\xC4\xB0")); + // A matching Unicode prefix: "CAFÉbar" starts with "café". + ASSERT_TRUE( + StringUtils::StartsWithIgnoreCase("CAF\xC3\x89" + "bar", + "caf\xC3\xA9")); +} + } // namespace iceberg From 819878c6b5695d8d9d972ff010a9e35338be615c Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Sat, 27 Jun 2026 11:31:00 -0400 Subject: [PATCH 5/6] fix(string_util): make StartsWithIgnoreCase handle UTF-8 length changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compare the ToLower forms of both inputs instead of byte-slicing str to prefix.size() before lowercasing. ToLower can change a string's byte length (e.g. "İ" U+0130 lower-cases to "i"), so the old slice could split a code point or wrongly reject a valid match. Makes the regression test from the previous commit pass. Relates to #760. --- src/iceberg/util/string_util.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 0c637803a..f3807364e 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -72,11 +72,14 @@ class ICEBERG_EXPORT StringUtils { return ToLower(lhs) == ToLower(rhs); } + /// \brief Case-insensitive prefix test, comparing the ToLower forms of both inputs. + /// + /// Inherits ToLower's Unicode simple-mapping behavior. The whole strings are + /// lower-cased rather than byte-slicing str to prefix.size(), because ToLower can + /// change a string's byte length (e.g. "İ" (U+0130) is two bytes but maps to "i"), + /// so a byte slice could split a code point or reject a valid match. static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) { - if (str.size() < prefix.size()) { - return false; - } - return EqualsIgnoreCase(str.substr(0, prefix.size()), prefix); + return ToLower(str).starts_with(ToLower(prefix)); } /// \brief Count the number of code points in a UTF-8 string. From 3e063aea8542d0f807e54b31853a3522f0c6d2ed Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Fri, 3 Jul 2026 11:45:36 -0400 Subject: [PATCH 6/6] refactor: ASCII fast path + build polish --- README.md | 1 + .../IcebergThirdpartyToolchain.cmake | 36 ++++--- mkdocs/docs/getting-started.md | 1 + src/iceberg/meson.build | 15 ++- src/iceberg/test/string_util_test.cc | 97 ++++++++++++++++++- src/iceberg/util/string_util.cc | 21 +++- src/iceberg/util/string_util.h | 65 ++++++++++--- 7 files changed, 207 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 7c9a343ec..1318933d9 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,7 @@ If you experience network issues when downloading dependencies, you can customiz - `ICEBERG_AVRO_GIT_URL`: Apache Avro git repository URL - `ICEBERG_NANOARROW_URL`: Nanoarrow tarball URL - `ICEBERG_CROARING_URL`: CRoaring tarball URL +- `ICEBERG_UTF8PROC_URL`: utf8proc tarball URL - `ICEBERG_NLOHMANN_JSON_URL`: nlohmann-json tarball URL - `ICEBERG_SPDLOG_URL`: spdlog tarball URL - `ICEBERG_CPR_URL`: cpr tarball URL diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake index 982390332..3d3fd4f8f 100644 --- a/cmake_modules/IcebergThirdpartyToolchain.cmake +++ b/cmake_modules/IcebergThirdpartyToolchain.cmake @@ -72,6 +72,7 @@ endfunction() # ICEBERG_AVRO_GIT_URL - Apache Avro git repository URL # ICEBERG_NANOARROW_URL - Nanoarrow tarball URL # ICEBERG_CROARING_URL - CRoaring tarball URL +# ICEBERG_UTF8PROC_URL - utf8proc tarball URL # ICEBERG_NLOHMANN_JSON_URL - nlohmann-json tarball URL # ICEBERG_SPDLOG_URL - spdlog tarball URL # ICEBERG_CPR_URL - cpr tarball URL @@ -109,6 +110,20 @@ else() ) endif() +set(ICEBERG_UTF8PROC_BUILD_VERSION "2.10.0") +set(ICEBERG_UTF8PROC_BUILD_SHA256_CHECKSUM + "276a37dc4d1dd24d7896826a579f4439d1e5fe33603add786bb083cab802e23e") + +if(DEFINED ENV{ICEBERG_UTF8PROC_URL}) + set(UTF8PROC_SOURCE_URL "$ENV{ICEBERG_UTF8PROC_URL}") +else() + # Use the release asset (stable bytes, matching subprojects/utf8proc.wrap) rather + # than the auto-generated tag archive, whose contents GitHub does not guarantee. + set(UTF8PROC_SOURCE_URL + "https://github.com/JuliaStrings/utf8proc/releases/download/v${ICEBERG_UTF8PROC_BUILD_VERSION}/utf8proc-${ICEBERG_UTF8PROC_BUILD_VERSION}.tar.gz" + ) +endif() + # ---------------------------------------------------------------------- # FetchContent @@ -427,20 +442,19 @@ endfunction() function(resolve_utf8proc_dependency) prepare_fetchcontent() - if(DEFINED ENV{ICEBERG_UTF8PROC_URL}) - set(UTF8PROC_URL "$ENV{ICEBERG_UTF8PROC_URL}") - else() - set(UTF8PROC_URL - "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.10.0.tar.gz") - endif() + # The vendored build needs no install rules; without this, CMake < 3.28 (where + # FetchContent has no EXCLUDE_FROM_ALL) would install utf8proc's headers and + # pkg-config file into the iceberg install prefix. + set(UTF8PROC_INSTALL OFF) fetchcontent_declare(utf8proc ${FC_DECLARE_COMMON_OPTIONS} - URL ${UTF8PROC_URL} - FIND_PACKAGE_ARGS - NAMES - utf8proc - CONFIG) + URL ${UTF8PROC_SOURCE_URL} + URL_HASH "SHA256=${ICEBERG_UTF8PROC_BUILD_SHA256_CHECKSUM}" + FIND_PACKAGE_ARGS + NAMES + utf8proc + CONFIG) fetchcontent_makeavailable(utf8proc) if(utf8proc_SOURCE_DIR) diff --git a/mkdocs/docs/getting-started.md b/mkdocs/docs/getting-started.md index 10a1a5c90..52a04e635 100644 --- a/mkdocs/docs/getting-started.md +++ b/mkdocs/docs/getting-started.md @@ -143,6 +143,7 @@ If you experience network issues when downloading dependencies, you can override | `ICEBERG_AVRO_GIT_URL` | Apache Avro git repository | | `ICEBERG_NANOARROW_URL` | Nanoarrow tarball | | `ICEBERG_CROARING_URL` | CRoaring tarball | +| `ICEBERG_UTF8PROC_URL` | utf8proc tarball | | `ICEBERG_NLOHMANN_JSON_URL` | nlohmann-json tarball | | `ICEBERG_CPR_URL` | cpr tarball | diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index f69ce36c0..57af0ea84 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -189,15 +189,26 @@ croaring_dep = dependency('croaring', static: croaring_needs_static) nanoarrow_dep = dependency('nanoarrow') nlohmann_json_dep = dependency('nlohmann_json') spdlog_dep = dependency('spdlog') +# utf8proc's header declares its functions __declspec(dllimport) on Windows unless +# UTF8PROC_STATIC is defined, and the wrap does not propagate that define to consumers. +# Define it whenever utf8proc is linked statically, so the header's declarations match +# how it is linked. Harmless on other platforms. +utf8proc_needs_static = get_option('default_library') == 'static' +utf8proc_dep = dependency('libutf8proc', static: utf8proc_needs_static) +if utf8proc_needs_static + utf8proc_dep = declare_dependency( + compile_args: ['-DUTF8PROC_STATIC'], + dependencies: utf8proc_dep, + ) +endif zlib_dep = dependency('zlib') -utf8proc_dep = dependency('libutf8proc') iceberg_deps = [ nanoarrow_dep, nlohmann_json_dep, spdlog_dep, - zlib_dep, utf8proc_dep, + zlib_dep, ] iceberg_lib = library( diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc index fb16a80d4..65d5c23cc 100644 --- a/src/iceberg/test/string_util_test.cc +++ b/src/iceberg/test/string_util_test.cc @@ -57,8 +57,31 @@ TEST(StringUtilsTest, ToLowerUnicode) { // "日本語" has no case mapping and is returned verbatim. ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"); - // Invalid UTF-8 (a lone 0xFF byte) is returned unchanged rather than erroring. + // ASCII prefix before the first non-ASCII byte takes the fast path; the rest goes + // through utf8proc. "ABÉ" -> "abé". + ASSERT_EQ(StringUtils::ToLower("AB\xC3\x89"), "ab\xC3\xA9"); + // An invalid UTF-8 byte (a lone 0xFF) passes through unchanged rather than erroring. ASSERT_EQ(StringUtils::ToLower("\xFF"), "\xFF"); + // An invalid byte only passes through itself; the valid code points around it are + // still lower-cased ("AB" 0xFF "CÉ" -> "ab" 0xFF "cé"). + ASSERT_EQ(StringUtils::ToLower("AB\xFF" + "C\xC3\x89"), + "ab\xFF" + "c\xC3\xA9"); + // The invalid byte can abut a multi-byte code point with no ASCII between them; 0xFF + // passes through and the adjacent "É" still lower-cases to "é" (0xFF "É" -> 0xFF "é"). + ASSERT_EQ(StringUtils::ToLower("\xFF\xC3\x89"), "\xFF\xC3\xA9"); + // A truncated multi-byte sequence (0xC3 with no continuation byte) passes through + // without consuming the bytes after it. + ASSERT_EQ(StringUtils::ToLower("\xC3" + "AB"), + "\xC3" + "ab"); + // A stray continuation byte (0x80) behaves the same way. + ASSERT_EQ(StringUtils::ToLower("A\x80" + "B"), + "a\x80" + "b"); } // ToUpper is intentionally ASCII-only; non-ASCII (multibyte UTF-8) bytes pass through. @@ -84,6 +107,23 @@ TEST(StringUtilsTest, EqualsIgnoreCase) { "e")); // Different letters still differ ("café" vs "cafe"). ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe")); + // Fallback correctness: an ASCII operand can equal a non-ASCII one once lower-cased, + // even though their raw byte lengths differ. "İ" (U+0130 = 0xC4 0xB0, two bytes) + // lower-cases to one-byte "i", so it must compare equal to "i" and "I". + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("i", "\xC4\xB0")); + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("\xC4\xB0", "I")); + // The non-ASCII byte can appear after a matching ASCII prefix ("abi" vs "abİ"). + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("abi", "ab\xC4\xB0")); + // Pure-ASCII operands that share a prefix but differ in length are not equal. + ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "ab")); + // Operands containing invalid UTF-8 are still compared case-insensitively on their + // valid parts; the invalid bytes themselves compare verbatim. + ASSERT_TRUE( + StringUtils::EqualsIgnoreCase("AB\xFF" + "C", + "ab\xFF" + "c")); + ASSERT_FALSE(StringUtils::EqualsIgnoreCase("\xFF", "\xFE")); } TEST(StringUtilsTest, StartsWithIgnoreCase) { @@ -105,6 +145,61 @@ TEST(StringUtilsTest, StartsWithIgnoreCase) { StringUtils::StartsWithIgnoreCase("CAF\xC3\x89" "bar", "caf\xC3\xA9")); + // Invalid UTF-8 bytes compare verbatim in the prefix as well. + ASSERT_TRUE( + StringUtils::StartsWithIgnoreCase("AB\xFF" + "x", + "ab\xFF")); + ASSERT_FALSE(StringUtils::StartsWithIgnoreCase("ab\xFE", "ab\xFF")); +} + +// The ASCII fast paths in EqualsIgnoreCase / StartsWithIgnoreCase must agree with their +// documented ToLower-based semantics for every input, including length-changing case +// mappings and invalid UTF-8. Rather than enumerate cases by hand, exhaustively compare +// both functions against the ToLower oracle over all short strings built from a small +// alphabet that straddles those boundaries. This is the mechanical form of the #760 +// regression, where a fast path disagreed with ToLower on a length-changing mapping. +TEST(StringUtilsTest, IgnoreCaseAgreesWithToLowerOracle) { + // Atoms mix ASCII (upper/lower, including the lowercase targets of the multi-byte + // mappings) with a 2-byte code point that lower-cases to one byte ("İ" U+0130 -> "i"), + // a 3-byte one that also shrinks to one byte ("K" U+212A -> "k"), an ordinary 2-byte + // cased letter ("É"), and an invalid UTF-8 byte. + const std::vector atoms = { + "a", "I", "i", "k", "\xC4\xB0", "\xE2\x84\xAA", "\xC3\x89", "\xFF"}; + + // Build every string of 0..3 atoms, one generation (length) at a time. + std::vector inputs = {""}; + size_t generation_begin = 0; + for (int len = 0; len < 3; ++len) { + const size_t generation_end = inputs.size(); + for (size_t i = generation_begin; i < generation_end; ++i) { + for (const auto& atom : atoms) { + inputs.push_back(inputs[i] + atom); + } + } + generation_begin = generation_end; + } + + // Precompute the oracle so the O(n^2) comparison below does not re-lower each string. + std::vector lowered; + lowered.reserve(inputs.size()); + for (const auto& s : inputs) { + lowered.push_back(StringUtils::ToLower(s)); + } + + for (size_t i = 0; i < inputs.size(); ++i) { + for (size_t j = 0; j < inputs.size(); ++j) { + EXPECT_EQ(StringUtils::EqualsIgnoreCase(inputs[i], inputs[j]), + lowered[i] == lowered[j]) + << "EqualsIgnoreCase disagreed for a=" << testing::PrintToString(inputs[i]) + << " b=" << testing::PrintToString(inputs[j]); + EXPECT_EQ(StringUtils::StartsWithIgnoreCase(inputs[i], inputs[j]), + lowered[i].starts_with(lowered[j])) + << "StartsWithIgnoreCase disagreed for str=" + << testing::PrintToString(inputs[i]) + << " prefix=" << testing::PrintToString(inputs[j]); + } + } } } // namespace iceberg diff --git a/src/iceberg/util/string_util.cc b/src/iceberg/util/string_util.cc index 00b938a07..8b98397db 100644 --- a/src/iceberg/util/string_util.cc +++ b/src/iceberg/util/string_util.cc @@ -31,19 +31,32 @@ std::string StringUtils::ToLower(std::string_view str) { std::string result; result.reserve(str.size()); + // Lower-case ASCII bytes directly; hand non-ASCII bytes to utf8proc. The common inputs + // (modes, UUIDs, header/property names, enum-like strings) are pure ASCII and never + // touch utf8proc. utf8proc has no string-level helper, so each non-ASCII code point is + // decoded, mapped with utf8proc_tolower (simple 1:1 mapping, not casefolding), and + // re-encoded. const auto* data = reinterpret_cast(str.data()); const auto size = static_cast(str.size()); utf8proc_ssize_t offset = 0; while (offset < size) { + // An ASCII byte is a complete 1-byte code point (never a UTF-8 continuation byte), + // and utf8proc_tolower agrees with ToLowerAscii on it, so handle it without utf8proc. + if (IsAsciiByte(str[offset])) { + result.push_back(ToLowerAscii(str[offset])); + ++offset; + continue; + } utf8proc_int32_t code_point = 0; utf8proc_ssize_t consumed = utf8proc_iterate(data + offset, size - offset, &code_point); if (consumed < 0) { - // Invalid UTF-8: return the input unchanged rather than erroring. - return std::string(str); + // Invalid UTF-8: pass the offending byte through unchanged and resume decoding at + // the next byte, so the valid code points around it are still lower-cased. + result.push_back(str[offset]); + ++offset; + continue; } - // utf8proc has no string-level lower-case helper, so map and re-encode each code - // point individually. utf8proc_tolower is a simple 1:1 mapping (not casefolding). const utf8proc_int32_t lowered = utf8proc_tolower(code_point); std::array encoded{}; const utf8proc_ssize_t written = utf8proc_encode_char(lowered, encoded.data()); diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index f3807364e..62718a0ea 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,11 @@ class ICEBERG_EXPORT StringUtils { /// above) maps to U+0069 ("i") here, but to U+0069 U+0307 ("i" + combining dot above) /// in Java. For ASCII and the large majority of letters the two agree. /// - /// Invalid UTF-8 input is returned unchanged. + /// Pure-ASCII input takes a byte-wise fast path; utf8proc is only invoked when a + /// non-ASCII byte (>= 0x80) is present. The function is total: it never fails, and + /// input need not be valid UTF-8. A byte that does not begin a valid UTF-8 sequence + /// is copied through unchanged and decoding resumes at the next byte, so the valid + /// code points around it are still lower-cased. /// See https://github.com/apache/iceberg-cpp/issues/613. static std::string ToLower(std::string_view str); @@ -66,19 +71,30 @@ class ICEBERG_EXPORT StringUtils { std::ranges::to(); } - /// \brief Case-insensitive equality; compares the ToLower forms of both operands and - /// therefore inherits ToLower's Unicode simple-mapping behavior. + /// \brief Case-insensitive equality using Unicode simple (1:1) case mapping. + /// + /// Equal when the ToLower forms of both operands are equal, so folding follows + /// ToLower's rules (e.g. "İ" (U+0130) folds to "i"). Defined for any byte sequence: + /// ToLower passes invalid UTF-8 bytes through unchanged, so they compare verbatim. static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) { - return ToLower(lhs) == ToLower(rhs); + const std::optional fast = AsciiEqualsIgnoreCase(lhs, rhs); + return fast.has_value() ? *fast : (ToLower(lhs) == ToLower(rhs)); } - /// \brief Case-insensitive prefix test, comparing the ToLower forms of both inputs. + /// \brief Case-insensitive prefix test using Unicode simple (1:1) case mapping. /// - /// Inherits ToLower's Unicode simple-mapping behavior. The whole strings are - /// lower-cased rather than byte-slicing str to prefix.size(), because ToLower can - /// change a string's byte length (e.g. "İ" (U+0130) is two bytes but maps to "i"), - /// so a byte slice could split a code point or reject a valid match. + /// True when the ToLower form of str starts with the ToLower form of prefix, so folding + /// follows ToLower's rules (e.g. "İ" (U+0130) folds to "i"). Defined for any byte + /// sequence: ToLower passes invalid UTF-8 bytes through unchanged, so they compare + /// verbatim. static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) { + if (prefix.size() <= str.size()) { + const std::optional fast = + AsciiEqualsIgnoreCase(str.substr(0, prefix.size()), prefix); + if (fast.has_value()) { + return *fast; + } + } return ToLower(str).starts_with(ToLower(prefix)); } @@ -150,11 +166,38 @@ class ICEBERG_EXPORT StringUtils { } private: - // Avoids std::toupper, which is locale-dependent and has undefined behavior for - // negative char values. + // ASCII-only case mappings. These avoid std::toupper/std::tolower, which are + // locale-dependent and have undefined behavior for negative char values. static constexpr char ToUpperAscii(char c) noexcept { return (c >= 'a' && c <= 'z') ? static_cast(c - 'a' + 'A') : c; } + static constexpr char ToLowerAscii(char c) noexcept { + return (c >= 'A' && c <= 'Z') ? static_cast(c - 'A' + 'a') : c; + } + + // True if c is a 7-bit ASCII byte (< 0x80). The cast is required because char may be + // signed, which would make bytes >= 0x80 compare as negative. + static constexpr bool IsAsciiByte(char c) noexcept { + return (static_cast(c) & 0x80) == 0; + } + + // Case-insensitive equality decided in a single byte-wise pass, without allocating. + // Returns nullopt once a byte of either operand is non-ASCII, because folding can then + // be non-ASCII and length-changing (e.g. "İ" (U+0130) -> "i"), which only ToLower + // knows. + static std::optional AsciiEqualsIgnoreCase(std::string_view a, + std::string_view b) { + const size_t n = std::min(a.size(), b.size()); + for (size_t i = 0; i < n; ++i) { + if (!IsAsciiByte(a[i]) || !IsAsciiByte(b[i])) { + return std::nullopt; + } + if (ToLowerAscii(a[i]) != ToLowerAscii(b[i])) { + return false; + } + } + return a.size() == b.size(); + } }; /// \brief Transparent hash function that supports std::string_view as lookup key