refactor: ASCII fast path + build polish

goel-skd · goel-skd · commit db1f5a8930c5 · 2026-07-03T11:45:36.000-04:00
diff --git a/README.md b/README.md
@@ -151,6 +151,7 @@ If you experience network issues when downloading dependencies, you can customiz
 - `ICEBERG_AVRO_GIT_URL`: Apache Avro git repository URL
 - `ICEBERG_NANOARROW_URL`: Nanoarrow tarball URL
 - `ICEBERG_CROARING_URL`: CRoaring tarball URL
+- `ICEBERG_UTF8PROC_URL`: utf8proc tarball URL
 - `ICEBERG_NLOHMANN_JSON_URL`: nlohmann-json tarball URL
 - `ICEBERG_SPDLOG_URL`: spdlog tarball URL
 - `ICEBERG_CPR_URL`: cpr tarball URL
diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake
@@ -72,9 +72,11 @@ endfunction()
 # ICEBERG_AVRO_GIT_URL       - Apache Avro git repository URL
 # ICEBERG_NANOARROW_URL      - Nanoarrow tarball URL
 # ICEBERG_CROARING_URL       - CRoaring tarball URL
+# ICEBERG_UTF8PROC_URL       - utf8proc tarball URL
 # ICEBERG_NLOHMANN_JSON_URL  - nlohmann-json tarball URL
 # ICEBERG_SPDLOG_URL         - spdlog tarball URL
 # ICEBERG_CPR_URL            - cpr tarball URL
+# ICEBERG_SQLPP23_URL        - sqlpp23 tarball URL
 #
 # Example usage:
 #   export ICEBERG_ARROW_URL="https://your-mirror.com/apache-arrow-24.0.0.tar.gz"
@@ -109,6 +111,20 @@ else()
   )
 endif()
 
+set(ICEBERG_UTF8PROC_BUILD_VERSION "2.10.0")
+set(ICEBERG_UTF8PROC_BUILD_SHA256_CHECKSUM
+    "276a37dc4d1dd24d7896826a579f4439d1e5fe33603add786bb083cab802e23e")
+
+if(DEFINED ENV{ICEBERG_UTF8PROC_URL})
+  set(UTF8PROC_SOURCE_URL "$ENV{ICEBERG_UTF8PROC_URL}") # mirror must serve the same tarball: the SHA256 above is still enforced
+else()
+  # Use the release asset (stable bytes, matching subprojects/utf8proc.wrap) rather
+  # than the auto-generated tag archive, whose contents GitHub does not guarantee.
+  set(UTF8PROC_SOURCE_URL
+      "https://github.com/JuliaStrings/utf8proc/releases/download/v${ICEBERG_UTF8PROC_BUILD_VERSION}/utf8proc-${ICEBERG_UTF8PROC_BUILD_VERSION}.tar.gz"
+  )
+endif()
+
 # ----------------------------------------------------------------------
 # FetchContent
 
@@ -427,20 +443,19 @@ endfunction()
 function(resolve_utf8proc_dependency)
   prepare_fetchcontent()
 
-  if(DEFINED ENV{ICEBERG_UTF8PROC_URL})
-    set(UTF8PROC_URL "$ENV{ICEBERG_UTF8PROC_URL}")
-  else()
-    set(UTF8PROC_URL
-        "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.10.0.tar.gz")
-  endif()
+  # The vendored build needs no install rules; without this, CMake < 3.28 (where
+  # FetchContent has no EXCLUDE_FROM_ALL) would install utf8proc's headers and
+  # pkg-config file into the iceberg install prefix.
+  set(UTF8PROC_INSTALL OFF)
 
   fetchcontent_declare(utf8proc
                        ${FC_DECLARE_COMMON_OPTIONS}
-                       URL ${UTF8PROC_URL}
-                           FIND_PACKAGE_ARGS
-                           NAMES
-                           utf8proc
-                           CONFIG)
+                       URL ${UTF8PROC_SOURCE_URL}
+                       URL_HASH "SHA256=${ICEBERG_UTF8PROC_BUILD_SHA256_CHECKSUM}"
+                       FIND_PACKAGE_ARGS
+                       NAMES
+                       utf8proc
+                       CONFIG)
   fetchcontent_makeavailable(utf8proc)
 
   if(utf8proc_SOURCE_DIR)
diff --git a/mkdocs/docs/getting-started.md b/mkdocs/docs/getting-started.md
@@ -143,6 +143,7 @@ If you experience network issues when downloading dependencies, you can override
 | `ICEBERG_AVRO_GIT_URL` | Apache Avro git repository |
 | `ICEBERG_NANOARROW_URL` | Nanoarrow tarball |
 | `ICEBERG_CROARING_URL` | CRoaring tarball |
+| `ICEBERG_UTF8PROC_URL` | utf8proc tarball |
 | `ICEBERG_NLOHMANN_JSON_URL` | nlohmann-json tarball |
 | `ICEBERG_CPR_URL` | cpr tarball |
 
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
@@ -189,15 +189,26 @@ croaring_dep = dependency('croaring', static: croaring_needs_static)
 nanoarrow_dep = dependency('nanoarrow')
 nlohmann_json_dep = dependency('nlohmann_json')
 spdlog_dep = dependency('spdlog')
+# utf8proc's header declares its functions __declspec(dllimport) on Windows unless
+# UTF8PROC_STATIC is defined, and the wrap does not propagate that define to consumers.
+# Define it when default_library is 'static' (the only case where a static utf8proc is
+# requested), so the declarations match the linkage. Harmless on other platforms.
+utf8proc_needs_static = get_option('default_library') == 'static'
+utf8proc_dep = dependency('libutf8proc', static: utf8proc_needs_static)
+if utf8proc_needs_static
+    utf8proc_dep = declare_dependency(
+        compile_args: ['-DUTF8PROC_STATIC'],
+        dependencies: utf8proc_dep,
+    )
+endif
 zlib_dep = dependency('zlib')
-utf8proc_dep = dependency('libutf8proc')
 
 iceberg_deps = [
     nanoarrow_dep,
     nlohmann_json_dep,
     spdlog_dep,
-    zlib_dep,
     utf8proc_dep,
+    zlib_dep,
 ]
 
 iceberg_lib = library(
diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc
@@ -57,8 +57,31 @@ TEST(StringUtilsTest, ToLowerUnicode) {
   // "日本語" has no case mapping and is returned verbatim.
   ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),
             "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E");
-  // Invalid UTF-8 (a lone 0xFF byte) is returned unchanged rather than erroring.
+  // ASCII prefix before the first non-ASCII byte takes the fast path; the rest goes
+  // through utf8proc. "ABÉ" -> "abé".
+  ASSERT_EQ(StringUtils::ToLower("AB\xC3\x89"), "ab\xC3\xA9");
+  // An invalid UTF-8 byte (a lone 0xFF) passes through unchanged rather than erroring.
   ASSERT_EQ(StringUtils::ToLower("\xFF"), "\xFF");
+  // An invalid byte only passes through itself; the valid code points around it are
+  // still lower-cased ("AB" 0xFF "CÉ" -> "ab" 0xFF "cé").
+  ASSERT_EQ(StringUtils::ToLower("AB\xFF"
+                                 "C\xC3\x89"),
+            "ab\xFF"
+            "c\xC3\xA9");
+  // The invalid byte can abut a multi-byte code point with no ASCII between them; 0xFF
+  // passes through and the adjacent "É" still lower-cases to "é" (0xFF "É" -> 0xFF "é").
+  ASSERT_EQ(StringUtils::ToLower("\xFF\xC3\x89"), "\xFF\xC3\xA9");
+  // A truncated multi-byte sequence (0xC3 with no continuation byte) passes through
+  // without consuming the bytes after it.
+  ASSERT_EQ(StringUtils::ToLower("\xC3"
+                                 "AB"),
+            "\xC3"
+            "ab");
+  // A stray continuation byte (0x80) behaves the same way.
+  ASSERT_EQ(StringUtils::ToLower("A\x80"
+                                 "B"),
+            "a\x80"
+            "b");
 }
 
 // ToUpper is intentionally ASCII-only; non-ASCII (multibyte UTF-8) bytes pass through.
@@ -84,6 +107,23 @@ TEST(StringUtilsTest, EqualsIgnoreCase) {
                                     "e"));
   // Different letters still differ ("café" vs "cafe").
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe"));
+  // Fallback correctness: an ASCII operand can equal a non-ASCII one once lower-cased,
+  // even though their raw byte lengths differ. "İ" (U+0130 = 0xC4 0xB0, two bytes)
+  // lower-cases to one-byte "i", so it must compare equal to "i" and "I".
+  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("i", "\xC4\xB0"));
+  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("\xC4\xB0", "I"));
+  // The non-ASCII byte can appear after a matching ASCII prefix ("abi" vs "abİ").
+  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("abi", "ab\xC4\xB0"));
+  // Pure-ASCII operands that share a prefix but differ in length are not equal.
+  ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "ab"));
+  // Operands containing invalid UTF-8 are still compared case-insensitively on their
+  // valid parts; the invalid bytes themselves compare verbatim.
+  ASSERT_TRUE(
+      StringUtils::EqualsIgnoreCase("AB\xFF"
+                                    "C",
+                                    "ab\xFF"
+                                    "c"));
+  ASSERT_FALSE(StringUtils::EqualsIgnoreCase("\xFF", "\xFE"));
 }
 
 TEST(StringUtilsTest, StartsWithIgnoreCase) {
@@ -105,6 +145,61 @@ TEST(StringUtilsTest, StartsWithIgnoreCase) {
       StringUtils::StartsWithIgnoreCase("CAF\xC3\x89"
                                         "bar",
                                         "caf\xC3\xA9"));
+  // Invalid UTF-8 bytes compare verbatim in the prefix as well.
+  ASSERT_TRUE(
+      StringUtils::StartsWithIgnoreCase("AB\xFF"
+                                        "x",
+                                        "ab\xFF"));
+  ASSERT_FALSE(StringUtils::StartsWithIgnoreCase("ab\xFE", "ab\xFF"));
+}
+
+// The ASCII fast paths in EqualsIgnoreCase / StartsWithIgnoreCase must agree with their
+// documented ToLower-based semantics for every input, including length-changing case
+// mappings and invalid UTF-8. Rather than enumerate cases by hand, exhaustively compare
+// both functions against the ToLower oracle over all short strings built from a small
+// alphabet that straddles those boundaries. This is the mechanical form of the #760
+// regression, where a fast path disagreed with ToLower on a length-changing mapping.
+TEST(StringUtilsTest, IgnoreCaseAgreesWithToLowerOracle) {
+  // Atoms mix ASCII (upper/lower, including the lowercase targets of the multi-byte
+  // mappings) with a 2-byte code point that lower-cases to one byte ("İ" U+0130 -> "i"),
+  // a 3-byte one that also shrinks to one byte ("K" U+212A -> "k"), an ordinary 2-byte
+  // cased letter ("É"), and an invalid UTF-8 byte.
+  const std::vector<std::string> atoms = {
+      "a", "I", "i", "k", "\xC4\xB0", "\xE2\x84\xAA", "\xC3\x89", "\xFF"};
+
+  // Build every string of 0..3 atoms, one generation (length) at a time (585 total).
+  std::vector<std::string> inputs = {""};
+  size_t generation_begin = 0;
+  for (int len = 0; len < 3; ++len) {  // extends every len-atom string by one atom
+    const size_t generation_end = inputs.size();
+    for (size_t i = generation_begin; i < generation_end; ++i) {
+      for (const auto& atom : atoms) {
+        inputs.push_back(inputs[i] + atom);
+      }
+    }
+    generation_begin = generation_end;
+  }
+
+  // Precompute the oracle so the O(n^2) comparison below does not re-lower each string.
+  std::vector<std::string> lowered;
+  lowered.reserve(inputs.size());
+  for (const auto& s : inputs) {
+    lowered.push_back(StringUtils::ToLower(s));
+  }
+
+  for (size_t i = 0; i < inputs.size(); ++i) {  // all ordered pairs: 585 x 585
+    for (size_t j = 0; j < inputs.size(); ++j) {
+      EXPECT_EQ(StringUtils::EqualsIgnoreCase(inputs[i], inputs[j]),
+                lowered[i] == lowered[j])
+          << "EqualsIgnoreCase disagreed for a=" << testing::PrintToString(inputs[i])
+          << " b=" << testing::PrintToString(inputs[j]);
+      EXPECT_EQ(StringUtils::StartsWithIgnoreCase(inputs[i], inputs[j]),
+                lowered[i].starts_with(lowered[j]))
+          << "StartsWithIgnoreCase disagreed for str="
+          << testing::PrintToString(inputs[i])
+          << " prefix=" << testing::PrintToString(inputs[j]);
+    }
+  }
 }
 
 }  // namespace iceberg
diff --git a/src/iceberg/util/string_util.cc b/src/iceberg/util/string_util.cc
@@ -31,19 +31,32 @@ std::string StringUtils::ToLower(std::string_view str) {
   std::string result;
   result.reserve(str.size());
 
+  // Lower-case ASCII bytes directly; hand non-ASCII bytes to utf8proc. The common inputs
+  // (modes, UUIDs, header/property names, enum-like strings) are pure ASCII and never
+  // touch utf8proc. utf8proc has no string-level helper, so each non-ASCII code point is
+  // decoded, mapped with utf8proc_tolower (simple 1:1 mapping, not casefolding), and
+  // re-encoded.
   const auto* data = reinterpret_cast<const utf8proc_uint8_t*>(str.data());
   const auto size = static_cast<utf8proc_ssize_t>(str.size());
   utf8proc_ssize_t offset = 0;
   while (offset < size) {
+    // An ASCII byte is a complete 1-byte code point (never a UTF-8 continuation byte),
+    // and utf8proc_tolower agrees with ToLowerAscii on it, so handle it without utf8proc.
+    if (IsAsciiByte(str[offset])) {
+      result.push_back(ToLowerAscii(str[offset]));
+      ++offset;
+      continue;
+    }
     utf8proc_int32_t code_point = 0;
     utf8proc_ssize_t consumed =
         utf8proc_iterate(data + offset, size - offset, &code_point);
     if (consumed < 0) {
-      // Invalid UTF-8: return the input unchanged rather than erroring.
-      return std::string(str);
+      // Invalid UTF-8: pass the offending byte through unchanged and resume decoding at
+      // the next byte, so the valid code points around it are still lower-cased.
+      result.push_back(str[offset]);
+      ++offset;
+      continue;
     }
-    // utf8proc has no string-level lower-case helper, so map and re-encode each code
-    // point individually. utf8proc_tolower is a simple 1:1 mapping (not casefolding).
     const utf8proc_int32_t lowered = utf8proc_tolower(code_point);
     std::array<utf8proc_uint8_t, 4> encoded{};
     const utf8proc_ssize_t written = utf8proc_encode_char(lowered, encoded.data());
diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h
@@ -22,6 +22,7 @@
 #include <algorithm>
 #include <cerrno>
 #include <charconv>
+#include <optional>
 #include <ranges>
 #include <string>
 #include <string_view>
@@ -49,7 +50,11 @@ class ICEBERG_EXPORT StringUtils {
   /// above) maps to U+0069 ("i") here, but to U+0069 U+0307 ("i" + combining dot above)
   /// in Java. For ASCII and the large majority of letters the two agree.
   ///
-  /// Invalid UTF-8 input is returned unchanged.
+  /// Pure-ASCII input takes a byte-wise fast path; utf8proc is only invoked when a
+  /// non-ASCII byte (>= 0x80) is present. The function is total: it never fails, and
+  /// input need not be valid UTF-8. A byte that does not begin a valid UTF-8 sequence
+  /// is copied through unchanged and decoding resumes at the next byte, so the valid
+  /// code points around it are still lower-cased.
   /// See https://github.com/apache/iceberg-cpp/issues/613.
   static std::string ToLower(std::string_view str);
 
@@ -66,19 +71,30 @@ class ICEBERG_EXPORT StringUtils {
            std::ranges::to<std::string>();
   }
 
-  /// \brief Case-insensitive equality; compares the ToLower forms of both operands and
-  /// therefore inherits ToLower's Unicode simple-mapping behavior.
+  /// \brief Case-insensitive equality using Unicode simple (1:1) case mapping.
+  ///
+  /// Equal when the ToLower forms of both operands are equal, so folding follows
+  /// ToLower's rules (e.g. "İ" (U+0130) folds to "i"). Defined for any byte sequence:
+  /// ToLower passes invalid UTF-8 bytes through unchanged, so they compare verbatim.
   static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) {
-    return ToLower(lhs) == ToLower(rhs);
+    const std::optional<bool> fast = AsciiEqualsIgnoreCase(lhs, rhs);
+    return fast.has_value() ? *fast : (ToLower(lhs) == ToLower(rhs));
   }
 
-  /// \brief Case-insensitive prefix test, comparing the ToLower forms of both inputs.
+  /// \brief Case-insensitive prefix test using Unicode simple (1:1) case mapping.
   ///
-  /// Inherits ToLower's Unicode simple-mapping behavior. The whole strings are
-  /// lower-cased rather than byte-slicing str to prefix.size(), because ToLower can
-  /// change a string's byte length (e.g. "İ" (U+0130) is two bytes but maps to "i"),
-  /// so a byte slice could split a code point or reject a valid match.
+  /// True when the ToLower form of str starts with the ToLower form of prefix, so folding
+  /// follows ToLower's rules (e.g. "İ" (U+0130) folds to "i"). Defined for any byte
+  /// sequence: ToLower passes invalid UTF-8 bytes through unchanged, so they compare
+  /// verbatim.
   static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) {
+    if (prefix.size() <= str.size()) {  // ASCII fast path on the byte-sliced head
+      const std::optional<bool> fast =
+          AsciiEqualsIgnoreCase(str.substr(0, prefix.size()), prefix);
+      if (fast.has_value()) {  // nullopt: a non-ASCII byte was met, so use ToLower
+        return *fast;
+      }
+    }  // longer prefix: no early false, it may fold shorter ("İ" -> "i")
     return ToLower(str).starts_with(ToLower(prefix));
   }
 
@@ -150,11 +166,38 @@ class ICEBERG_EXPORT StringUtils {
   }
 
  private:
-  // Avoids std::toupper, which is locale-dependent and has undefined behavior for
-  // negative char values.
+  // ASCII-only case mappings. These avoid std::toupper/std::tolower, which are
+  // locale-dependent and have undefined behavior for negative char values.
   static constexpr char ToUpperAscii(char c) noexcept {
     return (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c;
   }
+  static constexpr char ToLowerAscii(char c) noexcept {
+    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
+  }
+
+  // True if c is a 7-bit ASCII byte (< 0x80). The cast is required because char may be
+  // signed, which would make bytes >= 0x80 compare as negative.
+  static constexpr bool IsAsciiByte(char c) noexcept {
+    return (static_cast<unsigned char>(c) & 0x80) == 0;
+  }
+
+  // Case-insensitive equality for ASCII operands in one byte-wise pass, no allocation.
+  // Scans the common-length prefix: false at the first ASCII mismatch, nullopt if a
+  // non-ASCII byte is met first (folding may then change length, e.g. "İ" (U+0130) ->
+  // "i", which only ToLower knows); otherwise true iff the sizes are equal.
+  static std::optional<bool> AsciiEqualsIgnoreCase(std::string_view a,
+                                                   std::string_view b) {
+    const size_t n = std::min(a.size(), b.size());
+    for (size_t i = 0; i < n; ++i) {
+      if (!IsAsciiByte(a[i]) || !IsAsciiByte(b[i])) {
+        return std::nullopt;
+      }
+      if (ToLowerAscii(a[i]) != ToLowerAscii(b[i])) {
+        return false;
+      }
+    }
+    return a.size() == b.size();  // prefix matched: equal only if nothing is left over
+  }
 };
 
 /// \brief Transparent hash function that supports std::string_view as lookup key