From f42e2dac535f2ef86788288518f8ffef08bdc429 Mon Sep 17 00:00:00 2001
From: Rahul Goel <goel.rahul4200@gmail.com>
Date: Thu, 18 Jun 2026 21:35:54 -0400
Subject: [PATCH 1/6] feat(string_util): Unicode-aware ToLower via utf8proc

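Replace the ASCII-only ToLower with utf8proc simple (1:1) case mapping so
case-insensitive name handling closely follows Iceberg Java's
toLowerCase(Locale.ROOT); results differ for a few code points (e.g.
U+0130) where Java applies its full case mapping. Invalid UTF-8 input is
returned unchanged. ToUpper stays ASCII-only since it is not used for name
matching. EqualsIgnoreCase now compares the lowercased forms.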

Wire utf8proc into both the CMake (vendored/system) and Meson builds.

See https://github.com/apache/iceberg-cpp/issues/613.
---
 .../IcebergThirdpartyToolchain.cmake          | 57 +++++++++++++++++++
 src/iceberg/CMakeLists.txt                    |  8 ++-
 src/iceberg/meson.build                       |  9 ++-
 src/iceberg/test/string_util_test.cc          | 45 ++++++++++-----
 src/iceberg/util/string_util.cc               | 31 ++++++++++
 src/iceberg/util/string_util.h                | 33 +++++------
 subprojects/utf8proc.wrap                     | 30 ++++++++++
 7 files changed, 177 insertions(+), 36 deletions(-)
 create mode 100644 subprojects/utf8proc.wrap

diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake
index 8e10fd8ec..982390332 100644
--- a/cmake_modules/IcebergThirdpartyToolchain.cmake
+++ b/cmake_modules/IcebergThirdpartyToolchain.cmake
@@ -421,6 +421,62 @@ function(resolve_croaring_dependency)
       PARENT_SCOPE)
 endfunction()
 
+# ----------------------------------------------------------------------
+# utf8proc
+
+function(resolve_utf8proc_dependency)
+  prepare_fetchcontent()
+
+  if(DEFINED ENV{ICEBERG_UTF8PROC_URL})
+    set(UTF8PROC_URL "$ENV{ICEBERG_UTF8PROC_URL}")
+  else()
+    set(UTF8PROC_URL
+        "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.10.0.tar.gz")
+  endif()
+
+  fetchcontent_declare(utf8proc
+                       ${FC_DECLARE_COMMON_OPTIONS}
+                       URL ${UTF8PROC_URL}
+                           FIND_PACKAGE_ARGS
+                           NAMES
+                           utf8proc
+                           CONFIG)
+  fetchcontent_makeavailable(utf8proc)
+
+  if(utf8proc_SOURCE_DIR)
+    if(NOT TARGET utf8proc::utf8proc)
+      add_library(utf8proc::utf8proc INTERFACE IMPORTED)
+      target_link_libraries(utf8proc::utf8proc INTERFACE utf8proc)
+      target_include_directories(utf8proc::utf8proc INTERFACE ${utf8proc_SOURCE_DIR})
+    endif()
+
+    set(UTF8PROC_VENDORED TRUE)
+    # utf8proc's CMake puts a raw build-tree path in INTERFACE_INCLUDE_DIRECTORIES, which
+    # install(EXPORT) rejects. Wrap it in BUILD_INTERFACE so the export is valid; utf8proc
+    # is a private dependency, so installed consumers never need its headers.
+    set_target_properties(utf8proc
+                          PROPERTIES OUTPUT_NAME "iceberg_vendored_utf8proc"
+                                     POSITION_INDEPENDENT_CODE ON
+                                     INTERFACE_INCLUDE_DIRECTORIES
+                                     "$<BUILD_INTERFACE:${utf8proc_SOURCE_DIR}>")
+    install(TARGETS utf8proc
+            EXPORT iceberg_targets
+            RUNTIME DESTINATION "${ICEBERG_INSTALL_BINDIR}"
+            ARCHIVE DESTINATION "${ICEBERG_INSTALL_LIBDIR}"
+            LIBRARY DESTINATION "${ICEBERG_INSTALL_LIBDIR}")
+  else()
+    set(UTF8PROC_VENDORED FALSE)
+    list(APPEND ICEBERG_SYSTEM_DEPENDENCIES utf8proc)
+  endif()
+
+  set(ICEBERG_SYSTEM_DEPENDENCIES
+      ${ICEBERG_SYSTEM_DEPENDENCIES}
+      PARENT_SCOPE)
+  set(UTF8PROC_VENDORED
+      ${UTF8PROC_VENDORED}
+      PARENT_SCOPE)
+endfunction()
+
 # ----------------------------------------------------------------------
 # nlohmann-json
 
@@ -719,6 +775,7 @@ endfunction()
 resolve_zlib_dependency()
 resolve_nanoarrow_dependency()
 resolve_croaring_dependency()
+resolve_utf8proc_dependency()
 resolve_nlohmann_json_dependency()
 resolve_spdlog_dependency()
 
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 04a9322a1..a14c52729 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -145,23 +145,27 @@ list(APPEND
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,nanoarrow::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_static>,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>"
      nlohmann_json::nlohmann_json
      spdlog::spdlog
+     utf8proc::utf8proc
      ZLIB::ZLIB)
 list(APPEND
      ICEBERG_SHARED_BUILD_INTERFACE_LIBS
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,nanoarrow::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_shared>,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>"
      nlohmann_json::nlohmann_json
      spdlog::spdlog
+     utf8proc::utf8proc
      ZLIB::ZLIB)
 list(APPEND
      ICEBERG_STATIC_INSTALL_INTERFACE_LIBS
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,iceberg::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_static>,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>"
      "$<IF:$<BOOL:${NLOHMANN_JSON_VENDORED}>,iceberg::nlohmann_json,$<IF:$<TARGET_EXISTS:nlohmann_json::nlohmann_json>,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>"
-     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>")
+     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>"
+     "$<IF:$<BOOL:${UTF8PROC_VENDORED}>,iceberg::utf8proc,utf8proc::utf8proc>")
 list(APPEND
      ICEBERG_SHARED_INSTALL_INTERFACE_LIBS
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,iceberg::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_shared>,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>"
      "$<IF:$<BOOL:${NLOHMANN_JSON_VENDORED}>,iceberg::nlohmann_json,$<IF:$<TARGET_EXISTS:nlohmann_json::nlohmann_json>,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>"
-     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>")
+     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>"
+     "$<IF:$<BOOL:${UTF8PROC_VENDORED}>,iceberg::utf8proc,utf8proc::utf8proc>")
 
 add_iceberg_lib(iceberg
                 SOURCES
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index f0b103828..f69ce36c0 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -190,8 +190,15 @@ nanoarrow_dep = dependency('nanoarrow')
 nlohmann_json_dep = dependency('nlohmann_json')
 spdlog_dep = dependency('spdlog')
 zlib_dep = dependency('zlib')
+utf8proc_dep = dependency('libutf8proc')
 
-iceberg_deps = [nanoarrow_dep, nlohmann_json_dep, spdlog_dep, zlib_dep]
+iceberg_deps = [
+    nanoarrow_dep,
+    nlohmann_json_dep,
+    spdlog_dep,
+    zlib_dep,
+    utf8proc_dep,
+]
 
 iceberg_lib = library(
     'iceberg',
diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc
index a3fd03760..3d4422f42 100644
--- a/src/iceberg/test/string_util_test.cc
+++ b/src/iceberg/test/string_util_test.cc
@@ -41,19 +41,30 @@ TEST(StringUtilsTest, ToUpper) {
   ASSERT_EQ(StringUtils::ToUpper("123"), "123");
 }
 
-// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative when stored
-// in a signed char. Only ASCII letters are converted; multibyte bytes pass through
-// unchanged. The non-ASCII strings are written as explicit UTF-8 byte escapes so the test
-// does not depend on the source-file encoding. See
-// https://github.com/apache/iceberg-cpp/issues/613.
-TEST(StringUtilsTest, NonAsciiPassThrough) {
-  // "Naïve" -> "naïve" (ï = U+00EF = 0xC3 0xAF; only the ASCII letters change).
-  ASSERT_EQ(StringUtils::ToLower("Na\xC3\xAFve"), "na\xC3\xAFve");
-  // "café" -> "CAFé" (é = U+00E9 = 0xC3 0xA9 stays unchanged).
-  ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9");
-  // "日本語" (0xE6 0x97 0xA5 0xE6 0x9C 0xAC 0xE8 0xAA 0x9E) is returned verbatim.
+// Non-ASCII strings are written as explicit UTF-8 byte escapes so the test does not
+// depend on the source-file encoding. An escape is split before a following hex digit
+// (e.g. "...\x9E" "E") so the \x does not absorb it.
+// See https://github.com/apache/iceberg-cpp/issues/613.
+TEST(StringUtilsTest, ToLowerUnicode) {
+  // "CAFÉ" -> "café" (É U+00C9 = 0xC3 0x89 -> é U+00E9 = 0xC3 0xA9).
+  ASSERT_EQ(StringUtils::ToLower("CAF\xC3\x89"), "caf\xC3\xA9");
+  // "GROẞE" -> "große": capital sharp S (ẞ U+1E9E) lower-cases to ß (U+00DF), not "ss"
+  // as casefolding would produce.
+  ASSERT_EQ(StringUtils::ToLower("GRO\xE1\xBA\x9E"
+                                 "E"),
+            "gro\xC3\x9F"
+            "e");
+  // "日本語" has no case mapping and is returned verbatim.
   ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),
             "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E");
+  // Invalid UTF-8 (a lone 0xFF byte) is returned unchanged rather than erroring.
+  ASSERT_EQ(StringUtils::ToLower("\xFF"), "\xFF");
+}
+
+// ToUpper is intentionally ASCII-only; non-ASCII (multibyte UTF-8) bytes pass through.
+TEST(StringUtilsTest, ToUpperAsciiOnly) {
+  // "café" -> "CAFé" (é stays unchanged).
+  ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9");
   ASSERT_EQ(StringUtils::ToUpper("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),
             "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E");
 }
@@ -63,9 +74,15 @@ TEST(StringUtilsTest, EqualsIgnoreCase) {
   ASSERT_TRUE(StringUtils::EqualsIgnoreCase("", ""));
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abcd"));
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abd"));
-  // ASCII case is folded; non-ASCII bytes are compared as-is. ("Café" vs "café")
-  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("Caf\xC3\xA9", "caf\xC3\xA9"));
-  // "café" vs "cafe": the multibyte é differs from ASCII 'e'.
+  // Unicode-aware: "CAFÉ" matches "café".
+  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("CAF\xC3\x89", "caf\xC3\xA9"));
+  // "GROẞE" matches "große" under lowercasing (ẞ -> ß).
+  ASSERT_TRUE(
+      StringUtils::EqualsIgnoreCase("GRO\xE1\xBA\x9E"
+                                    "E",
+                                    "gro\xC3\x9F"
+                                    "e"));
+  // Different letters still differ ("café" vs "cafe").
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe"));
 }
 
diff --git a/src/iceberg/util/string_util.cc b/src/iceberg/util/string_util.cc
index 0454a62b5..00b938a07 100644
--- a/src/iceberg/util/string_util.cc
+++ b/src/iceberg/util/string_util.cc
@@ -19,10 +19,41 @@
 
 #include "iceberg/util/string_util.h"
 
+#include <utf8proc.h>
+
+#include <array>
+
 #include "iceberg/util/macros.h"
 
 namespace iceberg {
 
+std::string StringUtils::ToLower(std::string_view str) {
+  std::string result;
+  result.reserve(str.size());
+
+  const auto* data = reinterpret_cast<const utf8proc_uint8_t*>(str.data());
+  const auto size = static_cast<utf8proc_ssize_t>(str.size());
+  utf8proc_ssize_t offset = 0;
+  while (offset < size) {
+    utf8proc_int32_t code_point = 0;
+    utf8proc_ssize_t consumed =
+        utf8proc_iterate(data + offset, size - offset, &code_point);
+    if (consumed < 0) {
+      // Invalid UTF-8: return the input unchanged rather than erroring.
+      return std::string(str);
+    }
+    // utf8proc has no string-level lower-case helper, so map and re-encode each code
+    // point individually. utf8proc_tolower is a simple 1:1 mapping (not casefolding).
+    const utf8proc_int32_t lowered = utf8proc_tolower(code_point);
+    std::array<utf8proc_uint8_t, 4> encoded{};
+    const utf8proc_ssize_t written = utf8proc_encode_char(lowered, encoded.data());
+    result.append(reinterpret_cast<const char*>(encoded.data()),
+                  static_cast<size_t>(written));
+    offset += consumed;
+  }
+  return result;
+}
+
 Result<std::vector<uint8_t>> StringUtils::HexStringToBytes(std::string_view hex) {
   if (hex.size() % 2 != 0) [[unlikely]] {
     return InvalidArgument("Hex string must have even length, got: {}", hex.size());
diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h
index 01b6087b8..afb28bf2a 100644
--- a/src/iceberg/util/string_util.h
+++ b/src/iceberg/util/string_util.h
@@ -20,7 +20,6 @@
 #pragma once
 
 #include <algorithm>
-#include <cctype>
 #include <cerrno>
 #include <charconv>
 #include <ranges>
@@ -41,22 +40,24 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); };
 
 class ICEBERG_EXPORT StringUtils {
  public:
-  // NOTE: These convert ASCII letters only; all other bytes, including non-ASCII
-  // (multibyte UTF-8) bytes, are passed through unchanged.
-  // See https://github.com/apache/iceberg-cpp/issues/613.
-  static std::string ToLower(std::string_view str) {
-    return str | std::ranges::views::transform(ToLowerAscii) |
-           std::ranges::to<std::string>();
-  }
-
+  /// \brief Lower-case a UTF-8 string using Unicode simple case mapping.
+  ///
+  /// Mirrors Iceberg Java's case-insensitive handling, which lower-cases names with
+  /// toLowerCase(Locale.ROOT). Invalid UTF-8 input is returned unchanged.
+  /// See https://github.com/apache/iceberg-cpp/issues/613.
+  static std::string ToLower(std::string_view str);
+
+  /// \brief Upper-case ASCII letters; non-ASCII (multibyte UTF-8) bytes pass through
+  /// unchanged.
+  ///
+  /// Unlike ToLower this is ASCII-only, since upper-casing is not used for name matching.
   static std::string ToUpper(std::string_view str) {
     return str | std::ranges::views::transform(ToUpperAscii) |
            std::ranges::to<std::string>();
   }
 
   static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) {
-    return std::ranges::equal(
-        lhs, rhs, [](char lc, char rc) { return ToLowerAscii(lc) == ToLowerAscii(rc); });
+    return ToLower(lhs) == ToLower(rhs);
   }
 
   static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) {
@@ -134,14 +135,8 @@ class ICEBERG_EXPORT StringUtils {
   }
 
  private:
-  // ASCII-only case conversion using explicit range checks rather than
-  // std::tolower/std::toupper. This is independent of the current C locale and never
-  // touches non-ASCII (high-bit) bytes, so multibyte UTF-8 sequences are preserved. It
-  // also sidesteps the undefined behavior of passing a negative char to <cctype>.
-  static constexpr char ToLowerAscii(char c) noexcept {
-    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
-  }
-
+  // Avoids std::toupper, which is locale-dependent and has undefined behavior for
+  // negative char values.
   static constexpr char ToUpperAscii(char c) noexcept {
     return (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c;
   }
diff --git a/subprojects/utf8proc.wrap b/subprojects/utf8proc.wrap
new file mode 100644
index 000000000..9b33b3bea
--- /dev/null
+++ b/subprojects/utf8proc.wrap
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[wrap-file]
+directory = utf8proc-2.10.0
+source_url = https://github.com/JuliaStrings/utf8proc/releases/download/v2.10.0/utf8proc-2.10.0.tar.gz
+source_filename = utf8proc-2.10.0.tar.gz
+source_hash = 276a37dc4d1dd24d7896826a579f4439d1e5fe33603add786bb083cab802e23e
+patch_filename = utf8proc_2.10.0-1_patch.zip
+patch_url = https://wrapdb.mesonbuild.com/v2/utf8proc_2.10.0-1/get_patch
+patch_hash = be16c4514603e922f9636045699fe1a6f844d340b9b7c14b809e47253b06a844
+source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/utf8proc_2.10.0-1/utf8proc-2.10.0.tar.gz
+wrapdb_version = 2.10.0-1
+
+[provide]
+libutf8proc = utf8proc_dep

From b8639d6801e2570a6a3ba9d05e53fd21797218dc Mon Sep 17 00:00:00 2001
From: Rahul Goel <goel.rahul4200@gmail.com>
Date: Thu, 18 Jun 2026 23:51:09 -0400
Subject: [PATCH 2/6] chore(license): add utf8proc license text to LICENSE

---
 LICENSE | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

diff --git a/LICENSE b/LICENSE
index 374b7fc58..8d8d5ff4e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -228,3 +228,95 @@ Home page: https://arrow.apache.org/
 License: https://www.apache.org/licenses/LICENSE-2.0
 
 --------------------------------------------------------------------------------
+
+This product bundles utf8proc, which is available under the MIT License:
+
+utf8proc is a software package originally developed by Jan Behrens and the rest
+of the Public Software Group, now maintained by the Julia-language developers.
+All new work on the utf8proc library is licensed under the MIT "expat" license:
+
+Copyright (c) 2014-2021 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas
+Fonseca, and other contributors listed in the git history.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+The original utf8proc is licensed under the same MIT "expat" license:
+
+Copyright (c) 2009, 2013 Public Software Group e. V., Berlin, Germany
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+utf8proc also contains data derived from the Unicode data files. The following
+license applies to that data:
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed under
+the Terms of Use in http://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of the Unicode data files and any associated documentation (the "Data
+Files") or Unicode software and any associated documentation (the
+"Software") to deal in the Data Files or Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, and/or sell copies of the Data Files or Software, and
+to permit persons to whom the Data Files or Software are furnished to do
+so, provided that (a) the above copyright notice(s) and this permission
+notice appear with all copies of the Data Files or Software, (b) both the
+above copyright notice(s) and this permission notice appear in associated
+documentation, and (c) there is clear notice in each modified Data File or
+in the Software as well as in the documentation associated with the Data
+File(s) or Software that the data or software has been modified.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
+CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall
+not be used in advertising or otherwise to promote the sale, use or other
+dealings in these Data Files or Software without prior written
+authorization of the copyright holder.
+
+Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be
+registered in some jurisdictions. All other trademarks and registered
+trademarks mentioned herein are the property of their respective owners.
+
+--------------------------------------------------------------------------------

From 9ffebc36be071484b38cde0575793ab411a957ab Mon Sep 17 00:00:00 2001
From: Rahul Goel <goel.rahul4200@gmail.com>
Date: Thu, 25 Jun 2026 20:51:47 -0400
Subject: [PATCH 3/6] docs(string_util): clarify case-mapping semantics in
 ToLower/ToUpper

ToLower: note it uses Unicode simple (1:1) case mapping and document where
it diverges from Java's full toLowerCase(Locale.ROOT) (e.g. U+0130). ToUpper:
spell out the ASCII-only behavior and why no Unicode variant is provided.
Also document EqualsIgnoreCase inheriting ToLower's mapping.

Addresses API review comments on #760.
---
 src/iceberg/util/string_util.h | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h
index afb28bf2a..0c637803a 100644
--- a/src/iceberg/util/string_util.h
+++ b/src/iceberg/util/string_util.h
@@ -40,22 +40,34 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); };
 
 class ICEBERG_EXPORT StringUtils {
  public:
-  /// \brief Lower-case a UTF-8 string using Unicode simple case mapping.
+  /// \brief Lower-case a UTF-8 string using Unicode simple (1:1) case mapping.
   ///
-  /// Mirrors Iceberg Java's case-insensitive handling, which lower-cases names with
-  /// toLowerCase(Locale.ROOT). Invalid UTF-8 input is returned unchanged.
+  /// Intended for case-insensitive name matching, similar to Iceberg Java's
+  /// toLowerCase(Locale.ROOT). The mapping is locale-independent, matching the intent
+  /// of Locale.ROOT. It uses simple (1:1) case mapping rather than Java's full case
+  /// mapping, so results differ for a few code points; e.g. U+0130 (capital I with dot
+  /// above) maps to U+0069 ("i") here, but to U+0069 U+0307 ("i" + combining dot above)
+  /// in Java. For ASCII and the large majority of letters the two agree.
+  ///
+  /// Invalid UTF-8 input is returned unchanged.
   /// See https://github.com/apache/iceberg-cpp/issues/613.
   static std::string ToLower(std::string_view str);
 
-  /// \brief Upper-case ASCII letters; non-ASCII (multibyte UTF-8) bytes pass through
-  /// unchanged.
+  /// \brief Upper-case the ASCII letters (a-z) in a string; all other bytes, including
+  /// multi-byte UTF-8 sequences, are left unchanged.
   ///
-  /// Unlike ToLower this is ASCII-only, since upper-casing is not used for name matching.
+  /// Deliberately ASCII-only and, unlike ToLower, not Unicode-aware. It is only used to
+  /// normalize ASCII enum/codec strings (e.g. "gzip" -> "GZIP", "all" -> "ALL") for
+  /// case-insensitive comparison. A Unicode upper-case is intentionally not provided:
+  /// simple case mapping would be wrong for some letters (e.g. "ß" (U+00DF) would stay
+  /// unchanged instead of becoming "SS"), and no caller needs it.
   static std::string ToUpper(std::string_view str) {
     return str | std::ranges::views::transform(ToUpperAscii) |
            std::ranges::to<std::string>();
   }
 
+  /// \brief Case-insensitive equality; compares the ToLower forms of both operands and
+  /// therefore inherits ToLower's Unicode simple-mapping behavior.
   static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) {
     return ToLower(lhs) == ToLower(rhs);
   }

From 46165c6a46ad2c06f6a101df0761f618ee32c3b8 Mon Sep 17 00:00:00 2001
From: Rahul Goel <goel.rahul4200@gmail.com>
Date: Sat, 27 Jun 2026 11:27:35 -0400
Subject: [PATCH 4/6] test(string_util): add failing regression test for
 StartsWithIgnoreCase
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The byte-slice in StartsWithIgnoreCase (str.substr(0, prefix.size()) before
lowercasing) is wrong when ToLower changes byte length: "İ" (U+0130) is two
bytes but lower-cases to "i", so "İx" should match prefix "i" but does not,
and the size guard rejects "i" against prefix "İ" outright. This test pins
the expected behavior; it fails against the current implementation and is
fixed by the following commit.

Relates to #760.
---
 src/iceberg/test/string_util_test.cc | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc
index 3d4422f42..fb16a80d4 100644
--- a/src/iceberg/test/string_util_test.cc
+++ b/src/iceberg/test/string_util_test.cc
@@ -86,4 +86,25 @@ TEST(StringUtilsTest, EqualsIgnoreCase) {
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe"));
 }
 
+TEST(StringUtilsTest, StartsWithIgnoreCase) {
+  ASSERT_TRUE(StringUtils::StartsWithIgnoreCase("AbCdef", "abc"));
+  ASSERT_TRUE(StringUtils::StartsWithIgnoreCase("abc", "ABC"));
+  ASSERT_FALSE(StringUtils::StartsWithIgnoreCase("abc", "abd"));
+  // Empty prefix always matches; a prefix longer than the string does not.
+  ASSERT_TRUE(StringUtils::StartsWithIgnoreCase("abc", ""));
+  ASSERT_FALSE(StringUtils::StartsWithIgnoreCase("ab", "abcd"));
+  // Regression (#760): lower-casing can change byte length, so the prefix must not be
+  // matched by byte-slicing. "İ" (U+0130 = 0xC4 0xB0) lower-cases to "i", so "İx"
+  // starts with "i" ...
+  ASSERT_TRUE(StringUtils::StartsWithIgnoreCase("\xC4\xB0x", "i"));
+  // ... and "i" starts with "İ" (both lower-case to "i"), which the old byte-length
+  // guard wrongly rejected.
+  ASSERT_TRUE(StringUtils::StartsWithIgnoreCase("i", "\xC4\xB0"));
+  // A matching Unicode prefix: "CAFÉbar" starts with "café".
+  ASSERT_TRUE(
+      StringUtils::StartsWithIgnoreCase("CAF\xC3\x89"
+                                        "bar",
+                                        "caf\xC3\xA9"));
+}
+
 }  // namespace iceberg

From 819878c6b5695d8d9d972ff010a9e35338be615c Mon Sep 17 00:00:00 2001
From: Rahul Goel <goel.rahul4200@gmail.com>
Date: Sat, 27 Jun 2026 11:31:00 -0400
Subject: [PATCH 5/6] fix(string_util): make StartsWithIgnoreCase handle UTF-8
 length changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compare the ToLower forms of both inputs instead of byte-slicing str to
prefix.size() before lowercasing. ToLower can change a string's byte length
(e.g. "İ" U+0130 lower-cases to "i"), so the old slice could split a code
point or wrongly reject a valid match. Makes the regression test from the
previous commit pass.

Relates to #760.
---
 src/iceberg/util/string_util.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h
index 0c637803a..f3807364e 100644
--- a/src/iceberg/util/string_util.h
+++ b/src/iceberg/util/string_util.h
@@ -72,11 +72,14 @@ class ICEBERG_EXPORT StringUtils {
     return ToLower(lhs) == ToLower(rhs);
   }
 
+  /// \brief Case-insensitive prefix test, comparing the ToLower forms of both inputs.
+  ///
+  /// Inherits ToLower's Unicode simple-mapping behavior. The whole strings are
+  /// lower-cased rather than byte-slicing str to prefix.size(), because ToLower can
+  /// change a string's byte length (e.g. "İ" (U+0130) is two bytes but maps to "i"),
+  /// so a byte slice could split a code point or reject a valid match.
   static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) {
-    if (str.size() < prefix.size()) {
-      return false;
-    }
-    return EqualsIgnoreCase(str.substr(0, prefix.size()), prefix);
+    return ToLower(str).starts_with(ToLower(prefix));
   }
 
   /// \brief Count the number of code points in a UTF-8 string.

From 3e063aea8542d0f807e54b31853a3522f0c6d2ed Mon Sep 17 00:00:00 2001
From: Rahul Goel <goel.rahul4200@gmail.com>
Date: Fri, 3 Jul 2026 11:45:36 -0400
Subject: [PATCH 6/6] refactor: ASCII fast path + build polish

---
 README.md                                     |  1 +
 .../IcebergThirdpartyToolchain.cmake          | 36 ++++---
 mkdocs/docs/getting-started.md                |  1 +
 src/iceberg/meson.build                       | 15 ++-
 src/iceberg/test/string_util_test.cc          | 97 ++++++++++++++++++-
 src/iceberg/util/string_util.cc               | 21 +++-
 src/iceberg/util/string_util.h                | 65 ++++++++++---
 7 files changed, 207 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 7c9a343ec..1318933d9 100644
--- a/README.md
+++ b/README.md
@@ -151,6 +151,7 @@ If you experience network issues when downloading dependencies, you can customiz
 - `ICEBERG_AVRO_GIT_URL`: Apache Avro git repository URL
 - `ICEBERG_NANOARROW_URL`: Nanoarrow tarball URL
 - `ICEBERG_CROARING_URL`: CRoaring tarball URL
+- `ICEBERG_UTF8PROC_URL`: utf8proc tarball URL
 - `ICEBERG_NLOHMANN_JSON_URL`: nlohmann-json tarball URL
 - `ICEBERG_SPDLOG_URL`: spdlog tarball URL
 - `ICEBERG_CPR_URL`: cpr tarball URL
diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake
index 982390332..3d3fd4f8f 100644
--- a/cmake_modules/IcebergThirdpartyToolchain.cmake
+++ b/cmake_modules/IcebergThirdpartyToolchain.cmake
@@ -72,6 +72,7 @@ endfunction()
 # ICEBERG_AVRO_GIT_URL       - Apache Avro git repository URL
 # ICEBERG_NANOARROW_URL      - Nanoarrow tarball URL
 # ICEBERG_CROARING_URL       - CRoaring tarball URL
+# ICEBERG_UTF8PROC_URL       - utf8proc tarball URL
 # ICEBERG_NLOHMANN_JSON_URL  - nlohmann-json tarball URL
 # ICEBERG_SPDLOG_URL         - spdlog tarball URL
 # ICEBERG_CPR_URL            - cpr tarball URL
@@ -109,6 +110,20 @@ else()
   )
 endif()
 
+set(ICEBERG_UTF8PROC_BUILD_VERSION "2.10.0")
+set(ICEBERG_UTF8PROC_BUILD_SHA256_CHECKSUM
+    "276a37dc4d1dd24d7896826a579f4439d1e5fe33603add786bb083cab802e23e")
+
+if(DEFINED ENV{ICEBERG_UTF8PROC_URL})
+  set(UTF8PROC_SOURCE_URL "$ENV{ICEBERG_UTF8PROC_URL}")
+else()
+  # Use the release asset (stable bytes, matching subprojects/utf8proc.wrap) rather
+  # than the auto-generated tag archive, whose contents GitHub does not guarantee.
+  set(UTF8PROC_SOURCE_URL
+      "https://github.com/JuliaStrings/utf8proc/releases/download/v${ICEBERG_UTF8PROC_BUILD_VERSION}/utf8proc-${ICEBERG_UTF8PROC_BUILD_VERSION}.tar.gz"
+  )
+endif()
+
 # ----------------------------------------------------------------------
 # FetchContent
 
@@ -427,20 +442,19 @@ endfunction()
 function(resolve_utf8proc_dependency)
   prepare_fetchcontent()
 
-  if(DEFINED ENV{ICEBERG_UTF8PROC_URL})
-    set(UTF8PROC_URL "$ENV{ICEBERG_UTF8PROC_URL}")
-  else()
-    set(UTF8PROC_URL
-        "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.10.0.tar.gz")
-  endif()
+  # The vendored build needs no install rules; without this, CMake < 3.28 (where
+  # FetchContent has no EXCLUDE_FROM_ALL) would install utf8proc's headers and
+  # pkg-config file into the iceberg install prefix.
+  set(UTF8PROC_INSTALL OFF)
 
   fetchcontent_declare(utf8proc
                        ${FC_DECLARE_COMMON_OPTIONS}
-                       URL ${UTF8PROC_URL}
-                           FIND_PACKAGE_ARGS
-                           NAMES
-                           utf8proc
-                           CONFIG)
+                       URL ${UTF8PROC_SOURCE_URL}
+                       URL_HASH "SHA256=${ICEBERG_UTF8PROC_BUILD_SHA256_CHECKSUM}"
+                       FIND_PACKAGE_ARGS
+                       NAMES
+                       utf8proc
+                       CONFIG)
   fetchcontent_makeavailable(utf8proc)
 
   if(utf8proc_SOURCE_DIR)
diff --git a/mkdocs/docs/getting-started.md b/mkdocs/docs/getting-started.md
index 10a1a5c90..52a04e635 100644
--- a/mkdocs/docs/getting-started.md
+++ b/mkdocs/docs/getting-started.md
@@ -143,6 +143,7 @@ If you experience network issues when downloading dependencies, you can override
 | `ICEBERG_AVRO_GIT_URL` | Apache Avro git repository |
 | `ICEBERG_NANOARROW_URL` | Nanoarrow tarball |
 | `ICEBERG_CROARING_URL` | CRoaring tarball |
+| `ICEBERG_UTF8PROC_URL` | utf8proc tarball |
 | `ICEBERG_NLOHMANN_JSON_URL` | nlohmann-json tarball |
 | `ICEBERG_CPR_URL` | cpr tarball |
 
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index f69ce36c0..57af0ea84 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -189,15 +189,26 @@ croaring_dep = dependency('croaring', static: croaring_needs_static)
 nanoarrow_dep = dependency('nanoarrow')
 nlohmann_json_dep = dependency('nlohmann_json')
 spdlog_dep = dependency('spdlog')
+# utf8proc's header declares its functions __declspec(dllimport) on Windows unless
+# UTF8PROC_STATIC is defined, and the wrap does not propagate that define to consumers.
+# Define it whenever utf8proc is linked statically, so the header's declarations match
+# how it is linked. Harmless on other platforms.
+utf8proc_needs_static = get_option('default_library') == 'static'
+utf8proc_dep = dependency('libutf8proc', static: utf8proc_needs_static)
+if utf8proc_needs_static
+    utf8proc_dep = declare_dependency(
+        compile_args: ['-DUTF8PROC_STATIC'],
+        dependencies: utf8proc_dep,
+    )
+endif
 zlib_dep = dependency('zlib')
-utf8proc_dep = dependency('libutf8proc')
 
 iceberg_deps = [
     nanoarrow_dep,
     nlohmann_json_dep,
     spdlog_dep,
-    zlib_dep,
     utf8proc_dep,
+    zlib_dep,
 ]
 
 iceberg_lib = library(
diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc
index fb16a80d4..65d5c23cc 100644
--- a/src/iceberg/test/string_util_test.cc
+++ b/src/iceberg/test/string_util_test.cc
@@ -57,8 +57,31 @@ TEST(StringUtilsTest, ToLowerUnicode) {
   // "日本語" has no case mapping and is returned verbatim.
   ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),
             "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E");
-  // Invalid UTF-8 (a lone 0xFF byte) is returned unchanged rather than erroring.
+  // ASCII bytes take the byte-wise fast path; only the non-ASCII code point goes through
+  // utf8proc. "ABÉ" -> "abé".
+  ASSERT_EQ(StringUtils::ToLower("AB\xC3\x89"), "ab\xC3\xA9");
+  // An invalid UTF-8 byte (a lone 0xFF) passes through unchanged rather than erroring.
   ASSERT_EQ(StringUtils::ToLower("\xFF"), "\xFF");
+  // An invalid byte only passes through itself; the valid code points around it are
+  // still lower-cased ("AB" 0xFF "CÉ" -> "ab" 0xFF "cé").
+  ASSERT_EQ(StringUtils::ToLower("AB\xFF"
+                                 "C\xC3\x89"),
+            "ab\xFF"
+            "c\xC3\xA9");
+  // The invalid byte can abut a multi-byte code point with no ASCII between them; 0xFF
+  // passes through and the adjacent "É" still lower-cases to "é" (0xFF "É" -> 0xFF "é").
+  ASSERT_EQ(StringUtils::ToLower("\xFF\xC3\x89"), "\xFF\xC3\xA9");
+  // A truncated multi-byte sequence (0xC3 with no continuation byte) passes through
+  // without consuming the bytes after it.
+  ASSERT_EQ(StringUtils::ToLower("\xC3"
+                                 "AB"),
+            "\xC3"
+            "ab");
+  // A stray continuation byte (0x80) behaves the same way.
+  ASSERT_EQ(StringUtils::ToLower("A\x80"
+                                 "B"),
+            "a\x80"
+            "b");
 }
 
 // ToUpper is intentionally ASCII-only; non-ASCII (multibyte UTF-8) bytes pass through.
@@ -84,6 +107,23 @@ TEST(StringUtilsTest, EqualsIgnoreCase) {
                                     "e"));
   // Different letters still differ ("café" vs "cafe").
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe"));
+  // Fallback correctness: an ASCII operand can equal a non-ASCII one once lower-cased,
+  // even though their raw byte lengths differ. "İ" (U+0130 = 0xC4 0xB0, two bytes)
+  // lower-cases to one-byte "i", so it must compare equal to "i" and "I".
+  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("i", "\xC4\xB0"));
+  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("\xC4\xB0", "I"));
+  // The non-ASCII byte can appear after a matching ASCII prefix ("abi" vs "abİ").
+  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("abi", "ab\xC4\xB0"));
+  // Pure-ASCII operands that share a prefix but differ in length are not equal.
+  ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "ab"));
+  // Operands containing invalid UTF-8 are still compared case-insensitively on their
+  // valid parts; the invalid bytes themselves compare verbatim.
+  ASSERT_TRUE(
+      StringUtils::EqualsIgnoreCase("AB\xFF"
+                                    "C",
+                                    "ab\xFF"
+                                    "c"));
+  ASSERT_FALSE(StringUtils::EqualsIgnoreCase("\xFF", "\xFE"));
 }
 
 TEST(StringUtilsTest, StartsWithIgnoreCase) {
@@ -105,6 +145,61 @@ TEST(StringUtilsTest, StartsWithIgnoreCase) {
       StringUtils::StartsWithIgnoreCase("CAF\xC3\x89"
                                         "bar",
                                         "caf\xC3\xA9"));
+  // Invalid UTF-8 bytes compare verbatim in the prefix as well.
+  ASSERT_TRUE(
+      StringUtils::StartsWithIgnoreCase("AB\xFF"
+                                        "x",
+                                        "ab\xFF"));
+  ASSERT_FALSE(StringUtils::StartsWithIgnoreCase("ab\xFE", "ab\xFF"));
+}
+
+// The ASCII fast paths in EqualsIgnoreCase / StartsWithIgnoreCase must agree with their
+// documented ToLower-based semantics for every input, including length-changing case
+// mappings and invalid UTF-8. Rather than enumerate cases by hand, exhaustively compare
+// both functions against the ToLower oracle over all short strings built from a small
+// alphabet that straddles those boundaries. This is the mechanical form of the #760
+// regression, where a fast path disagreed with ToLower on a length-changing mapping.
+TEST(StringUtilsTest, IgnoreCaseAgreesWithToLowerOracle) {
+  // Atoms mix ASCII (upper/lower, including the lowercase targets of the multi-byte
+  // mappings) with a 2-byte code point that lower-cases to one byte ("İ" U+0130 -> "i"),
+  // a 3-byte one that also shrinks to one byte ("K" U+212A -> "k"), an ordinary 2-byte
+  // cased letter ("É"), and an invalid UTF-8 byte.
+  const std::vector<std::string> atoms = {
+      "a", "I", "i", "k", "\xC4\xB0", "\xE2\x84\xAA", "\xC3\x89", "\xFF"};
+
+  // Build every string of 0..3 atoms, one generation (length) at a time.
+  std::vector<std::string> inputs = {""};
+  size_t generation_begin = 0;
+  for (int len = 0; len < 3; ++len) {
+    const size_t generation_end = inputs.size();
+    for (size_t i = generation_begin; i < generation_end; ++i) {
+      for (const auto& atom : atoms) {
+        inputs.push_back(inputs[i] + atom);
+      }
+    }
+    generation_begin = generation_end;
+  }
+
+  // Precompute the oracle so the O(n^2) comparison below does not re-lower each string.
+  std::vector<std::string> lowered;
+  lowered.reserve(inputs.size());
+  for (const auto& s : inputs) {
+    lowered.push_back(StringUtils::ToLower(s));
+  }
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    for (size_t j = 0; j < inputs.size(); ++j) {
+      EXPECT_EQ(StringUtils::EqualsIgnoreCase(inputs[i], inputs[j]),
+                lowered[i] == lowered[j])
+          << "EqualsIgnoreCase disagreed for a=" << testing::PrintToString(inputs[i])
+          << " b=" << testing::PrintToString(inputs[j]);
+      EXPECT_EQ(StringUtils::StartsWithIgnoreCase(inputs[i], inputs[j]),
+                lowered[i].starts_with(lowered[j]))
+          << "StartsWithIgnoreCase disagreed for str="
+          << testing::PrintToString(inputs[i])
+          << " prefix=" << testing::PrintToString(inputs[j]);
+    }
+  }
 }
 
 }  // namespace iceberg
diff --git a/src/iceberg/util/string_util.cc b/src/iceberg/util/string_util.cc
index 00b938a07..8b98397db 100644
--- a/src/iceberg/util/string_util.cc
+++ b/src/iceberg/util/string_util.cc
@@ -31,19 +31,32 @@ std::string StringUtils::ToLower(std::string_view str) {
   std::string result;
   result.reserve(str.size());
 
+  // Lower-case ASCII bytes directly; hand non-ASCII bytes to utf8proc. The common inputs
+  // (modes, UUIDs, header/property names, enum-like strings) are pure ASCII and never
+  // touch utf8proc. utf8proc has no string-level lower-casing helper, so each non-ASCII
+  // code point is decoded, mapped with utf8proc_tolower (simple 1:1 mapping, not
+  // casefolding), and re-encoded.
   const auto* data = reinterpret_cast<const utf8proc_uint8_t*>(str.data());
   const auto size = static_cast<utf8proc_ssize_t>(str.size());
   utf8proc_ssize_t offset = 0;
   while (offset < size) {
+    // An ASCII byte is a complete 1-byte code point (never a UTF-8 continuation byte),
+    // and utf8proc_tolower agrees with ToLowerAscii on it, so handle it without utf8proc.
+    if (IsAsciiByte(str[offset])) {
+      result.push_back(ToLowerAscii(str[offset]));
+      ++offset;
+      continue;
+    }
     utf8proc_int32_t code_point = 0;
     utf8proc_ssize_t consumed =
         utf8proc_iterate(data + offset, size - offset, &code_point);
     if (consumed < 0) {
-      // Invalid UTF-8: return the input unchanged rather than erroring.
-      return std::string(str);
+      // Invalid UTF-8: pass the offending byte through unchanged and resume decoding at
+      // the next byte, so the valid code points around it are still lower-cased.
+      result.push_back(str[offset]);
+      ++offset;
+      continue;
     }
-    // utf8proc has no string-level lower-case helper, so map and re-encode each code
-    // point individually. utf8proc_tolower is a simple 1:1 mapping (not casefolding).
     const utf8proc_int32_t lowered = utf8proc_tolower(code_point);
     std::array<utf8proc_uint8_t, 4> encoded{};
     const utf8proc_ssize_t written = utf8proc_encode_char(lowered, encoded.data());
diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h
index f3807364e..62718a0ea 100644
--- a/src/iceberg/util/string_util.h
+++ b/src/iceberg/util/string_util.h
@@ -22,6 +22,7 @@
 #include <algorithm>
 #include <cerrno>
 #include <charconv>
+#include <optional>
 #include <ranges>
 #include <string>
 #include <string_view>
@@ -49,7 +50,11 @@ class ICEBERG_EXPORT StringUtils {
   /// above) maps to U+0069 ("i") here, but to U+0069 U+0307 ("i" + combining dot above)
   /// in Java. For ASCII and the large majority of letters the two agree.
   ///
-  /// Invalid UTF-8 input is returned unchanged.
+  /// Pure-ASCII input takes a byte-wise fast path; utf8proc is only invoked when a
+  /// non-ASCII byte (>= 0x80) is present. The function is total: it never fails, and
+  /// input need not be valid UTF-8. A byte that does not begin a valid UTF-8 sequence
+  /// is copied through unchanged and decoding resumes at the next byte, so the valid
+  /// code points around it are still lower-cased.
   /// See https://github.com/apache/iceberg-cpp/issues/613.
   static std::string ToLower(std::string_view str);
 
@@ -66,19 +71,30 @@ class ICEBERG_EXPORT StringUtils {
            std::ranges::to<std::string>();
   }
 
-  /// \brief Case-insensitive equality; compares the ToLower forms of both operands and
-  /// therefore inherits ToLower's Unicode simple-mapping behavior.
+  /// \brief Case-insensitive equality using Unicode simple (1:1) case mapping.
+  ///
+  /// Equal when the ToLower forms of both operands are equal, so folding follows
+  /// ToLower's rules (e.g. "İ" (U+0130) folds to "i"). Defined for any byte sequence:
+  /// ToLower passes invalid UTF-8 bytes through unchanged, so they compare verbatim.
   static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) {
-    return ToLower(lhs) == ToLower(rhs);
+    const std::optional<bool> fast = AsciiEqualsIgnoreCase(lhs, rhs);
+    return fast.has_value() ? *fast : (ToLower(lhs) == ToLower(rhs));
   }
 
-  /// \brief Case-insensitive prefix test, comparing the ToLower forms of both inputs.
+  /// \brief Case-insensitive prefix test using Unicode simple (1:1) case mapping.
   ///
-  /// Inherits ToLower's Unicode simple-mapping behavior. The whole strings are
-  /// lower-cased rather than byte-slicing str to prefix.size(), because ToLower can
-  /// change a string's byte length (e.g. "İ" (U+0130) is two bytes but maps to "i"),
-  /// so a byte slice could split a code point or reject a valid match.
+  /// True when the ToLower form of str starts with the ToLower form of prefix, so folding
+  /// follows ToLower's rules (e.g. "İ" (U+0130) folds to "i"). Defined for any byte
+  /// sequence: ToLower passes invalid UTF-8 bytes through unchanged, so they compare
+  /// verbatim.
   static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) {
+    if (prefix.size() <= str.size()) {
+      const std::optional<bool> fast =
+          AsciiEqualsIgnoreCase(str.substr(0, prefix.size()), prefix);
+      if (fast.has_value()) {
+        return *fast;
+      }
+    }
     return ToLower(str).starts_with(ToLower(prefix));
   }
 
@@ -150,11 +166,38 @@ class ICEBERG_EXPORT StringUtils {
   }
 
  private:
-  // Avoids std::toupper, which is locale-dependent and has undefined behavior for
-  // negative char values.
+  // ASCII-only case mappings. These avoid std::toupper/std::tolower, which are
+  // locale-dependent and have undefined behavior for negative char values.
   static constexpr char ToUpperAscii(char c) noexcept {
     return (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c;
   }
+  static constexpr char ToLowerAscii(char c) noexcept {
+    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
+  }
+
+  // True if c is a 7-bit ASCII byte (high bit clear). The cast keeps this independent of
+  // char's signedness: where char is signed, bytes >= 0x80 are negative values.
+  static constexpr bool IsAsciiByte(char c) noexcept {
+    return (static_cast<unsigned char>(c) & 0x80) == 0;
+  }
+
+  // Case-insensitive equality decided in a single byte-wise pass, without allocating.
+  // Returns nullopt if a non-ASCII byte is reached before any mismatch, because folding
+  // can then be length-changing (e.g. "İ" (U+0130) -> "i"), which only ToLower knows.
+  // Only the common prefix is scanned; if it matches, the result is whether sizes agree.
+  static std::optional<bool> AsciiEqualsIgnoreCase(std::string_view a,
+                                                   std::string_view b) {
+    const size_t n = std::min(a.size(), b.size());
+    for (size_t i = 0; i < n; ++i) {
+      if (!IsAsciiByte(a[i]) || !IsAsciiByte(b[i])) {
+        return std::nullopt;
+      }
+      if (ToLowerAscii(a[i]) != ToLowerAscii(b[i])) {
+        return false;
+      }
+    }
+    return a.size() == b.size();
+  }
 };
 
 /// \brief Transparent hash function that supports std::string_view as lookup key