From f16a42786ace612bf04a8e63bd615bd57acf3a35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 8 Apr 2026 09:19:56 +0200 Subject: [PATCH 1/4] Optimize regexp_replace by stripping trailing .* from anchored patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For anchored patterns like `^...(capture)....*$` where the replacement is `\1`, build a shorter regex (stripping trailing `.*$`) and use `captures_read` with `CaptureLocations` for direct extraction — no `expand()`, no `String` allocation. 2.4x improvement. --- .../functions/src/regex/regexpreplace.rs | 103 +++++++++++++++--- .../test_files/regexp/regexp_replace.slt | 15 +++ 2 files changed, 105 insertions(+), 13 deletions(-) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 176f704a65f35..6a14743f7cddc 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -16,6 +16,8 @@ // under the License. //! Regex expressions +use memchr::memchr; + use arrow::array::ArrayDataBuilder; use arrow::array::BufferBuilder; use arrow::array::GenericStringArray; @@ -199,6 +201,22 @@ fn regex_replace_posix_groups(replacement: &str) -> String { .into_owned() } +/// For anchored patterns like `^...(capture)....*$` where the replacement +/// is `\1`, build a shorter regex (stripping trailing `.*$`) and use +/// `captures_read` with `CaptureLocations` for direct extraction — no +/// `expand()`, no `String` allocation. +fn try_build_short_extract_regex(pattern: &str, replacement: &str) -> Option { + if replacement != "${1}" || !pattern.starts_with('^') || !pattern.ends_with(".*$") { + return None; + } + let short = &pattern[..pattern.len() - 3]; + let re = Regex::new(short).ok()?; + if re.captures_len() != 2 { + return None; + } + Some(re) +} + /// Replaces substring(s) matching a PCRE-like regular expression. 
/// /// The full list of supported features and syntax can be found at @@ -457,6 +475,14 @@ fn _regexp_replace_static_pattern_replace( // with rust ones. let replacement = regex_replace_posix_groups(replacement); + // For anchored patterns like ^...(capture)....*$, build a shorter + // regex and use captures_read for direct extraction. + let short_re = if limit == 1 { + try_build_short_extract_regex(&pattern, &replacement) + } else { + None + }; + let string_array_type = args[0].data_type(); match string_array_type { DataType::Utf8 | DataType::LargeUtf8 => { @@ -473,13 +499,37 @@ fn _regexp_replace_static_pattern_replace( let mut new_offsets = BufferBuilder::::new(string_array.len() + 1); new_offsets.append(T::zero()); - string_array.iter().for_each(|val| { - if let Some(val) = val { - let result = re.replacen(val, limit, replacement.as_str()); - vals.append_slice(result.as_bytes()); - } - new_offsets.append(T::from_usize(vals.len()).unwrap()); - }); + if let Some(ref short_re) = short_re { + let mut locs = short_re.capture_locations(); + string_array.iter().for_each(|val| { + if let Some(val) = val { + if short_re.captures_read(&mut locs, val).is_some() { + let match_end = locs.get(0).unwrap().1; + if memchr(b'\n', val[match_end..].as_bytes()).is_none() { + if let Some((start, end)) = locs.get(1) { + vals.append_slice(&val.as_bytes()[start..end]); + } + } else { + // Newline in remainder: .*$ wouldn't match without 's' flag + let result = + re.replacen(val, limit, replacement.as_str()); + vals.append_slice(result.as_bytes()); + } + } else { + vals.append_slice(val.as_bytes()); + } + } + new_offsets.append(T::from_usize(vals.len()).unwrap()); + }); + } else { + string_array.iter().for_each(|val| { + if let Some(val) = val { + let result = re.replacen(val, limit, replacement.as_str()); + vals.append_slice(result.as_bytes()); + } + new_offsets.append(T::from_usize(vals.len()).unwrap()); + }); + } let data = ArrayDataBuilder::new(GenericStringArray::::DATA_TYPE) 
.len(string_array.len()) @@ -494,12 +544,39 @@ fn _regexp_replace_static_pattern_replace( let mut builder = StringViewBuilder::with_capacity(string_view_array.len()); - for val in string_view_array.iter() { - if let Some(val) = val { - let result = re.replacen(val, limit, replacement.as_str()); - builder.append_value(result); - } else { - builder.append_null(); + if let Some(ref short_re) = short_re { + let mut locs = short_re.capture_locations(); + for val in string_view_array.iter() { + if let Some(val) = val { + if short_re.captures_read(&mut locs, val).is_some() { + let match_end = locs.get(0).unwrap().1; + if memchr(b'\n', val[match_end..].as_bytes()).is_none() { + if let Some((start, end)) = locs.get(1) { + builder.append_value(&val[start..end]); + } else { + builder.append_value(""); + } + } else { + // Newline in remainder: .*$ wouldn't match without 's' flag + let result = + re.replacen(val, limit, replacement.as_str()); + builder.append_value(result); + } + } else { + builder.append_value(val); + } + } else { + builder.append_null(); + } + } + } else { + for val in string_view_array.iter() { + if let Some(val) = val { + let result = re.replacen(val, limit, replacement.as_str()); + builder.append_value(result); + } else { + builder.append_null(); + } } } diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt index a2eccfce5f695..6e85a87c613f9 100644 --- a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt +++ b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt @@ -128,6 +128,21 @@ from (values ('a'), ('b')) as tbl(col); NULL NULL NULL NULL NULL NULL +# Extract domain from URL using anchored pattern with trailing .* +# This tests that the full URL suffix is replaced, not just the matched prefix +query T +SELECT regexp_replace(url, '^https?://(?:www\.)?([^/]+)/.*$', '\1') FROM (VALUES + ('https://www.example.com/path/to/page?q=1'), + 
('http://test.org/foo/bar'), + ('https://example.com/'), + ('not-a-url') +) AS t(url); +---- +example.com +test.org +example.com +not-a-url + # If the overall pattern matches but capture group 1 does not participate, # regexp_replace(..., '\1') should substitute the empty string, not keep # the original input. From b8b5d0e9593d6b63edcf9693868723becd3c11cf Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 8 Apr 2026 15:53:07 -0400 Subject: [PATCH 2/4] fix clippy --- datafusion/functions/src/regex/regexpreplace.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 6a14743f7cddc..d865b97ae1d27 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -505,7 +505,7 @@ fn _regexp_replace_static_pattern_replace( if let Some(val) = val { if short_re.captures_read(&mut locs, val).is_some() { let match_end = locs.get(0).unwrap().1; - if memchr(b'\n', val[match_end..].as_bytes()).is_none() { + if memchr(b'\n', &val.as_bytes()[match_end..]).is_none() { if let Some((start, end)) = locs.get(1) { vals.append_slice(&val.as_bytes()[start..end]); } @@ -550,7 +550,7 @@ fn _regexp_replace_static_pattern_replace( if let Some(val) = val { if short_re.captures_read(&mut locs, val).is_some() { let match_end = locs.get(0).unwrap().1; - if memchr(b'\n', val[match_end..].as_bytes()).is_none() { + if memchr(b'\n', &val.as_bytes()[match_end..]).is_none() { if let Some((start, end)) = locs.get(1) { builder.append_value(&val[start..end]); } else { From 114eec61071647e7ab7546817a5dd6e1dcdbfcb3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 8 Apr 2026 16:12:09 -0400 Subject: [PATCH 3/4] Add test --- .../test_files/regexp/regexp_replace.slt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt 
b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt index 6e85a87c613f9..99e0b10430186 100644 --- a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt +++ b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt @@ -143,6 +143,20 @@ test.org example.com not-a-url +# More than one capture group should disable the short-regex fast path. +# This still uses replacement \1, but captures_len() will be > 2, so the +# implementation must fall back to the normal regexp_replace path. +query T +SELECT regexp_replace(url, '^https?://((www\.)?([^/]+))/.*$', '\1') FROM (VALUES + ('https://www.example.com/path/to/page?q=1'), + ('http://test.org/foo/bar'), + ('not-a-url') +) AS t(url); +---- +www.example.com +test.org +not-a-url + # If the overall pattern matches but capture group 1 does not participate, # regexp_replace(..., '\1') should substitute the empty string, not keep # the original input. From 2117a33e92e7db68c4753dee0de060dc6b59334c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 8 Apr 2026 16:35:45 -0400 Subject: [PATCH 4/4] Consolidate special case regexp_replace logic --- .../functions/src/regex/regexpreplace.rs | 182 ++++++++++-------- 1 file changed, 100 insertions(+), 82 deletions(-) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index d865b97ae1d27..40c74f0a6ab42 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -42,7 +42,8 @@ use datafusion_expr::{ Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, }; use datafusion_macros::user_doc; -use regex::Regex; +use regex::{CaptureLocations, Regex}; +use std::borrow::Cow; use std::collections::HashMap; use std::sync::{Arc, LazyLock}; @@ -201,6 +202,80 @@ fn regex_replace_posix_groups(replacement: &str) -> String { .into_owned() } +struct ShortRegex { + /// Shortened anchored regex used to extract capture group 1 directly.
+ /// See [`try_build_short_extract_regex`] for details. + short_re: Regex, + /// Reusable capture locations for `short_re` to avoid per-row allocation. + locs: CaptureLocations, +} + +/// Holds the normal compiled regex together with the optional fast path used +/// for `regexp_replace(str, '^...(capture)...*$', '\1')`. +struct OptimizedRegex { + /// Full regex used for the normal replacement path and as a correctness fallback. + re: Regex, + /// Precomputed state for the direct-extraction fast path, when applicable. + short_re: Option, +} + +impl OptimizedRegex { + /// Builds any reusable state needed by the extraction fast path. + /// + /// The fast path is only enabled for single replacements where the pattern + /// and replacement satisfy [`try_build_short_extract_regex`]. + fn new(re: Regex, limit: usize, pattern: &str, replacement: &str) -> Self { + let short_re = if limit == 1 { + try_build_short_extract_regex(pattern, replacement) + } else { + None + }; + + let short_re = short_re.map(|short_re| { + let locs = short_re.capture_locations(); + ShortRegex { short_re, locs } + }); + + Self { re, short_re } + } + + /// Applies the direct-extraction fast path when it preserves the result of + /// `Regex::replacen`; otherwise falls back to the full regex replacement. + fn replacen<'a>( + &mut self, + val: &'a str, + limit: usize, + replacement: &str, + ) -> Cow<'a, str> { + // If this pattern is not eligible for direct extraction, use the full regex. + let Some(ShortRegex { short_re, locs }) = self.short_re.as_mut() else { + return self.re.replacen(val, limit, replacement); + }; + + // If the shortened regex does not match, the original anchored regex would + // also leave the input unchanged. + if short_re.captures_read(locs, val).is_none() { + return Cow::Borrowed(val); + }; + + // `captures_read` succeeded, so the overall shortened match is present. 
+ let match_end = locs.get(0).unwrap().1; + if memchr(b'\n', &val.as_bytes()[match_end..]).is_some() { + // If there is a newline after the match, we can't use the short + // regex since it won't match across lines. Fall back to the full + // regex replacement. + return self.re.replacen(val, limit, replacement); + }; + // The fast path only applies to `${1}` replacements, so the result is + // either capture group 1 or the empty string if that group did not match. + if let Some((start, end)) = locs.get(1) { + Cow::Borrowed(&val[start..end]) + } else { + Cow::Borrowed("") + } + } +} + /// For anchored patterns like `^...(capture)....*$` where the replacement /// is `\1`, build a shorter regex (stripping trailing `.*$`) and use /// `captures_read` with `CaptureLocations` for direct extraction — no @@ -440,7 +515,7 @@ macro_rules! fetch_string_arg { /// hold a single Regex object for the replace operation. This also speeds /// up the pre-processing time of the replacement string, since it only /// needs to processed once. -fn _regexp_replace_static_pattern_replace( +fn regexp_replace_static_pattern_replace( args: &[ArrayRef], ) -> Result { let array_size = args[0].len(); @@ -475,13 +550,7 @@ fn _regexp_replace_static_pattern_replace( // with rust ones. let replacement = regex_replace_posix_groups(replacement); - // For anchored patterns like ^...(capture)....*$, build a shorter - // regex and use captures_read for direct extraction. 
- let short_re = if limit == 1 { - try_build_short_extract_regex(&pattern, &replacement) - } else { - None - }; + let mut opt_re = OptimizedRegex::new(re, limit, &pattern, &replacement); let string_array_type = args[0].data_type(); match string_array_type { @@ -499,37 +568,13 @@ fn _regexp_replace_static_pattern_replace( let mut new_offsets = BufferBuilder::::new(string_array.len() + 1); new_offsets.append(T::zero()); - if let Some(ref short_re) = short_re { - let mut locs = short_re.capture_locations(); - string_array.iter().for_each(|val| { - if let Some(val) = val { - if short_re.captures_read(&mut locs, val).is_some() { - let match_end = locs.get(0).unwrap().1; - if memchr(b'\n', &val.as_bytes()[match_end..]).is_none() { - if let Some((start, end)) = locs.get(1) { - vals.append_slice(&val.as_bytes()[start..end]); - } - } else { - // Newline in remainder: .*$ wouldn't match without 's' flag - let result = - re.replacen(val, limit, replacement.as_str()); - vals.append_slice(result.as_bytes()); - } - } else { - vals.append_slice(val.as_bytes()); - } - } - new_offsets.append(T::from_usize(vals.len()).unwrap()); - }); - } else { - string_array.iter().for_each(|val| { - if let Some(val) = val { - let result = re.replacen(val, limit, replacement.as_str()); - vals.append_slice(result.as_bytes()); - } - new_offsets.append(T::from_usize(vals.len()).unwrap()); - }); - } + string_array.iter().for_each(|val| { + if let Some(val) = val { + let result = opt_re.replacen(val, limit, replacement.as_str()); + vals.append_slice(result.as_bytes()); + } + new_offsets.append(T::from_usize(vals.len()).unwrap()); + }); let data = ArrayDataBuilder::new(GenericStringArray::::DATA_TYPE) .len(string_array.len()) @@ -544,39 +589,12 @@ fn _regexp_replace_static_pattern_replace( let mut builder = StringViewBuilder::with_capacity(string_view_array.len()); - if let Some(ref short_re) = short_re { - let mut locs = short_re.capture_locations(); - for val in string_view_array.iter() { - if let 
Some(val) = val { - if short_re.captures_read(&mut locs, val).is_some() { - let match_end = locs.get(0).unwrap().1; - if memchr(b'\n', &val.as_bytes()[match_end..]).is_none() { - if let Some((start, end)) = locs.get(1) { - builder.append_value(&val[start..end]); - } else { - builder.append_value(""); - } - } else { - // Newline in remainder: .*$ wouldn't match without 's' flag - let result = - re.replacen(val, limit, replacement.as_str()); - builder.append_value(result); - } - } else { - builder.append_value(val); - } - } else { - builder.append_null(); - } - } - } else { - for val in string_view_array.iter() { - if let Some(val) = val { - let result = re.replacen(val, limit, replacement.as_str()); - builder.append_value(result); - } else { - builder.append_null(); - } + for val in string_view_array.iter() { + if let Some(val) = val { + let result = opt_re.replacen(val, limit, replacement.as_str()); + builder.append_value(result.as_ref()); + } else { + builder.append_null(); } } @@ -653,7 +671,7 @@ fn specialize_regexp_replace( arg.to_array(expansion_len) }) .collect::>>()?; - _regexp_replace_static_pattern_replace::(&args) + regexp_replace_static_pattern_replace::(&args) } // If there are no specialized implementations, we'll fall back to the @@ -787,7 +805,7 @@ mod tests { let replacements = <$T>::from(replacement); let expected = <$T>::from(expected); - let re = _regexp_replace_static_pattern_replace::<$O>(&[ + let re = regexp_replace_static_pattern_replace::<$O>(&[ Arc::new(values), Arc::new(patterns), Arc::new(replacements), @@ -832,7 +850,7 @@ mod tests { let flags = StringArray::from(vec!["i"; 5]); let expected = <$T>::from(expected); - let re = _regexp_replace_static_pattern_replace::<$O>(&[ + let re = regexp_replace_static_pattern_replace::<$O>(&[ Arc::new(values), Arc::new(patterns), Arc::new(replacements), @@ -864,7 +882,7 @@ mod tests { let replacements = StringArray::from(vec!["foo"; 5]); let expected = StringArray::from(vec![None::<&str>; 5]); - let 
re = _regexp_replace_static_pattern_replace::(&[ + let re = regexp_replace_static_pattern_replace::(&[ Arc::new(values), Arc::new(patterns), Arc::new(replacements), @@ -881,7 +899,7 @@ mod tests { let replacements = StringArray::from(Vec::>::new()); let expected = StringArray::from(Vec::>::new()); - let re = _regexp_replace_static_pattern_replace::(&[ + let re = regexp_replace_static_pattern_replace::(&[ Arc::new(values), Arc::new(patterns), Arc::new(replacements), @@ -899,7 +917,7 @@ mod tests { let flags = StringArray::from(vec![None::<&str>; 5]); let expected = StringArray::from(vec![None::<&str>; 5]); - let re = _regexp_replace_static_pattern_replace::(&[ + let re = regexp_replace_static_pattern_replace::(&[ Arc::new(values), Arc::new(patterns), Arc::new(replacements), @@ -918,7 +936,7 @@ mod tests { let patterns = StringArray::from(vec!["["; 5]); let replacements = StringArray::from(vec!["foo"; 5]); - let re = _regexp_replace_static_pattern_replace::(&[ + let re = regexp_replace_static_pattern_replace::(&[ Arc::new(values), Arc::new(patterns), Arc::new(replacements), @@ -955,7 +973,7 @@ mod tests { Some("c"), ]); - let re = _regexp_replace_static_pattern_replace::(&[ + let re = regexp_replace_static_pattern_replace::(&[ Arc::new(values), Arc::new(patterns), Arc::new(replacements), @@ -983,7 +1001,7 @@ mod tests { let replacements = StringArray::from(vec!["foo"; 1]); let expected = StringArray::from(vec![Some("b"), None, Some("foo"), None, None]); - let re = _regexp_replace_static_pattern_replace::(&[ + let re = regexp_replace_static_pattern_replace::(&[ Arc::new(values), Arc::new(patterns), Arc::new(replacements),