diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 176f704a65f35..fff1446322a63 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -16,6 +16,8 @@ // under the License. //! Regex expressions +use memchr::memchr; + use arrow::array::ArrayDataBuilder; use arrow::array::BufferBuilder; use arrow::array::GenericStringArray; @@ -199,6 +201,24 @@ fn regex_replace_posix_groups(replacement: &str) -> String { .into_owned() } +/// For anchored patterns like `^...(capture)....*$` where the replacement +/// is `\1`, build a shorter regex (stripping trailing `.*$`) and use +/// `captures_read` with `CaptureLocations` for direct extraction — no +/// `expand()`, no `String` allocation. +/// This pattern appears in ClickBench Q28: which uses a regexp like +/// `^https?://(?:www\.)?([^/]+)/.*$` +fn try_build_short_extract_regex(pattern: &str, replacement: &str) -> Option { + if replacement != "${1}" || !pattern.starts_with('^') || !pattern.ends_with(".*$") { + return None; + } + let short = &pattern[..pattern.len() - 3]; + let re = Regex::new(short).ok()?; + if re.captures_len() != 2 { + return None; + } + Some(re) +} + /// Replaces substring(s) matching a PCRE-like regular expression. /// /// The full list of supported features and syntax can be found at @@ -457,6 +477,14 @@ fn _regexp_replace_static_pattern_replace( // with rust ones. let replacement = regex_replace_posix_groups(replacement); + // For anchored patterns like ^...(capture)....*$, build a shorter + // regex and use captures_read for direct extraction. + let short_re = if limit == 1 { + try_build_short_extract_regex(&pattern, &replacement) + } else { + None + }; + let string_array_type = args[0].data_type(); match string_array_type { DataType::Utf8 | DataType::LargeUtf8 => { @@ -473,13 +501,37 @@ fn _regexp_replace_static_pattern_replace( let mut new_offsets = BufferBuilder::::new(string_array.len() + 1); new_offsets.append(T::zero()); - string_array.iter().for_each(|val| { - if let Some(val) = val { - let result = re.replacen(val, limit, replacement.as_str()); - vals.append_slice(result.as_bytes()); - } - new_offsets.append(T::from_usize(vals.len()).unwrap()); - }); + if let Some(ref short_re) = short_re { + let mut locs = short_re.capture_locations(); + string_array.iter().for_each(|val| { + if let Some(val) = val { + if short_re.captures_read(&mut locs, val).is_some() { + let match_end = locs.get(0).unwrap().1; + if memchr(b'\n', &val.as_bytes()[match_end..]).is_none() { + if let Some((start, end)) = locs.get(1) { + vals.append_slice(&val.as_bytes()[start..end]); + } + } else { + // Newline in remainder: .*$ wouldn't match without 's' flag + let result = + re.replacen(val, limit, replacement.as_str()); + vals.append_slice(result.as_bytes()); + } + } else { + vals.append_slice(val.as_bytes()); + } + } + new_offsets.append(T::from_usize(vals.len()).unwrap()); + }); + } else { + string_array.iter().for_each(|val| { + if let Some(val) = val { + let result = re.replacen(val, limit, replacement.as_str()); + vals.append_slice(result.as_bytes()); + } + new_offsets.append(T::from_usize(vals.len()).unwrap()); + }); + } let data = ArrayDataBuilder::new(GenericStringArray::::DATA_TYPE) .len(string_array.len()) @@ -494,12 +546,39 @@ fn _regexp_replace_static_pattern_replace( let mut builder = StringViewBuilder::with_capacity(string_view_array.len()); - for val in string_view_array.iter() { - if let Some(val) = val { - let result = re.replacen(val, limit, replacement.as_str()); - builder.append_value(result); - } else { - builder.append_null(); + if let Some(ref short_re) = short_re { + let mut locs = short_re.capture_locations(); + for val in string_view_array.iter() { + if let Some(val) = val { + if short_re.captures_read(&mut locs, val).is_some() { + let match_end = locs.get(0).unwrap().1; + if memchr(b'\n', &val.as_bytes()[match_end..]).is_none() { + if let Some((start, end)) = locs.get(1) { + builder.append_value(&val[start..end]); + } else { + builder.append_value(""); + } + } else { + // Newline in remainder: .*$ wouldn't match without 's' flag + let result = + re.replacen(val, limit, replacement.as_str()); + builder.append_value(result); + } + } else { + builder.append_value(val); + } + } else { + builder.append_null(); + } + } + } else { + for val in string_view_array.iter() { + if let Some(val) = val { + let result = re.replacen(val, limit, replacement.as_str()); + builder.append_value(result); + } else { + builder.append_null(); + } } } diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt index a2eccfce5f695..99e0b10430186 100644 --- a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt +++ b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt @@ -128,6 +128,35 @@ from (values ('a'), ('b')) as tbl(col); NULL NULL NULL NULL NULL NULL +# Extract domain from URL using anchored pattern with trailing .* +# This tests that the full URL suffix is replaced, not just the matched prefix +query T +SELECT regexp_replace(url, '^https?://(?:www\.)?([^/]+)/.*$', '\1') FROM (VALUES + ('https://www.example.com/path/to/page?q=1'), + ('http://test.org/foo/bar'), + ('https://example.com/'), + ('not-a-url') +) AS t(url); +---- +example.com +test.org +example.com +not-a-url + +# More than one capture group should disable the short-regex fast path. +# This still uses replacement \1, but captures_len() will be > 2, so the +# implementation must fall back to the normal regexp_replace path. +query T +SELECT regexp_replace(url, '^https?://((www\.)?([^/]+))/.*$', '\1') FROM (VALUES + ('https://www.example.com/path/to/page?q=1'), + ('http://test.org/foo/bar'), + ('not-a-url') +) AS t(url); +---- +www.example.com +test.org +not-a-url + # If the overall pattern matches but capture group 1 does not participate, # regexp_replace(..., '\1') should substitute the empty string, not keep # the original input.