From f16a42786ace612bf04a8e63bd615bd57acf3a35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?= <danielheres@gmail.com>
Date: Wed, 8 Apr 2026 09:19:56 +0200
Subject: [PATCH 1/4] Optimize regexp_replace by stripping trailing .* from
 anchored patterns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For anchored patterns like `^...(capture)....*$` where the replacement
is `\1`, build a shorter regex (stripping trailing `.*$`) and use
`captures_read` with `CaptureLocations` for direct extraction — no
`expand()`, no `String` allocation. 2.4x improvement.
---
 .../functions/src/regex/regexpreplace.rs      | 103 +++++++++++++++---
 .../test_files/regexp/regexp_replace.slt      |  15 +++
 2 files changed, 105 insertions(+), 13 deletions(-)
diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs
index 176f704a65f35..6a14743f7cddc 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -16,6 +16,8 @@
 // under the License.
 
 //! Regex expressions
+use memchr::memchr;
+
 use arrow::array::ArrayDataBuilder;
 use arrow::array::BufferBuilder;
 use arrow::array::GenericStringArray;
@@ -199,6 +201,22 @@ fn regex_replace_posix_groups(replacement: &str) -> String {
         .into_owned()
 }
 
+/// For anchored patterns like `^...(capture)....*$` where the replacement
+/// is `\1`, build a shorter regex (stripping trailing `.*$`) and use
+/// `captures_read` with `CaptureLocations` for direct extraction — no
+/// `expand()`, no `String` allocation.
+fn try_build_short_extract_regex(pattern: &str, replacement: &str) -> Option<Regex> {
+    if replacement != "${1}" || !pattern.starts_with('^') || !pattern.ends_with(".*$") {
+        return None;
+    }
+    let short = &pattern[..pattern.len() - 3];
+    let re = Regex::new(short).ok()?;
+    // Require exactly one capture group and no `|`: with alternation such as
+    // `^(a)|b.*$` the pattern is not `prefix.*$`, so the fast path would be wrong.
+    let extractable = re.captures_len() == 2 && !pattern.contains('|');
+    extractable.then_some(re)
+}
+
 /// Replaces substring(s) matching a PCRE-like regular expression.
 ///
 /// The full list of supported features and syntax can be found at
@@ -457,6 +475,14 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
     // with rust ones.
     let replacement = regex_replace_posix_groups(replacement);
 
+    // For anchored patterns like ^...(capture)....*$, build a shorter
+    // regex and use captures_read for direct extraction.
+    let short_re = if limit == 1 {
+        try_build_short_extract_regex(&pattern, &replacement)
+    } else {
+        None
+    };
+
     let string_array_type = args[0].data_type();
     match string_array_type {
         DataType::Utf8 | DataType::LargeUtf8 => {
@@ -473,13 +499,37 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
             let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
             new_offsets.append(T::zero());
 
-            string_array.iter().for_each(|val| {
-                if let Some(val) = val {
-                    let result = re.replacen(val, limit, replacement.as_str());
-                    vals.append_slice(result.as_bytes());
-                }
-                new_offsets.append(T::from_usize(vals.len()).unwrap());
-            });
+            if let Some(ref short_re) = short_re {
+                let mut locs = short_re.capture_locations();
+                string_array.iter().for_each(|val| {
+                    if let Some(val) = val {
+                        if short_re.captures_read(&mut locs, val).is_some() {
+                            let match_end = locs.get(0).unwrap().1;
+                            if memchr(b'\n', val[match_end..].as_bytes()).is_none() {
+                                if let Some((start, end)) = locs.get(1) {
+                                    vals.append_slice(&val.as_bytes()[start..end]);
+                                }
+                            } else {
+                                // Newline in remainder: .*$ wouldn't match without 's' flag
+                                let result =
+                                    re.replacen(val, limit, replacement.as_str());
+                                vals.append_slice(result.as_bytes());
+                            }
+                        } else {
+                            vals.append_slice(val.as_bytes());
+                        }
+                    }
+                    new_offsets.append(T::from_usize(vals.len()).unwrap());
+                });
+            } else {
+                string_array.iter().for_each(|val| {
+                    if let Some(val) = val {
+                        let result = re.replacen(val, limit, replacement.as_str());
+                        vals.append_slice(result.as_bytes());
+                    }
+                    new_offsets.append(T::from_usize(vals.len()).unwrap());
+                });
+            }
 
             let data = ArrayDataBuilder::new(GenericStringArray::<T>::DATA_TYPE)
                 .len(string_array.len())
@@ -494,12 +544,39 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
 
             let mut builder = StringViewBuilder::with_capacity(string_view_array.len());
 
-            for val in string_view_array.iter() {
-                if let Some(val) = val {
-                    let result = re.replacen(val, limit, replacement.as_str());
-                    builder.append_value(result);
-                } else {
-                    builder.append_null();
+            if let Some(ref short_re) = short_re {
+                let mut locs = short_re.capture_locations();
+                for val in string_view_array.iter() {
+                    if let Some(val) = val {
+                        if short_re.captures_read(&mut locs, val).is_some() {
+                            let match_end = locs.get(0).unwrap().1;
+                            if memchr(b'\n', val[match_end..].as_bytes()).is_none() {
+                                if let Some((start, end)) = locs.get(1) {
+                                    builder.append_value(&val[start..end]);
+                                } else {
+                                    builder.append_value("");
+                                }
+                            } else {
+                                // Newline in remainder: .*$ wouldn't match without 's' flag
+                                let result =
+                                    re.replacen(val, limit, replacement.as_str());
+                                builder.append_value(result);
+                            }
+                        } else {
+                            builder.append_value(val);
+                        }
+                    } else {
+                        builder.append_null();
+                    }
+                }
+            } else {
+                for val in string_view_array.iter() {
+                    if let Some(val) = val {
+                        let result = re.replacen(val, limit, replacement.as_str());
+                        builder.append_value(result);
+                    } else {
+                        builder.append_null();
+                    }
                 }
             }
 
diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt
index a2eccfce5f695..6e85a87c613f9 100644
--- a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt
+++ b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt
@@ -128,6 +128,21 @@ from (values ('a'), ('b')) as tbl(col);
 NULL NULL NULL
 NULL NULL NULL
 
+# Extract domain from URL using anchored pattern with trailing .*
+# This tests that the full URL suffix is replaced, not just the matched prefix
+query T
+SELECT regexp_replace(url, '^https?://(?:www\.)?([^/]+)/.*$', '\1') FROM (VALUES
+    ('https://www.example.com/path/to/page?q=1'),
+    ('http://test.org/foo/bar'),
+    ('https://example.com/'),
+    ('not-a-url')
+) AS t(url);
+----
+example.com
+test.org
+example.com
+not-a-url
+
 # If the overall pattern matches but capture group 1 does not participate,
 # regexp_replace(..., '\1') should substitute the empty string, not keep
 # the original input.

From b8b5d0e9593d6b63edcf9693868723becd3c11cf Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Wed, 8 Apr 2026 15:53:07 -0400
Subject: [PATCH 2/4] fix clippy

---
 datafusion/functions/src/regex/regexpreplace.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs
index 6a14743f7cddc..d865b97ae1d27 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -505,7 +505,7 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
                     if let Some(val) = val {
                         if short_re.captures_read(&mut locs, val).is_some() {
                             let match_end = locs.get(0).unwrap().1;
-                            if memchr(b'\n', val[match_end..].as_bytes()).is_none() {
+                            if memchr(b'\n', &val.as_bytes()[match_end..]).is_none() {
                                 if let Some((start, end)) = locs.get(1) {
                                     vals.append_slice(&val.as_bytes()[start..end]);
                                 }
@@ -550,7 +550,7 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
                     if let Some(val) = val {
                         if short_re.captures_read(&mut locs, val).is_some() {
                             let match_end = locs.get(0).unwrap().1;
-                            if memchr(b'\n', val[match_end..].as_bytes()).is_none() {
+                            if memchr(b'\n', &val.as_bytes()[match_end..]).is_none() {
                                 if let Some((start, end)) = locs.get(1) {
                                     builder.append_value(&val[start..end]);
                                 } else {

From 114eec61071647e7ab7546817a5dd6e1dcdbfcb3 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Wed, 8 Apr 2026 16:12:09 -0400
Subject: [PATCH 3/4] Add test

---
 .../test_files/regexp/regexp_replace.slt           | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt
index 6e85a87c613f9..99e0b10430186 100644
--- a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt
+++ b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt
@@ -143,6 +143,20 @@ test.org
 example.com
 not-a-url
 
+# More than one capture group should disable the short-regex fast path.
+# This still uses replacement \1, but captures_len() will be > 2, so the
+# implementation must fall back to the normal regexp_replace path.
+query T
+SELECT regexp_replace(url, '^https?://((www\.)?([^/]+))/.*$', '\1') FROM (VALUES
+    ('https://www.example.com/path/to/page?q=1'),
+    ('http://test.org/foo/bar'),
+    ('not-a-url')
+) AS t(url);
+----
+www.example.com
+test.org
+not-a-url
+
 # If the overall pattern matches but capture group 1 does not participate,
 # regexp_replace(..., '\1') should substitute the empty string, not keep
 # the original input.

From abd37d12eefe53725a674da8e2f20f69a0d0313a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?= <danielheres@gmail.com>
Date: Thu, 9 Apr 2026 09:32:13 +0200
Subject: [PATCH 4/4] Update datafusion/functions/src/regex/regexpreplace.rs

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
---
 datafusion/functions/src/regex/regexpreplace.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs
index d865b97ae1d27..fff1446322a63 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -205,6 +205,8 @@ fn regex_replace_posix_groups(replacement: &str) -> String {
 /// is `\1`, build a shorter regex (stripping trailing `.*$`) and use
 /// `captures_read` with `CaptureLocations` for direct extraction — no
 /// `expand()`, no `String` allocation.
+/// This pattern appears in ClickBench Q28, which uses a regexp like
+/// `^https?://(?:www\.)?([^/]+)/.*$`
 fn try_build_short_extract_regex(pattern: &str, replacement: &str) -> Option<Regex> {
     if replacement != "${1}" || !pattern.starts_with('^') || !pattern.ends_with(".*$") {
         return None;