diff --git a/CHANGELOG.md b/CHANGELOG.md index 4589a08..d688b01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,21 @@ All notable changes to this project will be documented in this file. It uses the [Semantic Versioning]: https://semver.org/spec/v2.0.0.html "Semantic Versioning 2.0.0" -## [v0.1.2] — Unreleased +## [v0.2.0] — Unreleased -Fixed memory safety in C++ wrapper. +### ⚡ Improvements + +* Added `re2extractallgroupshorizontal`, `re2extractallgroupsvertical`, + `re2regexpquotemeta`, and `re2splitbyregexp` (CH-compatible). Each gets a + `bytea` overload alongside `text`. +* Fixed memory safety in C++ wrapper. + +### 📔 Notes + +* Run `ALTER EXTENSION re2 UPDATE TO '0.2'` to expose the new functions on + existing databases. - [v0.1.2]: https://github.com/clickhouse/pg_re2/compare/v0.1.1...v0.1.2 + [v0.2.0]: https://github.com/clickhouse/pg_re2/compare/v0.1.1...v0.2.0 ## [v0.1.1] — 2026-04-16 diff --git a/META.json b/META.json index 7f8cc4a..2309ba7 100644 --- a/META.json +++ b/META.json @@ -1,7 +1,7 @@ { "name": "re2", "abstract": "ClickHouse-compatible regex functions using RE2", - "version": "0.1.2", + "version": "0.2.0", "maintainer": "Philip Dubé", "license": "postgresql", "provides": { @@ -9,7 +9,7 @@ "abstract": "ClickHouse-compatible regex functions using RE2", "docfile": "doc/re2.md", "file": "re2.control", - "version": "0.1.2" + "version": "0.2.0" } }, "prereqs": { diff --git a/Makefile b/Makefile index 478a252..e5f8b43 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ EXTVERSION = $(shell grep -m 1 'default_version' re2.control | \ DISTVERSION = $(shell grep -m 1 '^[[:space:]]\{2\}"version":' META.json | \ sed -e 's/[[:space:]]*"version":[[:space:]]*"\([^"]*\)",\{0,1\}/\1/') -DATA = sql/$(EXTENSION)--$(EXTVERSION).sql +DATA = $(wildcard sql/$(EXTENSION)--*.sql) MODULE_big = $(EXTENSION) OBJS = src/pg_re2.o src/re2_cache.o src/re2_wrapper.o diff --git a/doc/re2.md b/doc/re2.md index 6e09774..52ed957 100644 --- a/doc/re2.md +++ b/doc/re2.md 
@@ -1,4 +1,4 @@ -re2 0.1.2 +re2 0.2.0 ========= ## Synopsis @@ -158,6 +158,112 @@ returns an empty array. **ClickHouse equivalent: [extractGroups](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#extractGroups)** +### `re2extractallgroupsvertical()` ### + +Matches all non-overlapping occurrences of `:pattern` and returns a 2D array +where each inner array contains the capturing groups for one match. + +**Syntax** + +```sql +SELECT re2extractallgroupsvertical( :haystack, :pattern ); +``` + +**Parameters** + +`:haystack` +: Input string to extract from. `TEXT` or `BYTEA` + +`:pattern` +: Regular expression with at least one capturing group. `TEXT` + +**Returns `text[][]` or `bytea[][]`** + +Two-dimensional array of capturing groups, one row per match. If no matches +are found, returns an empty array. + +**ClickHouse equivalent: [extractAllGroupsVertical](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#extractAllGroupsVertical)** + +### `re2extractallgroupshorizontal()` ### + +Matches all non-overlapping occurrences of `:pattern` and returns a 2D array +where each inner array contains all matches for one capturing group. + +**Syntax** + +```sql +SELECT re2extractallgroupshorizontal( :haystack, :pattern ); +``` + +**Parameters** + +`:haystack` +: Input string to extract from. `TEXT` or `BYTEA` + +`:pattern` +: Regular expression with at least one capturing group. `TEXT` + +**Returns `text[][]` or `bytea[][]`** + +Two-dimensional array of matches, one row per capturing group. If no matches +are found, returns an empty array (ClickHouse returns an array of empty +arrays, one per group; PostgreSQL cannot represent that shape, so empty +collapses to a flat empty array). + +**ClickHouse equivalent: [extractAllGroupsHorizontal](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#extractAllGroupsHorizontal)** + +### `re2regexpquotemeta()` ### + +Escapes regex metacharacters with a backslash. 
Escaped characters: `\0`, `\\`, +`|`, `(`, `)`, `^`, `$`, `.`, `[`, `]`, `?`, `*`, `+`, `{`, `:`, `-`. + +**Syntax** + +```sql +SELECT re2regexpquotemeta( :input ); +``` + +**Parameters** + +`:input` +: String to escape. `TEXT` or `BYTEA` + +**Returns `TEXT` or `BYTEA`** matching input type. + +**ClickHouse equivalent: [regexpQuoteMeta](https://clickhouse.com/docs/sql-reference/functions/string-functions#regexpquotemeta)** + +### `re2splitbyregexp()` ### + +Splits `:haystack` into substrings using `:pattern` as a separator. If +`:pattern` is empty, the haystack is split into individual bytes. If +`:max_substrings > 0`, returns at most that many substrings (extras are +dropped). + +**Syntax** + +```sql +SELECT re2splitbyregexp( :haystack, :pattern, :max_substrings DEFAULT 0 ); +``` + +**Parameters** + +`:haystack` +: Input string to split. `TEXT` or `BYTEA` + +`:pattern` +: Regular expression separator. `TEXT` + +`:max_substrings` +: Optional cap on the number of returned substrings. `0` means unlimited. + `INTEGER` + +**Returns `text[]` or `bytea[]`** matching haystack type. Note: argument order +is `(haystack, pattern)` to match the pg_re2 convention; ClickHouse uses +`splitByRegexp(pattern, haystack[, max_substrings])`. Zero-length matches are +treated as no-match (matching ClickHouse behavior).
+ +**ClickHouse equivalent: [splitByRegexp](https://clickhouse.com/docs/sql-reference/functions/splitting-merging-functions#splitByRegexp)** + ### `re2replaceregexpone()` ### Replaces the first occurrence of the substring matching the regular expression diff --git a/re2.control b/re2.control index 5867358..876c267 100644 --- a/re2.control +++ b/re2.control @@ -1,6 +1,6 @@ # re2 extension comment = 'ClickHouse-compatible regex functions using RE2' -default_version = '0.1' +default_version = '0.2' module_pathname = 're2' relocatable = true trusted = true diff --git a/sql/re2--0.1--0.2.sql b/sql/re2--0.1--0.2.sql new file mode 100644 index 0000000..256e047 --- /dev/null +++ b/sql/re2--0.1--0.2.sql @@ -0,0 +1,33 @@ +\echo Use "ALTER EXTENSION re2 UPDATE TO '0.2'" to load this file. \quit + +CREATE FUNCTION re2extractallgroupshorizontal(text, text) RETURNS text[] +AS 'MODULE_PATHNAME', 'pgre2_extractallgroupshorizontal' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extractallgroupsvertical(text, text) RETURNS text[] +AS 'MODULE_PATHNAME', 'pgre2_extractallgroupsvertical' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2regexpquotemeta(text) RETURNS text +AS 'MODULE_PATHNAME', 'pgre2_regexpquotemeta' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2splitbyregexp(text, text, int DEFAULT 0) RETURNS text[] +AS 'MODULE_PATHNAME', 'pgre2_splitbyregexp' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extractallgroupshorizontal(bytea, text) RETURNS bytea[] +AS 'MODULE_PATHNAME', 'pgre2_extractallgroupshorizontal_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extractallgroupsvertical(bytea, text) RETURNS bytea[] +AS 'MODULE_PATHNAME', 'pgre2_extractallgroupsvertical_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2regexpquotemeta(bytea) RETURNS bytea +AS 'MODULE_PATHNAME', 'pgre2_regexpquotemeta_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE 
FUNCTION re2splitbyregexp(bytea, text, int DEFAULT 0) RETURNS bytea[] +AS 'MODULE_PATHNAME', 'pgre2_splitbyregexp_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; diff --git a/sql/re2--0.2.sql b/sql/re2--0.2.sql new file mode 100644 index 0000000..b23d46c --- /dev/null +++ b/sql/re2--0.2.sql @@ -0,0 +1,131 @@ +\echo Use "CREATE EXTENSION re2" to load this file. \quit + +CREATE FUNCTION re2match(text, text) RETURNS boolean +AS 'MODULE_PATHNAME', 'pgre2_match' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extract(text, text) RETURNS text +AS 'MODULE_PATHNAME', 'pgre2_extract' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extractall(text, text) RETURNS text[] +AS 'MODULE_PATHNAME', 'pgre2_extractall' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2regexpextract(text, text, int DEFAULT 1) RETURNS text +AS 'MODULE_PATHNAME', 'pgre2_regexpextract' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extractgroups(text, text) RETURNS text[] +AS 'MODULE_PATHNAME', 'pgre2_extractgroups' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extractallgroupshorizontal(text, text) RETURNS text[] +AS 'MODULE_PATHNAME', 'pgre2_extractallgroupshorizontal' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extractallgroupsvertical(text, text) RETURNS text[] +AS 'MODULE_PATHNAME', 'pgre2_extractallgroupsvertical' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2regexpquotemeta(text) RETURNS text +AS 'MODULE_PATHNAME', 'pgre2_regexpquotemeta' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2splitbyregexp(text, text, int DEFAULT 0) RETURNS text[] +AS 'MODULE_PATHNAME', 'pgre2_splitbyregexp' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2replaceregexpone(text, text, text) RETURNS text +AS 'MODULE_PATHNAME', 'pgre2_replaceregexpone' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2replaceregexpall(text, text, text) 
RETURNS text +AS 'MODULE_PATHNAME', 'pgre2_replaceregexpall' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2countmatches(text, text) RETURNS integer +AS 'MODULE_PATHNAME', 'pgre2_countmatches' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2countmatchescaseinsensitive(text, text) RETURNS integer +AS 'MODULE_PATHNAME', 'pgre2_countmatchescaseinsensitive' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2multimatchany(text, VARIADIC text[]) RETURNS boolean +AS 'MODULE_PATHNAME', 'pgre2_multimatchany' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2multimatchanyindex(text, VARIADIC text[]) RETURNS integer +AS 'MODULE_PATHNAME', 'pgre2_multimatchanyindex' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2multimatchallindices(text, VARIADIC text[]) RETURNS integer[] +AS 'MODULE_PATHNAME', 'pgre2_multimatchallindices' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- bytea overloads (haystack can contain \0 bytes) + +CREATE FUNCTION re2match(bytea, text) RETURNS boolean +AS 'MODULE_PATHNAME', 'pgre2_match_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extract(bytea, text) RETURNS bytea +AS 'MODULE_PATHNAME', 'pgre2_extract_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extractall(bytea, text) RETURNS bytea[] +AS 'MODULE_PATHNAME', 'pgre2_extractall_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2regexpextract(bytea, text, int DEFAULT 1) RETURNS bytea +AS 'MODULE_PATHNAME', 'pgre2_regexpextract_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extractgroups(bytea, text) RETURNS bytea[] +AS 'MODULE_PATHNAME', 'pgre2_extractgroups_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2extractallgroupshorizontal(bytea, text) RETURNS bytea[] +AS 'MODULE_PATHNAME', 'pgre2_extractallgroupshorizontal_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION 
re2extractallgroupsvertical(bytea, text) RETURNS bytea[] +AS 'MODULE_PATHNAME', 'pgre2_extractallgroupsvertical_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2regexpquotemeta(bytea) RETURNS bytea +AS 'MODULE_PATHNAME', 'pgre2_regexpquotemeta_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2splitbyregexp(bytea, text, int DEFAULT 0) RETURNS bytea[] +AS 'MODULE_PATHNAME', 'pgre2_splitbyregexp_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2replaceregexpone(bytea, text, text) RETURNS bytea +AS 'MODULE_PATHNAME', 'pgre2_replaceregexpone_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2replaceregexpall(bytea, text, text) RETURNS bytea +AS 'MODULE_PATHNAME', 'pgre2_replaceregexpall_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2countmatches(bytea, text) RETURNS integer +AS 'MODULE_PATHNAME', 'pgre2_countmatches_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2countmatchescaseinsensitive(bytea, text) RETURNS integer +AS 'MODULE_PATHNAME', 'pgre2_countmatchescaseinsensitive_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2multimatchany(bytea, VARIADIC text[]) RETURNS boolean +AS 'MODULE_PATHNAME', 'pgre2_multimatchany_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2multimatchanyindex(bytea, VARIADIC text[]) RETURNS integer +AS 'MODULE_PATHNAME', 'pgre2_multimatchanyindex_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION re2multimatchallindices(bytea, VARIADIC text[]) RETURNS integer[] +AS 'MODULE_PATHNAME', 'pgre2_multimatchallindices_bytea' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; diff --git a/src/pg_re2.c b/src/pg_re2.c index 5ba5bed..0dc0f64 100644 --- a/src/pg_re2.c +++ b/src/pg_re2.c @@ -18,16 +18,16 @@ PG_MODULE_MAGIC; #endif /* build text datum from span (single palloc) */ -static text * +static Datum span_to_text(re2_span s) { if (s.data) - return 
cstring_to_text_with_len(s.data, (int)s.len); - return cstring_to_text_with_len("", 0); + return PointerGetDatum(cstring_to_text_with_len(s.data, (int)s.len)); + return PointerGetDatum(cstring_to_text_with_len("", 0)); } /* build bytea datum from span (single palloc) */ -static bytea * +static Datum span_to_bytea(re2_span s) { size_t len = s.data ? s.len : 0; @@ -36,7 +36,7 @@ span_to_bytea(re2_span s) SET_VARSIZE(result, len + VARHDRSZ); if (len > 0) memcpy(VARDATA(result), s.data, len); - return result; + return PointerGetDatum(result); } static re2_pattern * @@ -87,7 +87,7 @@ pgre2_extract(PG_FUNCTION_ARGS) text *haystack = PG_GETARG_TEXT_PP(0); re2_pattern *pat = compile_arg(PG_GETARG_TEXT_PP(1)); - PG_RETURN_TEXT_P(span_to_text(re2_extract(pat, VARDATA_ANY(haystack), VARSIZE_ANY_EXHDR(haystack)))); + PG_RETURN_DATUM(span_to_text(re2_extract(pat, VARDATA_ANY(haystack), VARSIZE_ANY_EXHDR(haystack)))); } PG_FUNCTION_INFO_V1(pgre2_extractall); @@ -113,7 +113,7 @@ pgre2_extractall(PG_FUNCTION_ARGS) elems = (Datum *)palloc(count * sizeof(Datum)); for (int i = 0; i < count; i++) - elems[i] = PointerGetDatum(span_to_text(spans[i])); + elems[i] = span_to_text(spans[i]); arr = construct_array(elems, count, TEXTOID, -1, false, TYPALIGN_INT); PG_RETURN_ARRAYTYPE_P(arr); @@ -134,7 +134,7 @@ pgre2_regexpextract(PG_FUNCTION_ARGS) if (errbuf[0] != '\0') ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("%s", errbuf))); - PG_RETURN_TEXT_P(span_to_text(s)); + PG_RETURN_DATUM(span_to_text(s)); } PG_FUNCTION_INFO_V1(pgre2_extractgroups); @@ -163,12 +163,215 @@ pgre2_extractgroups(PG_FUNCTION_ARGS) elems = (Datum *)palloc(count * sizeof(Datum)); for (int i = 0; i < count; i++) - elems[i] = PointerGetDatum(span_to_text(spans[i])); + elems[i] = span_to_text(spans[i]); arr = construct_array(elems, count, TEXTOID, -1, false, TYPALIGN_INT); PG_RETURN_ARRAYTYPE_P(arr); } +/* + * Build a 2D ArrayType from element-builder fn applied to spans. 
+ * spans: row-major (match × group). vertical=true builds (match, group); else (group, match). + * Empty result yields a 0-D empty array (postgres can't represent shape (k, 0)). + */ +typedef Datum (*span_to_datum_fn)(re2_span s); + +static ArrayType * +build_groups_2d(re2_span *spans, int matches, int ngroups, bool vertical, span_to_datum_fn build, Oid elemoid) +{ + Datum *elems; + int total = matches * ngroups; + int dims[2]; + int lbs[2] = { 1, 1 }; + + if (total == 0) + return construct_empty_array(elemoid); + + elems = (Datum *)palloc(total * sizeof(Datum)); + + if (vertical) + { + dims[0] = matches; + dims[1] = ngroups; + for (int i = 0; i < total; i++) + elems[i] = build(spans[i]); + } + else + { + dims[0] = ngroups; + dims[1] = matches; + for (int g = 0; g < ngroups; g++) + for (int m = 0; m < matches; m++) + elems[g * matches + m] = build(spans[m * ngroups + g]); + } + + return construct_md_array(elems, NULL, 2, dims, lbs, elemoid, -1, false, TYPALIGN_INT); +} + +static ArrayType * +extractallgroups_common(text *haystack_va, text *pattern, bool vertical, bool as_bytea) +{ + re2_pattern *pat = compile_arg(pattern); + const char *hdata = VARDATA_ANY(haystack_va); + size_t hlen = VARSIZE_ANY_EXHDR(haystack_va); + char errbuf[RE2_ERRBUF_SIZE]; + int matches; + int ngroups; + re2_span *spans; + + errbuf[0] = '\0'; + spans = re2_extract_all_groups(pat, hdata, hlen, &matches, &ngroups, errbuf, sizeof(errbuf)); + + if (errbuf[0] != '\0') + ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), errmsg("%s", errbuf))); + + if (!spans || matches == 0) + return construct_empty_array(as_bytea ? BYTEAOID : TEXTOID); + + return build_groups_2d(spans, matches, ngroups, vertical, as_bytea ? span_to_bytea : span_to_text, + as_bytea ? 
BYTEAOID : TEXTOID); +} + +PG_FUNCTION_INFO_V1(pgre2_extractallgroupshorizontal); +Datum +pgre2_extractallgroupshorizontal(PG_FUNCTION_ARGS) +{ + PG_RETURN_ARRAYTYPE_P(extractallgroups_common(PG_GETARG_TEXT_PP(0), PG_GETARG_TEXT_PP(1), false, false)); +} + +PG_FUNCTION_INFO_V1(pgre2_extractallgroupsvertical); +Datum +pgre2_extractallgroupsvertical(PG_FUNCTION_ARGS) +{ + PG_RETURN_ARRAYTYPE_P(extractallgroups_common(PG_GETARG_TEXT_PP(0), PG_GETARG_TEXT_PP(1), true, false)); +} + +/* + * Escape regex metacharacters per ClickHouse semantics: + * \0 \\ | ( ) ^ $ . [ ] ? * + { : - + * (Slightly differs from re2::RE2::QuoteMeta which uses \xNN for control bytes.) + * Source: ClickHouse/src/Functions/regexpQuoteMeta.cpp + */ +static void * +quotemeta_impl(const char *src, size_t slen) +{ + void *out = palloc(2 * slen + VARHDRSZ); + char *dst = VARDATA(out); + size_t dlen = 0; + + for (size_t i = 0; i < slen; i++) + { + char c = src[i]; + + switch (c) + { + case '\0': + case '\\': + case '|': + case '(': + case ')': + case '^': + case '$': + case '.': + case '[': + case ']': + case '?': + case '*': + case '+': + case '{': + case ':': + case '-': + dst[dlen++] = '\\'; + break; + default: + break; + } + dst[dlen++] = c; + } + SET_VARSIZE(out, dlen + VARHDRSZ); + return out; +} + +PG_FUNCTION_INFO_V1(pgre2_regexpquotemeta); +Datum +pgre2_regexpquotemeta(PG_FUNCTION_ARGS) +{ + text *input = PG_GETARG_TEXT_PP(0); + + PG_RETURN_TEXT_P(quotemeta_impl(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input))); +} + +/* splitByRegexp empty-pattern path: each input byte becomes its own element. */ +static ArrayType * +split_chars(const char *hdata, size_t hlen, int max_splits, bool as_bytea) +{ + int n = (max_splits > 0 && (size_t)max_splits < hlen) ? max_splits : (int)hlen; + Datum *elems; + ArrayType *arr; + + if (n == 0) + return construct_empty_array(as_bytea ? 
BYTEAOID : TEXTOID); + + elems = (Datum *)palloc(n * sizeof(Datum)); + for (int i = 0; i < n; i++) + { + re2_span s = { hdata + i, 1 }; + + elems[i] = as_bytea ? span_to_bytea(s) : span_to_text(s); + } + + arr = construct_array(elems, n, as_bytea ? BYTEAOID : TEXTOID, -1, false, TYPALIGN_INT); + return arr; +} + +/* + * splitByRegexp(haystack, pattern, max_splits=0). Empty pattern splits per byte; + * otherwise re2_split emits substrings between matches. max_splits 0 = unlimited. + */ +static ArrayType * +splitbyregexp_common(text *haystack_va, text *pattern, int max_splits, bool as_bytea) +{ + const char *hdata = VARDATA_ANY(haystack_va); + size_t hlen = VARSIZE_ANY_EXHDR(haystack_va); + size_t plen = VARSIZE_ANY_EXHDR(pattern); + + if (plen == 0) + return split_chars(hdata, hlen, max_splits, as_bytea); + + { + re2_pattern *pat = compile_arg(pattern); + char errbuf[RE2_ERRBUF_SIZE]; + int count; + re2_span *spans; + Datum *elems; + ArrayType *arr; + + errbuf[0] = '\0'; + spans = re2_split(pat, hdata, hlen, max_splits, &count, errbuf, sizeof(errbuf)); + if (errbuf[0] != '\0') + ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("%s", errbuf))); + + if (!spans || count == 0) + return construct_empty_array(as_bytea ? BYTEAOID : TEXTOID); + + elems = (Datum *)palloc(count * sizeof(Datum)); + for (int i = 0; i < count; i++) + elems[i] = as_bytea ? span_to_bytea(spans[i]) : span_to_text(spans[i]); + + arr = construct_array(elems, count, as_bytea ? BYTEAOID : TEXTOID, -1, false, TYPALIGN_INT); + return arr; + } +} + +PG_FUNCTION_INFO_V1(pgre2_splitbyregexp); +Datum +pgre2_splitbyregexp(PG_FUNCTION_ARGS) +{ + int max_splits = PG_NARGS() >= 3 && !PG_ARGISNULL(2) ? 
PG_GETARG_INT32(2) : 0; + + PG_RETURN_ARRAYTYPE_P(splitbyregexp_common(PG_GETARG_TEXT_PP(0), PG_GETARG_TEXT_PP(1), max_splits, false)); +} + PG_FUNCTION_INFO_V1(pgre2_replaceregexpone); Datum pgre2_replaceregexpone(PG_FUNCTION_ARGS) @@ -336,7 +539,7 @@ pgre2_extract_bytea(PG_FUNCTION_ARGS) bytea *haystack = PG_GETARG_BYTEA_PP(0); re2_pattern *pat = compile_arg(PG_GETARG_TEXT_PP(1)); - PG_RETURN_BYTEA_P(span_to_bytea(re2_extract(pat, VARDATA_ANY(haystack), VARSIZE_ANY_EXHDR(haystack)))); + PG_RETURN_DATUM(span_to_bytea(re2_extract(pat, VARDATA_ANY(haystack), VARSIZE_ANY_EXHDR(haystack)))); } PG_FUNCTION_INFO_V1(pgre2_extractall_bytea); @@ -362,7 +565,7 @@ pgre2_extractall_bytea(PG_FUNCTION_ARGS) elems = (Datum *)palloc(count * sizeof(Datum)); for (int i = 0; i < count; i++) - elems[i] = PointerGetDatum(span_to_bytea(spans[i])); + elems[i] = span_to_bytea(spans[i]); arr = construct_array(elems, count, BYTEAOID, -1, false, TYPALIGN_INT); PG_RETURN_ARRAYTYPE_P(arr); @@ -383,7 +586,7 @@ pgre2_regexpextract_bytea(PG_FUNCTION_ARGS) if (errbuf[0] != '\0') ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("%s", errbuf))); - PG_RETURN_BYTEA_P(span_to_bytea(s)); + PG_RETURN_DATUM(span_to_bytea(s)); } PG_FUNCTION_INFO_V1(pgre2_extractgroups_bytea); @@ -412,12 +615,44 @@ pgre2_extractgroups_bytea(PG_FUNCTION_ARGS) elems = (Datum *)palloc(count * sizeof(Datum)); for (int i = 0; i < count; i++) - elems[i] = PointerGetDatum(span_to_bytea(spans[i])); + elems[i] = span_to_bytea(spans[i]); arr = construct_array(elems, count, BYTEAOID, -1, false, TYPALIGN_INT); PG_RETURN_ARRAYTYPE_P(arr); } +PG_FUNCTION_INFO_V1(pgre2_extractallgroupshorizontal_bytea); +Datum +pgre2_extractallgroupshorizontal_bytea(PG_FUNCTION_ARGS) +{ + PG_RETURN_ARRAYTYPE_P(extractallgroups_common((text *)PG_GETARG_BYTEA_PP(0), PG_GETARG_TEXT_PP(1), false, true)); +} + +PG_FUNCTION_INFO_V1(pgre2_extractallgroupsvertical_bytea); +Datum +pgre2_extractallgroupsvertical_bytea(PG_FUNCTION_ARGS) +{ + 
PG_RETURN_ARRAYTYPE_P(extractallgroups_common((text *)PG_GETARG_BYTEA_PP(0), PG_GETARG_TEXT_PP(1), true, true)); +} + +PG_FUNCTION_INFO_V1(pgre2_regexpquotemeta_bytea); +Datum +pgre2_regexpquotemeta_bytea(PG_FUNCTION_ARGS) +{ + bytea *input = PG_GETARG_BYTEA_PP(0); + + PG_RETURN_BYTEA_P(quotemeta_impl(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input))); +} + +PG_FUNCTION_INFO_V1(pgre2_splitbyregexp_bytea); +Datum +pgre2_splitbyregexp_bytea(PG_FUNCTION_ARGS) +{ + int max_splits = PG_NARGS() >= 3 && !PG_ARGISNULL(2) ? PG_GETARG_INT32(2) : 0; + + PG_RETURN_ARRAYTYPE_P(splitbyregexp_common((text *)PG_GETARG_BYTEA_PP(0), PG_GETARG_TEXT_PP(1), max_splits, true)); +} + PG_FUNCTION_INFO_V1(pgre2_replaceregexpone_bytea); Datum pgre2_replaceregexpone_bytea(PG_FUNCTION_ARGS) diff --git a/src/re2_wrapper.cpp b/src/re2_wrapper.cpp index 88de54f..3a6c299 100644 --- a/src/re2_wrapper.cpp +++ b/src/re2_wrapper.cpp @@ -229,6 +229,130 @@ re2_extract_groups(const re2_pattern *pat, const char *text, size_t text_len, in return out; } +re2_span * +re2_extract_all_groups(const re2_pattern *pat, const char *text, size_t text_len, int *match_count, int *ngroups_out, + char *errbuf, size_t errbuf_size) +{ + int ngroups = pat->re.NumberOfCapturingGroups(); + + *match_count = 0; + *ngroups_out = ngroups; + + if (ngroups == 0) + { + snprintf(errbuf, errbuf_size, "pattern has no capturing groups"); + return NULL; + } + + errbuf[0] = '\0'; + + re2::StringPiece input(text, text_len); + std::vector<re2_span> spans; + std::vector<re2::StringPiece> sub(ngroups + 1); + size_t pos = 0; + + try + { + while (pos <= text_len) + { + if (!pat->re.Match(input, pos, text_len, re2::RE2::UNANCHORED, sub.data(), ngroups + 1)) + break; + + for (int g = 1; g <= ngroups; g++) + { + re2::StringPiece &sp = sub[g]; + re2_span s; + s.data = sp.data(); + s.len = sp.data() ? sp.size() : 0; + spans.push_back(s); + } + + size_t match_end = (sub[0].data() - text) + sub[0].size(); + pos = match_end > pos ?
match_end : pos + 1; + } + } + catch (std::bad_alloc &) + { + snprintf(errbuf, errbuf_size, "out of memory"); + return NULL; + } + + if (spans.empty()) + return NULL; + + re2_span *out = (re2_span *)palloc_extended(spans.size() * sizeof(re2_span), MCXT_ALLOC_NO_OOM); + if (!out) + { + snprintf(errbuf, errbuf_size, "out of memory"); + return NULL; + } + memcpy(out, spans.data(), spans.size() * sizeof(re2_span)); + *match_count = (int)(spans.size() / ngroups); + return out; +} + +re2_span * +re2_split(const re2_pattern *pat, const char *text, size_t text_len, int max_splits, int *count, char *errbuf, + size_t errbuf_size) +{ + std::vector<re2_span> spans; + + errbuf[0] = '\0'; + *count = 0; + + try + { + re2::StringPiece input(text, text_len); + size_t pos = 0; + int splits = 0; + bool done = false; + + while (!done) + { + if (max_splits > 0 && splits >= max_splits) + break; + + re2::StringPiece m; + if (!pat->re.Match(input, pos, text_len, re2::RE2::UNANCHORED, &m, 1) || m.size() == 0) + { + re2_span s; + s.data = text + pos; + s.len = text_len - pos; + spans.push_back(s); + done = true; + } + else + { + size_t match_start = m.data() - text; + re2_span s; + s.data = text + pos; + s.len = match_start - pos; + spans.push_back(s); + pos = match_start + m.size(); + splits++; + } + } + } + catch (std::bad_alloc &) + { + snprintf(errbuf, errbuf_size, "out of memory"); + return NULL; + } + + if (spans.empty()) + return NULL; + + re2_span *out = (re2_span *)palloc_extended(spans.size() * sizeof(re2_span), MCXT_ALLOC_NO_OOM); + if (!out) + { + snprintf(errbuf, errbuf_size, "out of memory"); + return NULL; + } + memcpy(out, spans.data(), spans.size() * sizeof(re2_span)); + *count = (int)spans.size(); + return out; +} + static bool validate_rewrite(const re2_pattern *pat, const char *repl, size_t repl_len, char *errbuf, size_t errbuf_size) { diff --git a/src/re2_wrapper.h b/src/re2_wrapper.h index bab91f4..68bb937 100644 --- a/src/re2_wrapper.h +++ b/src/re2_wrapper.h @@ -37,6 +37,21 @@
extern "C" re2_span *re2_extract_groups(const re2_pattern *pat, const char *text, size_t text_len, int *count, char *errbuf, size_t errbuf_size); + /* + * Returns flat row-major array of match_count * ngroups spans (group #0 excluded). + * Sets *match_count and *ngroups_out. On error sets errbuf, returns NULL. + * If pattern has no capture groups, sets errbuf accordingly. + */ + re2_span *re2_extract_all_groups(const re2_pattern *pat, const char *text, size_t text_len, int *match_count, + int *ngroups_out, char *errbuf, size_t errbuf_size); + + /* + * Splits text by pattern. Returns spans of substrings between matches. + * If max_splits > 0, caps the number of returned tokens at max_splits. + */ + re2_span *re2_split(const re2_pattern *pat, const char *text, size_t text_len, int max_splits, int *count, + char *errbuf, size_t errbuf_size); + /* returns palloc'd varlena, caller casts to text* or bytea* */ void *re2_replace_one(const re2_pattern *pat, const char *text, size_t text_len, const char *repl, size_t repl_len, char *errbuf, size_t errbuf_size); diff --git a/test/expected/re2ch.out b/test/expected/re2ch.out index 9588e47..ab67d29 100644 --- a/test/expected/re2ch.out +++ b/test/expected/re2ch.out @@ -174,14 +174,12 @@ SELECT re2regexpextract('100', NULL, 1) IS NULL AS re_null2; (1 row) -- regexpextract errors -\set ON_ERROR_STOP off SELECT re2regexpextract('100-200', '(\d+)-(\d+)', 3); -- out of range ERROR: group index 3 out of range [0, 2] SELECT re2regexpextract('100-200', '(\d+)-(\d+)', -1); -- negative ERROR: group index -1 out of range [0, 2] SELECT re2regexpextract('100-200', '\d+-\d+', 1); -- no groups + index 1 ERROR: group index 1 out of range [0, 0] -\set ON_ERROR_STOP on -- extractgroups SELECT re2extractgroups('hello world', '(\w+) (\w+)'); re2extractgroups @@ -208,10 +206,180 @@ SELECT re2extractgroups(NULL, '(\d+)') IS NULL AS eg_null; (1 row) -- extractgroups errors -\set ON_ERROR_STOP off SELECT re2extractgroups('hello', '\w+'); -- no capture 
groups ERROR: pattern has no capturing groups -\set ON_ERROR_STOP on +-- extractallgroupsvertical / horizontal +SELECT re2extractallgroupsvertical('abc=111, def=222, ghi=333', '("[^"]+"|\w+)=("[^"]+"|\w+)'); + re2extractallgroupsvertical +--------------------------------- + {{abc,111},{def,222},{ghi,333}} +(1 row) + +SELECT re2extractallgroupshorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\w+)=("[^"]+"|\w+)'); + re2extractallgroupshorizontal +------------------------------- + {{abc,def,ghi},{111,222,333}} +(1 row) + +SELECT re2extractallgroupsvertical('2024-01-15 2025-06-30', '(\d{4})-(\d{2})-(\d{2})'); + re2extractallgroupsvertical +----------------------------- + {{2024,01,15},{2025,06,30}} +(1 row) + +SELECT re2extractallgroupshorizontal('2024-01-15 2025-06-30', '(\d{4})-(\d{2})-(\d{2})'); + re2extractallgroupshorizontal +------------------------------- + {{2024,2025},{01,06},{15,30}} +(1 row) + +SELECT re2extractallgroupsvertical('no match', '(\d+)'); -- empty array + re2extractallgroupsvertical +----------------------------- + {} +(1 row) + +SELECT re2extractallgroupshorizontal('no match', '(\d+)'); -- empty array + re2extractallgroupshorizontal +------------------------------- + {} +(1 row) + +SELECT re2extractallgroupsvertical(NULL, '(\d+)') IS NULL AS eav_null; + eav_null +---------- + t +(1 row) + +SELECT re2extractallgroupshorizontal(NULL, '(\d+)') IS NULL AS eah_null; + eah_null +---------- + t +(1 row) + +-- extractallgroups errors +SELECT re2extractallgroupsvertical('hello', '\w+'); -- no capture groups +ERROR: pattern has no capturing groups +SELECT re2extractallgroupshorizontal('hello', '\w+'); -- no capture groups +ERROR: pattern has no capturing groups +-- regexpquotemeta +SELECT re2regexpquotemeta('Hello. [World]? (Yes)*'); + re2regexpquotemeta +------------------------------- + Hello\. \[World\]\? \(Yes\)\* +(1 row) + +SELECT re2regexpquotemeta('a+b*c?'); + re2regexpquotemeta +-------------------- + a\+b\*c\? 
+(1 row) + +SELECT re2regexpquotemeta('plain text'); -- no metas + re2regexpquotemeta +-------------------- + plain text +(1 row) + +SELECT re2regexpquotemeta(''); -- empty + re2regexpquotemeta +-------------------- + +(1 row) + +SELECT re2regexpquotemeta('a-b:c{d}|e^f$g\h'); + re2regexpquotemeta +------------------------- + a\-b\:c\{d}\|e\^f\$g\\h +(1 row) + +-- escaped pattern roundtrips: re2match(s, re2regexpquotemeta(s)) is true +SELECT re2match('1+1=2', re2regexpquotemeta('1+1')); + re2match +---------- + t +(1 row) + +SELECT re2regexpquotemeta(NULL) IS NULL AS rqm_null; + rqm_null +---------- + t +(1 row) + +-- splitbyregexp +SELECT re2splitbyregexp('a12bc23de345f', '\d+'); -- digit splitter + re2splitbyregexp +------------------ + {a,bc,de,f} +(1 row) + +SELECT re2splitbyregexp('abcde', ''); -- empty pattern: per char + re2splitbyregexp +------------------ + {a,b,c,d,e} +(1 row) + +SELECT re2splitbyregexp('a,b,c', ','); -- char delimiter + re2splitbyregexp +------------------ + {a,b,c} +(1 row) + +SELECT re2splitbyregexp(',a,b,', ','); -- leading/trailing splits + re2splitbyregexp +------------------ + {"",a,b,""} +(1 row) + +SELECT re2splitbyregexp('abc', ','); -- no match: whole string + re2splitbyregexp +------------------ + {abc} +(1 row) + +SELECT re2splitbyregexp('', ','); -- empty haystack + re2splitbyregexp +------------------ + {""} +(1 row) + +SELECT re2splitbyregexp('', ''); -- both empty + re2splitbyregexp +------------------ + {} +(1 row) + +SELECT re2splitbyregexp('a,b,c,d', ',', 2); -- max_substrings cap + re2splitbyregexp +------------------ + {a,b} +(1 row) + +SELECT re2splitbyregexp('a,b,c,d', ',', 0); -- 0 = unlimited + re2splitbyregexp +------------------ + {a,b,c,d} +(1 row) + +SELECT re2splitbyregexp('abcdef', '', 3); -- empty pat + cap + re2splitbyregexp +------------------ + {a,b,c} +(1 row) + +-- CH: zero-length match (e.g. 
'a*') treated as no-match +SELECT re2splitbyregexp('foo', 'x*'); + re2splitbyregexp +------------------ + {foo} +(1 row) + +SELECT re2splitbyregexp(NULL, ',') IS NULL AS spr_null; + spr_null +---------- + t +(1 row) + -- replaceregexpone SELECT re2replaceregexpone('Hello', 'l', 'x'); -- first only re2replaceregexpone @@ -264,10 +432,8 @@ SELECT re2replaceregexpone(NULL, '\d+', 'x') IS NULL AS rp1_null; (1 row) -- replaceregexpone error: invalid backref -\set ON_ERROR_STOP off SELECT re2replaceregexpone('Hello', 'l', '\1'); -- \1: backref beyond 0 group(s) ERROR: \1: backref beyond 0 group(s) -\set ON_ERROR_STOP on -- replaceregexpall SELECT re2replaceregexpall('Hello', 'l', 'x'); -- all occurrences re2replaceregexpall @@ -498,10 +664,8 @@ SELECT re2multimatchallindices('test', '\d+', '[A-Z]+'); (1 row) -- invalid pattern -\set ON_ERROR_STOP off SELECT re2match('hello', '[invalid'); ERROR: invalid RE2 pattern: missing ]: [invalid -\set ON_ERROR_STOP on -- ==== bytea overloads (zero-byte handling, CH tests 01083/01085) ==== -- match with \0 in haystack (CH: match('\0 key="v" ', 'key="(.*?)"') -> 1) SELECT re2match('\x00'::bytea || ' key="v" '::bytea, 'key="(.*?)"'); @@ -583,3 +747,30 @@ SELECT re2multimatchany('a'::bytea || '\x00'::bytea || 'key="v"'::bytea, 'key', t (1 row) +-- extractallgroups with \0 +SELECT re2extractallgroupsvertical('a'::bytea || '\x00'::bytea || 'k1=v1 k2=v2'::bytea, '(\w+)=(\w+)'); + re2extractallgroupsvertical +----------------------------------------------- + {{"\\x6b31","\\x7631"},{"\\x6b32","\\x7632"}} +(1 row) + +SELECT re2extractallgroupshorizontal('a'::bytea || '\x00'::bytea || 'k1=v1 k2=v2'::bytea, '(\w+)=(\w+)'); + re2extractallgroupshorizontal +----------------------------------------------- + {{"\\x6b31","\\x6b32"},{"\\x7631","\\x7632"}} +(1 row) + +-- regexpquotemeta with \0 +SELECT re2regexpquotemeta('a'::bytea || '\x00'::bytea || '.b'::bytea); + re2regexpquotemeta +-------------------- + \x615c005c2e62 +(1 row) + +-- 
splitbyregexp with \0 +SELECT re2splitbyregexp('a'::bytea || '\x00'::bytea || 'b'::bytea || '\x00'::bytea || 'c'::bytea, '\x00'); + re2splitbyregexp +--------------------------- + {"\\x61","\\x62","\\x63"} +(1 row) + diff --git a/test/sql/re2ch.sql b/test/sql/re2ch.sql index 160c706..4112639 100644 --- a/test/sql/re2ch.sql +++ b/test/sql/re2ch.sql @@ -39,11 +39,9 @@ SELECT re2regexpextract(NULL, '(\d+)', 1) IS NULL AS re_null1; SELECT re2regexpextract('100', NULL, 1) IS NULL AS re_null2; -- regexpextract errors -\set ON_ERROR_STOP off SELECT re2regexpextract('100-200', '(\d+)-(\d+)', 3); -- out of range SELECT re2regexpextract('100-200', '(\d+)-(\d+)', -1); -- negative SELECT re2regexpextract('100-200', '\d+-\d+', 1); -- no groups + index 1 -\set ON_ERROR_STOP on -- extractgroups SELECT re2extractgroups('hello world', '(\w+) (\w+)'); @@ -52,9 +50,46 @@ SELECT re2extractgroups('no match', '(\d{4})-(\d{2})-(\d{2})'); -- no match: em SELECT re2extractgroups(NULL, '(\d+)') IS NULL AS eg_null; -- extractgroups errors -\set ON_ERROR_STOP off SELECT re2extractgroups('hello', '\w+'); -- no capture groups -\set ON_ERROR_STOP on + +-- extractallgroupsvertical / horizontal +SELECT re2extractallgroupsvertical('abc=111, def=222, ghi=333', '("[^"]+"|\w+)=("[^"]+"|\w+)'); +SELECT re2extractallgroupshorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\w+)=("[^"]+"|\w+)'); +SELECT re2extractallgroupsvertical('2024-01-15 2025-06-30', '(\d{4})-(\d{2})-(\d{2})'); +SELECT re2extractallgroupshorizontal('2024-01-15 2025-06-30', '(\d{4})-(\d{2})-(\d{2})'); +SELECT re2extractallgroupsvertical('no match', '(\d+)'); -- empty array +SELECT re2extractallgroupshorizontal('no match', '(\d+)'); -- empty array +SELECT re2extractallgroupsvertical(NULL, '(\d+)') IS NULL AS eav_null; +SELECT re2extractallgroupshorizontal(NULL, '(\d+)') IS NULL AS eah_null; + +-- extractallgroups errors +SELECT re2extractallgroupsvertical('hello', '\w+'); -- no capture groups +SELECT 
re2extractallgroupshorizontal('hello', '\w+'); -- no capture groups + +-- regexpquotemeta +SELECT re2regexpquotemeta('Hello. [World]? (Yes)*'); +SELECT re2regexpquotemeta('a+b*c?'); +SELECT re2regexpquotemeta('plain text'); -- no metas +SELECT re2regexpquotemeta(''); -- empty +SELECT re2regexpquotemeta('a-b:c{d}|e^f$g\h'); +-- escaped pattern roundtrips: re2match(s, re2regexpquotemeta(s)) is true +SELECT re2match('1+1=2', re2regexpquotemeta('1+1')); +SELECT re2regexpquotemeta(NULL) IS NULL AS rqm_null; + +-- splitbyregexp +SELECT re2splitbyregexp('a12bc23de345f', '\d+'); -- digit splitter +SELECT re2splitbyregexp('abcde', ''); -- empty pattern: per char +SELECT re2splitbyregexp('a,b,c', ','); -- char delimiter +SELECT re2splitbyregexp(',a,b,', ','); -- leading/trailing splits +SELECT re2splitbyregexp('abc', ','); -- no match: whole string +SELECT re2splitbyregexp('', ','); -- empty haystack +SELECT re2splitbyregexp('', ''); -- both empty +SELECT re2splitbyregexp('a,b,c,d', ',', 2); -- max_substrings cap +SELECT re2splitbyregexp('a,b,c,d', ',', 0); -- 0 = unlimited +SELECT re2splitbyregexp('abcdef', '', 3); -- empty pat + cap +-- CH: zero-length match (e.g. 
'a*') treated as no-match +SELECT re2splitbyregexp('foo', 'x*'); +SELECT re2splitbyregexp(NULL, ',') IS NULL AS spr_null; -- replaceregexpone SELECT re2replaceregexpone('Hello', 'l', 'x'); -- first only @@ -69,9 +104,7 @@ SELECT re2replaceregexpone(',,1', '^[,]*|[,]*$', ''); SELECT re2replaceregexpone(NULL, '\d+', 'x') IS NULL AS rp1_null; -- replaceregexpone error: invalid backref -\set ON_ERROR_STOP off SELECT re2replaceregexpone('Hello', 'l', '\1'); -- \1: backref beyond 0 group(s) -\set ON_ERROR_STOP on -- replaceregexpall SELECT re2replaceregexpall('Hello', 'l', 'x'); -- all occurrences @@ -129,9 +162,7 @@ SELECT re2multimatchallindices('hello world', 'hello', '\d+', 'world', 'o'); SELECT re2multimatchallindices('test', '\d+', '[A-Z]+'); -- invalid pattern -\set ON_ERROR_STOP off SELECT re2match('hello', '[invalid'); -\set ON_ERROR_STOP on -- ==== bytea overloads (zero-byte handling, CH tests 01083/01085) ==== @@ -162,3 +193,13 @@ SELECT re2countmatches('a'::bytea || '\x00'::bytea || 'b'::bytea || '\x00'::byte -- multimatchany with \0 haystack SELECT re2multimatchany('a'::bytea || '\x00'::bytea || 'key="v"'::bytea, 'key', 'nope'); + +-- extractallgroups with \0 +SELECT re2extractallgroupsvertical('a'::bytea || '\x00'::bytea || 'k1=v1 k2=v2'::bytea, '(\w+)=(\w+)'); +SELECT re2extractallgroupshorizontal('a'::bytea || '\x00'::bytea || 'k1=v1 k2=v2'::bytea, '(\w+)=(\w+)'); + +-- regexpquotemeta with \0 +SELECT re2regexpquotemeta('a'::bytea || '\x00'::bytea || '.b'::bytea); + +-- splitbyregexp with \0 +SELECT re2splitbyregexp('a'::bytea || '\x00'::bytea || 'b'::bytea || '\x00'::bytea || 'c'::bytea, '\x00');