diff --git a/docs/source/contributor-guide/expression-audits/index.md b/docs/source/contributor-guide/expression-audits/index.md index c891a77b84..18b124cba1 100644 --- a/docs/source/contributor-guide/expression-audits/index.md +++ b/docs/source/contributor-guide/expression-audits/index.md @@ -40,6 +40,7 @@ map_funcs math_funcs misc_funcs predicate_funcs +string_funcs struct_funcs url_funcs window_funcs diff --git a/docs/source/contributor-guide/expression-audits/string_funcs.md b/docs/source/contributor-guide/expression-audits/string_funcs.md new file mode 100644 index 0000000000..cd58089c3a --- /dev/null +++ b/docs/source/contributor-guide/expression-audits/string_funcs.md @@ -0,0 +1,253 @@ + + +# string_funcs Expression Audits + +> Audit notes for expressions in this category that have been audited. Absence of an entry means the expression has not been audited yet, not that it is unsupported. See the user guide [Spark Expression Support] for current support status. + +## ascii + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringType -> IntegerType`; `nullSafeEval` returns `codePointAt(0)` of the first char, or `0` for the empty string. Wired via `CometScalarFunction("ascii")` and resolved to DataFusion `ascii` (`chars().next() as i32`); first-code-point semantics match for ASCII, BMP, and supplementary code points. +- Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`; behaviour unchanged for `UTF8_BINARY`. Comet does not propagate collation, so non-default collations may diverge silently (https://github.com/apache/datafusion-comet/issues/4496). + +## bit_length + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `(StringType|BinaryType) -> IntegerType`; eval returns `numBytes * 8` for strings and `.length * 8` for binary. 
+- Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). +- Known limitation: wired as a raw `CometScalarFunction("bit_length")` with no `BinaryType` guard. DataFusion's `BitLengthFunc` signature only accepts string types, so `bit_length(<binary>)` execute-fails on the native side instead of falling back cleanly (https://github.com/apache/datafusion-comet/issues/4464). + +## btrim + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringTrimBoth` is `RuntimeReplaceable` and rewritten to `StringTrim(srcStr, trimStr)` before serde runs. Support is provided by the `trim` entry; no dedicated serde registration. +- Spark 4.0.1 (audited 2026-05-27): `StringTrim` (the rewrite target) routes through `CollationSupport.StringTrim.exec` and uses `StringTypeNonCSAICollation(supportsTrimCollation = true)`; semantics unchanged for `UTF8_BINARY`. Non-default collations may diverge in Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## char + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `Chr(LongType) -> StringType`; `lon < 0` returns `""`, else `((lon & 0xFF) as char).toString` (so `chr(256)` and `chr(0)` both return `\u0000`). +- Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true`. Resolves natively to `datafusion_spark::function::string::char::CharFunc`, which mirrors Spark's negative-input and `& 0xFF` semantics. + +## char_length + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): registry alias of `Length`. Same support as `length`. +- Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Length`. 
+ +## character_length + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): registry alias of `Length`. Same support as `length`. +- Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Length`. + +## chr + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): registry alias of `Chr`. Same support as `char`. +- Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Chr`. + +## concat_ws + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `Seq[Expression] -> StringType`; NULL separator yields NULL, NULL element values are skipped, children can be `StringType` or `ArrayType(StringType)`. Comet serde rewrites a NULL-literal separator to a NULL of the result type and bails out on all-foldable inputs so Spark's `ConstantFolding` handles them; otherwise delegates to DataFusion `concat_ws`. +- Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeWithCollation` / `AbstractArrayType`; `dataType` becomes `children.head.dataType` (collation-derived). Semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## contains + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `UTF8String.contains` on `StringType`; the parser routes `(BinaryType, BinaryType)` to `BinaryPredicate`, so Comet only ever sees the String form. +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.Contains.exec(..., collationId)`; behaviour identical for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## decode + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. 
`StringDecode(bin, charset)` evaluated directly; invalid sequences silently substitute replacement characters via `new String(bytes, charset)`. +- Spark 4.0.1 (audited 2026-05-27): refactored to `RuntimeReplaceable` whose `replacement` is a `StaticInvoke(StringDecode.decode, bin, charset, legacyCharsets, legacyErrorAction)`; the 4-arg form raises on malformed input unless legacy flags are set. +- Known limitations: Comet handles `decode` via `CommonStringExprs.stringDecode` from the version shims (no `CometExpressionSerde[StringDecode]` registration, so the function does not surface in the auto-generated compatibility docs: https://github.com/apache/datafusion-comet/issues/4466). Only literal `charset = 'utf-8'` (case-insensitive) is supported; everything else falls back. The Spark 4.0 `legacyCharsets` / `legacyErrorAction` flags are ignored: Comet always lowers to `Cast(bin, StringType, TRY)`, so invalid UTF-8 yields NULL where Spark 3.x substitutes replacement characters and Spark 4.0 (non-legacy) raises (https://github.com/apache/datafusion-comet/issues/4465). + +## endswith + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `UTF8String.endsWith` on `StringType`; binary form routed to `BinaryPredicate` before Comet. +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.EndsWith.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## initcap + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `string.toLowerCase.toTitleCase` on `UTF8String`; word boundary is Java `Character.isWhitespace`. Comet routes to DataFusion `initcap`, which splits on `!is_alphanumeric()` (hyphens, apostrophes, and punctuation all split words), so Comet is unconditionally `Incompatible` (https://github.com/apache/datafusion-comet/issues/1052). 
+- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.InitCap.exec` (collation- and ICU-aware) and propagates `child.dataType`. Comet ignores collation; 3.x divergences persist plus collation/ICU mismatches (https://github.com/apache/datafusion-comet/issues/4496). + +## instr + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringInstr(str, substr) -> IntegerType`; returns `string.indexOf(sub, 0) + 1` (1-based, 0 when not found, 1 on empty substring). Resolves to DataFusion `strpos` (alias `instr`) with matching semantics. +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringInstr.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## lcase + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): registry alias of `Lower`. Same support as `lower`. +- Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Lower`. + +## left + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `RuntimeReplaceable` with `replacement = Substring(str, Literal(1), len)`; accepts `StringType` or `BinaryType` plus `IntegerType`. Comet serde rewrites to a `Substring` proto with `start=1, len=lenValue`. `getSupportLevel` declares `Unsupported` for non-literal `len` so the dispatcher falls back uniformly. +- Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened with `StringTypeWithCollation`; behaviour unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## len + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): registry alias of `Length`. Same support as `length`. +- Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Length`. 
+ +## length + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `(StringType|BinaryType) -> IntegerType`; eval returns `numChars` for strings and `.length` for binary. `BinaryType` input falls back via `Unsupported` (DataFusion's `character_length` accepts string types only). +- Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`; semantics unchanged. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## lower + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. JVM default-locale `toLowerCase` on `UTF8String`. Comet routes to DataFusion `lower` (Rust Unicode default case mapping, no locale awareness) and is unconditionally `Incompatible`; users opt in via the standard `spark.comet.expression.Lower.allowIncompatible=true`. +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.Lower.exec(v, collationId, useICU)` with `SQLConf.ICU_CASE_MAPPINGS_ENABLED`; `inputTypes` widened to `StringTypeWithCollation`. Comet ignores collation and ICU mode, so non-default collations or `ICU_CASE_MAPPINGS_ENABLED=true` diverge even after opting in (https://github.com/apache/datafusion-comet/issues/2190). + +## lpad + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringLPad(str, len, pad) -> StringType`; `len <= 0` returns the empty string, empty `pad` returns `str` unchanged, NULL inputs propagate. Comet serde requires `str` to be a column and `pad` to be a literal; otherwise falls back. +- Spark 4.0.1 (audited 2026-05-27): `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true`; `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`. Semantics unchanged for `UTF8_BINARY`. 
Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). +- Known limitation: `lpad(<binary>, ...)` is rewritten by Spark to `BinaryPad / StaticInvoke(ByteArray.lpad)` before serde runs and always falls back to Spark. + +## ltrim + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringTrimLeft` extends `String2TrimExpression`; no-arg form strips ASCII space `0x20` only. The two-arg parser form `ltrim(trimStr, srcStr)` is swapped to `(srcStr, Option(trimStr))` by Spark's secondary constructor, so children match DataFusion `ltrim(str, chars)`. +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringTrimLeft.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## octet_length + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `(StringType|BinaryType) -> IntegerType`; eval returns `numBytes` for strings and `.length` for binary. +- Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeWithCollation`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). +- Known limitation: wired as a raw `CometScalarFunction("octet_length")` with no `BinaryType` guard. DataFusion's `OctetLengthFunc` signature only accepts string types, so `octet_length(<binary>)` execute-fails on the native side instead of falling back cleanly (https://github.com/apache/datafusion-comet/issues/4464). + +## regexp_replace + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `RegExpReplace(subject, regexp, rep, pos)` with foldable `pos > 0`; uses Java `Pattern`. 
Comet supports only `pos = 1` (other offsets fall back) and injects a `'g'` flag because DataFusion's `regexp_replace` stops at the first match by default. +- Spark 4.0.1 (audited 2026-05-27): adds raw-string literal support at the parser level and `nullIntolerant: Boolean = true`; runtime semantics unchanged. +- Known limitation: regex semantics differ (Rust `regex` crate vs Java `Pattern`); `RegExp.isSupportedPattern` currently returns `false` for every pattern, so the path always requires `spark.comet.expression.regexp.allowIncompatible=true`. + +## repeat + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringRepeat(str, times)` with `nullSafeEval(s, n) = s.repeat(n)`; `UTF8String.repeat` returns the empty string for `n <= 0`. Comet casts `times` to `LongType` and delegates to DataFusion `repeat`, which mirrors Spark for negative counts. +- Spark 4.0.1 (audited 2026-05-27): adds `nullIntolerant: Boolean` field; `dataType` becomes `str.dataType` (collation-tracking). Semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## replace + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringReplace(src, search, replace)`; when `search` is empty, Spark returns `src` unchanged (short-circuit on `search.numBytes == 0`). DataFusion `replace` instead inserts `replace` between every character, so `CometStringReplace.getSupportLevel` marks `Incompatible(Some(reason))` when `search` is a literal empty string and falls back to Spark by default (https://github.com/apache/datafusion-comet/issues/4497). +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringReplace.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). 
+ +## right + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `RuntimeReplaceable` with `replacement = If(IsNull(str), null, If(len <= 0, "", Substring(str, -len, len)))`; accepts `StringType` plus `IntegerType`. Comet serde rewrites positive `len` to a `Substring` proto with `start=-len, len=len`; for `len <= 0` it builds an `If(IsNull(str), null, "")` proto chain to preserve NULL propagation. `getSupportLevel` declares `Unsupported` for non-literal `len` so the dispatcher falls back uniformly. +- Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened with collation; uses `UnaryMinus(len, failOnError = false)` to avoid integer-overflow exceptions on `len = Int.MinValue`. Semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## rpad + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringRPad(str, len, pad) -> StringType`; same edge-case behaviour as `lpad` (negative len, empty pad, NULL propagation). Comet serde requires column `str` and literal `pad`. +- Spark 4.0.1 (audited 2026-05-27): same evolution as `lpad`; default-pad literal type tightened; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). +- Known limitation: same `BinaryPad / StaticInvoke` rewrite as `lpad` causes `rpad(<binary>, ...)` to fall back. + +## rtrim + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringTrimRight` extends `String2TrimExpression`; semantically symmetric to `ltrim`. +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringTrimRight.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). 
+ +## space + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringSpace(IntegerType) -> StringType`; negative input yields the empty string. Resolves natively to `datafusion_spark::function::string::space::SparkSpace`. +- Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait replaced by `nullIntolerant: Boolean` override. + +## split + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringSplit(str, regex, limit)`; `limit > 0` permits at most `limit-1` splits, `limit <= 0` is unlimited. Comet registers `split` as a custom UDF (`native/spark-expr/src/string_funcs/split.rs`) using the Rust `regex` crate, and is unconditionally `Incompatible` due to regex-engine differences. +- Spark 4.0.1 (audited 2026-05-27): wraps the regex via `CollationSupport.collationAwareRegex` and changes `dataType` to `ArrayType(str.dataType, ...)`. Comet does not honour collation flags (https://github.com/apache/datafusion-comet/issues/4496). + +## startswith + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `UTF8String.startsWith` on `StringType`; binary form routed to `BinaryPredicate`. +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StartsWith.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## substr + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): registry alias of `Substring`. Same support as `substring`. +- Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Substring`. + +## substring + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `TernaryExpression`; two-arg form defaults `len = Integer.MAX_VALUE`; supports `StringType` and `BinaryType`. 
Comet serializes to a dedicated `Substring` proto. `getSupportLevel` declares `Unsupported` when either `pos` or `len` is not a `Literal` so the dispatcher falls back uniformly. +- Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened with `StringTypeWithCollation`; semantics unchanged for `UTF8_BINARY`. Native `SubstringExpr` implements Spark's negative-start clamping and is exercised against ASCII, multibyte UTF-8, emoji, decomposed and Telugu inputs. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## substring_index + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `TernaryExpression(StringType, StringType, IntegerType) -> StringType`. Comet casts `count` to `LongType` and delegates to DataFusion's `substr_index` UDF (alias `substring_index`). +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.SubstringIndex.exec` and propagates `strExpr.dataType`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## translate + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. `StringTranslate(src, from, to)`; `UTF8String.translate(dict)` is code-point based, and any character mapped explicitly to U+0000 in `to` is also deleted. +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringTranslate.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). +- Known divergence: DataFusion's `translate` is grapheme-based (Spark uses code points), and does not delete characters mapped to U+0000 in `to`. Currently the support level is `Compatible` (https://github.com/apache/datafusion-comet/issues/4463). + +## trim + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. 
+- Spark 3.5.8 (audited 2026-05-27): baseline. `StringTrim` no-arg form strips ASCII space `0x20` only (matches DataFusion `btrim`'s default); two-arg form's children are `(srcStr, trimStr)` after Spark's secondary-constructor swap. +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringTrim.exec` and uses `StringTypeNonCSAICollation`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + +## ucase + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): registry alias of `Upper`. Same support as `upper`. +- Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Upper`. + +## upper + +- Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. +- Spark 3.5.8 (audited 2026-05-27): baseline. JVM default-locale `toUpperCase` on `UTF8String`. Comet routes to DataFusion `upper` (Rust Unicode default case mapping, no locale awareness) and is unconditionally `Incompatible`; users opt in via the standard `spark.comet.expression.Upper.allowIncompatible=true`. +- Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.Upper.exec(v, collationId, useICU)` with `SQLConf.ICU_CASE_MAPPINGS_ENABLED`. Comet does not propagate collation or ICU mode; non-default collations or `ICU_CASE_MAPPINGS_ENABLED=true` diverge even after opting in (https://github.com/apache/datafusion-comet/issues/2190). 
+ +[Spark Expression Support]: ../../user-guide/latest/expressions.md diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 2c49114dd8..07a4ebca3f 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -203,7 +203,6 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[StringSplit] -> CometStringSplit, classOf[StringTranslate] -> CometScalarFunction("translate"), classOf[StringTrim] -> CometScalarFunction("trim"), - classOf[StringTrimBoth] -> CometScalarFunction("btrim"), classOf[StringTrimLeft] -> CometScalarFunction("ltrim"), classOf[StringTrimRight] -> CometScalarFunction("rtrim"), classOf[Left] -> CometLeft, diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index c9fc8a29fb..30146eea6d 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -349,18 +349,22 @@ object CometRLike extends CometExpressionSerde[RLike] { } } +private object PadReasons { + val literalStrReason = "Scalar values are not supported for the `str` argument." + val nonLiteralPadReason = "Only scalar values are supported for the `pad` argument." +} + object CometStringRPad extends CometExpressionSerde[StringRPad] { - override def getUnsupportedReasons(): Seq[String] = Seq( - "Scalar values are not supported for the `str` argument." 
+ - " Only scalar values are supported for the `pad` argument.") + override def getUnsupportedReasons(): Seq[String] = + Seq(PadReasons.literalStrReason, PadReasons.nonLiteralPadReason) override def getSupportLevel(expr: StringRPad): SupportLevel = { if (expr.str.isInstanceOf[Literal]) { - return Unsupported(Some("Scalar values are not supported for the str argument")) + return Unsupported(Some(PadReasons.literalStrReason)) } if (!expr.pad.isInstanceOf[Literal]) { - return Unsupported(Some("Only scalar values are supported for the pad argument")) + return Unsupported(Some(PadReasons.nonLiteralPadReason)) } Compatible() } @@ -380,16 +384,15 @@ object CometStringRPad extends CometExpressionSerde[StringRPad] { object CometStringLPad extends CometExpressionSerde[StringLPad] { - override def getUnsupportedReasons(): Seq[String] = Seq( - "Scalar values are not supported for the `str` argument." + - " Only scalar values are supported for the `pad` argument.") + override def getUnsupportedReasons(): Seq[String] = + Seq(PadReasons.literalStrReason, PadReasons.nonLiteralPadReason) override def getSupportLevel(expr: StringLPad): SupportLevel = { if (expr.str.isInstanceOf[Literal]) { - return Unsupported(Some("Scalar values are not supported for the str argument")) + return Unsupported(Some(PadReasons.literalStrReason)) } if (!expr.pad.isInstanceOf[Literal]) { - return Unsupported(Some("Only scalar values are supported for the pad argument")) + return Unsupported(Some(PadReasons.nonLiteralPadReason)) } Compatible() } diff --git a/spark/src/test/resources/sql-tests/expressions/string/lower_enabled.sql b/spark/src/test/resources/sql-tests/expressions/string/lower_enabled.sql index 0461fce735..10c188b441 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/lower_enabled.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/lower_enabled.sql @@ -15,8 +15,8 @@ -- specific language governing permissions and limitations -- under the License. 
--- Test lower() with case conversion enabled (happy path) --- Config: spark.comet.caseConversion.enabled=true +-- Test lower() with the standard allowIncompatible opt-in (happy path) +-- Config: spark.comet.expression.Lower.allowIncompatible=true statement CREATE TABLE test_lower_enabled(s string) USING parquet diff --git a/spark/src/test/resources/sql-tests/expressions/string/string_lpad.sql b/spark/src/test/resources/sql-tests/expressions/string/string_lpad.sql index 83d0ceaebb..c27d93de62 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/string_lpad.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/string_lpad.sql @@ -21,7 +21,7 @@ CREATE TABLE test_lpad(s string, len int, pad string) USING parquet statement INSERT INTO test_lpad VALUES ('hi', 5, 'x'), ('hello', 3, 'x'), ('hi', 5, 'xy'), ('', 3, 'a'), (NULL, 5, 'x'), ('hi', 0, 'x'), ('hi', -1, 'x') -query expect_fallback(Only scalar values are supported for the pad argument) +query expect_fallback(Only scalar values are supported for the `pad` argument) SELECT lpad(s, len, pad) FROM test_lpad query @@ -32,5 +32,5 @@ query SELECT lpad(s, 5, 'x') FROM test_lpad -- literal + literal + literal -query expect_fallback(Scalar values are not supported for the str argument) +query expect_fallback(Scalar values are not supported for the `str` argument) SELECT lpad('hi', 5, 'x'), lpad('hello', 3, 'x'), lpad('', 3, 'a'), lpad(NULL, 5, 'x') diff --git a/spark/src/test/resources/sql-tests/expressions/string/string_rpad.sql b/spark/src/test/resources/sql-tests/expressions/string/string_rpad.sql index 48d3fb6cec..4ea06c3b23 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/string_rpad.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/string_rpad.sql @@ -21,7 +21,7 @@ CREATE TABLE test_rpad(s string, len int, pad string) USING parquet statement INSERT INTO test_rpad VALUES ('hi', 5, 'x'), ('hello', 3, 'x'), ('hi', 5, 'xy'), ('', 3, 'a'), (NULL, 5, 'x'), ('hi', 0, 
'x'), ('hi', -1, 'x') -query expect_fallback(Only scalar values are supported for the pad argument) +query expect_fallback(Only scalar values are supported for the `pad` argument) SELECT rpad(s, len, pad) FROM test_rpad query @@ -32,5 +32,5 @@ query SELECT rpad(s, 5, 'x') FROM test_rpad -- literal + literal + literal -query expect_fallback(Scalar values are not supported for the str argument) +query expect_fallback(Scalar values are not supported for the `str` argument) SELECT rpad('hi', 5, 'x'), rpad('hello', 3, 'x'), rpad('', 3, 'a'), rpad(NULL, 5, 'x') diff --git a/spark/src/test/resources/sql-tests/expressions/string/upper_enabled.sql b/spark/src/test/resources/sql-tests/expressions/string/upper_enabled.sql index 95ad265229..e1035ab37f 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/upper_enabled.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/upper_enabled.sql @@ -15,8 +15,8 @@ -- specific language governing permissions and limitations -- under the License. 
--- Test upper() with case conversion enabled (happy path) --- Config: spark.comet.caseConversion.enabled=true +-- Test upper() with the standard allowIncompatible opt-in (happy path) +-- Config: spark.comet.expression.Upper.allowIncompatible=true statement CREATE TABLE test_upper_enabled(s string) USING parquet diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index 83eba85851..d07257442d 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -89,11 +89,11 @@ class CometStringExpressionSuite extends CometTestBase { } else if (isLiteralStr) { checkSparkAnswerAndFallbackReason( sql, - "Scalar values are not supported for the str argument") + "Scalar values are not supported for the `str` argument") } else if (!isLiteralPad) { checkSparkAnswerAndFallbackReason( sql, - "Only scalar values are supported for the pad argument") + "Only scalar values are supported for the `pad` argument") } else { checkSparkAnswerAndOperator(sql) } @@ -261,7 +261,9 @@ class CometStringExpressionSuite extends CometTestBase { } test("Upper and Lower") { - withSQLConf(CometConf.COMET_CASE_CONVERSION_ENABLED.key -> "true") { + withSQLConf( + CometConf.getExprAllowIncompatConfigKey("Upper") -> "true", + CometConf.getExprAllowIncompatConfigKey("Lower") -> "true") { val table = "names" withTable(table) { sql(s"create table $table(id int, name varchar(20)) using parquet") @@ -339,7 +341,7 @@ class CometStringExpressionSuite extends CometTestBase { } test("trim") { - withSQLConf(CometConf.COMET_CASE_CONVERSION_ENABLED.key -> "true") { + withSQLConf(CometConf.getExprAllowIncompatConfigKey("Upper") -> "true") { val table = "test" withTable(table) { sql(s"create table $table(col varchar(20)) using parquet") diff --git 
a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometStringExpressionBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometStringExpressionBenchmark.scala index d7be505161..653ca6fd0b 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometStringExpressionBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometStringExpressionBenchmark.scala @@ -86,7 +86,10 @@ object CometStringExpressionBenchmark extends CometBenchmarkBase { dir, spark.sql(s"SELECT REPEAT(CAST(value AS STRING), 10) AS c1 FROM $tbl")) - val extraConfigs = Map(CometConf.COMET_CASE_CONVERSION_ENABLED.key -> "true") + val extraConfigs = Map( + CometConf.getExprAllowIncompatConfigKey("Upper") -> "true", + CometConf.getExprAllowIncompatConfigKey("Lower") -> "true", + CometConf.getExprAllowIncompatConfigKey("InitCap") -> "true") stringExpressions.foreach { config => val allConfigs = extraConfigs ++ config.extraCometConfigs