From 66fccbe9d82716ad8dd4089e38021cf3e7cc63d1 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 10 Jun 2026 09:28:01 -0600
Subject: [PATCH 1/2] feat: route structured-text functions through codegen
 dispatcher

Register the CSV / JSON / XPath / XML structured-text functions that previously
fell back to Spark so they stay native via the codegen dispatcher. None have a
native (rust) implementation; they extend Spark's CodegenFallback.

- from_csv, schema_of_csv, schema_of_json, json_object_keys, xpath/xpath_*
- from_xml, to_xml, schema_of_xml (Spark 4.0+ only)

On Spark 3.4/3.5 these are plain expressions, registered directly in the serde
maps. On Spark 4.x they are RuntimeReplaceable and the optimizer rewrites them
to Invoke(evaluator)/StaticInvoke before Comet sees the plan, so they are
dispatched from CometExprShim4x.convertStructuredText, which matches the backing
evaluators by simple name to stay robust across 4.0/4.1/4.2. When the dispatcher
is disabled they fall back to Spark.

Adds CometStructuredTextSuite (XML tests gated to Spark 4.0+). Verified on the
spark-3.4, 3.5, 4.0, 4.1, and 4.2 profiles.
---
 .github/workflows/pr_build_linux.yml          |   1 +
 .github/workflows/pr_build_macos.yml          |   1 +
 docs/source/user-guide/latest/expressions.md  |  35 +++-
 .../apache/comet/serde/QueryPlanSerde.scala   |  21 ++-
 .../scala/org/apache/comet/serde/csv.scala    |  26 +++
 .../scala/org/apache/comet/serde/json.scala   |   6 +-
 .../scala/org/apache/comet/serde/xpath.scala  |  38 ++++
 .../apache/comet/shims/CometExprShim.scala    |   7 +
 .../apache/comet/shims/CometExprShim.scala    |   7 +
 .../apache/comet/shims/CometExprShim.scala    |   7 +
 .../apache/comet/shims/CometExprShim4x.scala  |  38 +++-
 .../comet/CometStructuredTextSuite.scala      | 170 ++++++++++++++++++
 12 files changed, 349 insertions(+), 8 deletions(-)
 create mode 100644 spark/src/main/scala/org/apache/comet/serde/csv.scala
 create mode 100644 spark/src/main/scala/org/apache/comet/serde/xpath.scala
 create mode 100644 spark/src/test/scala/org/apache/comet/CometStructuredTextSuite.scala

diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index 3fbe052aff..10f01dd759 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -386,6 +386,7 @@ jobs:
               org.apache.comet.CometCodegenSuite
               org.apache.comet.CometCodegenSourceSuite
               org.apache.comet.CometCodegenHOFSuite
+              org.apache.comet.CometStructuredTextSuite
               org.apache.comet.CometFuzzMathSuite
               org.apache.comet.CometCodegenFuzzSuite
       fail-fast: false
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index dbdd325848..25c609010b 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -202,6 +202,7 @@ jobs:
               org.apache.comet.CometCodegenSuite
               org.apache.comet.CometCodegenSourceSuite
               org.apache.comet.CometCodegenHOFSuite
+              org.apache.comet.CometStructuredTextSuite
               org.apache.comet.CometFuzzMathSuite
               org.apache.comet.CometCodegenFuzzSuite
 
diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md
index 79093ddb9a..385b81e059 100644
--- a/docs/source/user-guide/latest/expressions.md
+++ b/docs/source/user-guide/latest/expressions.md
@@ -52,11 +52,9 @@ Comet focuses acceleration on mainstream relational, string, datetime, math, and
 expressions. The following function families are **not currently planned** for native acceleration (they are not on the 1.0 roadmap): specialized functionality with narrow real-world analytics use and high implementation cost. They fall back to Spark and may be reconsidered based on demand:
 
 - **Probabilistic sketches and approximate top-k** (`kll_sketch_*`, `hll_*`, `theta_*`, `count_min_sketch`, `bitmap_*`, `approx_top_k*`): specialized data structures with exact-correctness traps.
-- **XML / XPath** (`from_xml`, `to_xml`, `schema_of_xml`, `xpath*`): legacy text format, rare in accelerated workloads.
 - **Geospatial** (`st_*`): brand-new Spark 4.1 functionality, specialized.
 - **Avro / Protobuf codecs** (`from_avro`, `to_avro`, `from_protobuf`, `to_protobuf`, `schema_of_avro`): format conversion belongs at the IO layer, not expression evaluation.
 - **JVM reflection** (`java_method`, `reflect`): niche, and they invoke arbitrary JVM methods (a security concern).
-- **CSV functions** (`from_csv`, `to_csv`, `schema_of_csv`): row-level CSV parsing and formatting in expressions is niche and better handled at the data source layer.
 - **UTF-8 validation** (`is_valid_utf8`, `make_valid_utf8`, `validate_utf8`, `try_validate_utf8`): niche Spark 4.x string-validation helpers.
 - **File metadata** (`input_file_name`, `input_file_block_start`, `input_file_block_length`): require scan-internal per-row file information, outside the expression layer.
 - **Miscellaneous niche** (`histogram_numeric`, `version`, `sentences`, `quote`): low-value or specialized functions with little benefit from native acceleration.
@@ -220,6 +218,16 @@ The type-name conversion functions (`bigint`, `binary`, `boolean`, `date`, `deci
 
 ---
 
+## csv_funcs
+
+| Function | Status | Notes |
+| --- | --- | --- |
+| `from_csv` | ✅ |  |
+| `schema_of_csv` | ✅ |  |
+| `to_csv` | ✅ |  |
+
+---
+
 ## datetime_funcs
 
 | Function | Status | Notes |
@@ -339,9 +347,9 @@ expression-level). The `outer` variants are wired but marked `Incompatible`; the
 | `from_json` | ✅ | Falls back by default; opt-in via allowIncompatible ([audit](../../contributor-guide/expression-audits/json_funcs.md#from_json)) |
 | `get_json_object` | ✅ | Some inputs need allowIncompatible ([audit](../../contributor-guide/expression-audits/json_funcs.md#get_json_object)) |
 | `json_array_length` | ✅ | Single-quoted/trailing JSON needs allowIncompatible ([audit](../../contributor-guide/expression-audits/json_funcs.md#json_array_length)) |
-| `json_object_keys` | 🔜 | [#3161](https://github.com/apache/datafusion-comet/issues/3161) |
+| `json_object_keys` | ✅ |  |
 | `json_tuple` | 🔜 | [#3160](https://github.com/apache/datafusion-comet/issues/3160) |
-| `schema_of_json` | 🔜 | [#3163](https://github.com/apache/datafusion-comet/issues/3163) |
+| `schema_of_json` | ✅ |  |
 | `to_json` | ✅ | Options and map/array inputs fall back ([audit](../../contributor-guide/expression-audits/json_funcs.md#to_json)) |
 
 ---
@@ -639,6 +647,25 @@ fall back to Spark.
 
 ---
 
+## xml_funcs
+
+| Function | Status | Notes |
+| --- | --- | --- |
+| `from_xml` | ✅ | Spark 4.0+ |
+| `schema_of_xml` | ✅ | Spark 4.0+ |
+| `to_xml` | ✅ | Spark 4.0+ |
+| `xpath` | ✅ |  |
+| `xpath_boolean` | ✅ |  |
+| `xpath_double` | ✅ |  |
+| `xpath_float` | ✅ |  |
+| `xpath_int` | ✅ |  |
+| `xpath_long` | ✅ |  |
+| `xpath_number` | ✅ | Alias of `xpath_double` |
+| `xpath_short` | ✅ |  |
+| `xpath_string` | ✅ |  |
+
+---
+
 ## Beyond SQL functions
 
 Comet also accelerates a number of Catalyst expressions that have no Spark SQL function name and therefore do not appear in the tables above. These arise from the DataFrame API, from SQL syntax other than function calls, or from the query optimizer. They include:
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index 2c49114dd8..6a05d413f6 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -28,6 +28,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
 import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke
+import org.apache.spark.sql.catalyst.expressions.xml.{XPathBoolean, XPathDouble, XPathFloat, XPathInt, XPathList, XPathLong, XPathShort, XPathString}
 import org.apache.spark.sql.comet.DecimalPrecision
 import org.apache.spark.sql.execution.{ScalarSubquery, SparkPlan}
 import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils
@@ -273,7 +274,22 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
     classOf[Cast] -> CometCast)
 
   private val jsonExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map(
-    classOf[LengthOfJsonArray] -> CometLengthOfJsonArray)
+    classOf[LengthOfJsonArray] -> CometLengthOfJsonArray,
+    classOf[SchemaOfJson] -> CometSchemaOfJson,
+    classOf[JsonObjectKeys] -> CometJsonObjectKeys)
+
+  private val csvExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] =
+    Map(classOf[CsvToStructs] -> CometCsvToStructs, classOf[SchemaOfCsv] -> CometSchemaOfCsv)
+
+  private val xpathExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map(
+    classOf[XPathBoolean] -> CometXPathBoolean,
+    classOf[XPathShort] -> CometXPathShort,
+    classOf[XPathInt] -> CometXPathInt,
+    classOf[XPathLong] -> CometXPathLong,
+    classOf[XPathFloat] -> CometXPathFloat,
+    classOf[XPathDouble] -> CometXPathDouble,
+    classOf[XPathString] -> CometXPathString,
+    classOf[XPathList] -> CometXPathList)
 
   private[comet] val miscExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map(
     // TODO PromotePrecision
@@ -301,7 +317,8 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
     mathExpressions ++ hashExpressions ++ stringExpressions ++
       conditionalExpressions ++ mapExpressions ++ predicateExpressions ++
       structExpressions ++ bitwiseExpressions ++ miscExpressions ++ arrayExpressions ++
-      temporalExpressions ++ conversionExpressions ++ urlExpressions ++ jsonExpressions
+      temporalExpressions ++ conversionExpressions ++ urlExpressions ++ jsonExpressions ++
+      csvExpressions ++ xpathExpressions
 
   /**
    * Mapping of Spark aggregate expression class to Comet expression handler.
diff --git a/spark/src/main/scala/org/apache/comet/serde/csv.scala b/spark/src/main/scala/org/apache/comet/serde/csv.scala
new file mode 100644
index 0000000000..38ec18e2af
--- /dev/null
+++ b/spark/src/main/scala/org/apache/comet/serde/csv.scala
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.serde
+
+import org.apache.spark.sql.catalyst.expressions.{CsvToStructs, SchemaOfCsv}
+
+object CometCsvToStructs extends CometCodegenDispatch[CsvToStructs]
+
+object CometSchemaOfCsv extends CometCodegenDispatch[SchemaOfCsv]
diff --git a/spark/src/main/scala/org/apache/comet/serde/json.scala b/spark/src/main/scala/org/apache/comet/serde/json.scala
index 0eac414d13..2c210fa26c 100644
--- a/spark/src/main/scala/org/apache/comet/serde/json.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/json.scala
@@ -19,7 +19,7 @@
 
 package org.apache.comet.serde
 
-import org.apache.spark.sql.catalyst.expressions.{Attribute, LengthOfJsonArray}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, JsonObjectKeys, LengthOfJsonArray, SchemaOfJson}
 
 import org.apache.comet.CometConf
 import org.apache.comet.serde.ExprOuterClass.Expr
@@ -46,3 +46,7 @@ object CometLengthOfJsonArray extends CometCodegenDispatch[LengthOfJsonArray] {
       super.convert(expr, inputs, binding)
     }
 }
+
+object CometSchemaOfJson extends CometCodegenDispatch[SchemaOfJson]
+
+object CometJsonObjectKeys extends CometCodegenDispatch[JsonObjectKeys]
diff --git a/spark/src/main/scala/org/apache/comet/serde/xpath.scala b/spark/src/main/scala/org/apache/comet/serde/xpath.scala
new file mode 100644
index 0000000000..5fa4a68f2c
--- /dev/null
+++ b/spark/src/main/scala/org/apache/comet/serde/xpath.scala
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.serde
+
+import org.apache.spark.sql.catalyst.expressions.xml.{XPathBoolean, XPathDouble, XPathFloat, XPathInt, XPathList, XPathLong, XPathShort, XPathString}
+
+object CometXPathBoolean extends CometCodegenDispatch[XPathBoolean]
+
+object CometXPathShort extends CometCodegenDispatch[XPathShort]
+
+object CometXPathInt extends CometCodegenDispatch[XPathInt]
+
+object CometXPathLong extends CometCodegenDispatch[XPathLong]
+
+object CometXPathFloat extends CometCodegenDispatch[XPathFloat]
+
+object CometXPathDouble extends CometCodegenDispatch[XPathDouble]
+
+object CometXPathString extends CometCodegenDispatch[XPathString]
+
+object CometXPathList extends CometCodegenDispatch[XPathList]
diff --git a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala
index 9163d254a8..741b188663 100644
--- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala
+++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala
@@ -59,6 +59,13 @@ trait CometExprShim extends CommonStringExprs with CometExprShim4x {
       inputs: Seq[Attribute],
       binding: Boolean): Option[Expr] = {
     expr match {
+      // RuntimeReplaceable structured-text functions (schema_of_csv/json, json_object_keys,
+      // xpath_*, schema_of_xml) and from_xml/to_xml route through the codegen dispatcher; see
+      // CometExprShim4x.convertStructuredText. Guarded so non-structured-text Invoke/StaticInvoke
+      // nodes still reach their existing handlers below.
+      case e if isStructuredTextDispatch(e) =>
+        convertStructuredText(e, inputs, binding)
+
       case knc: KnownNotContainsNull =>
         // On Spark 4.0, array_compact rewrites to KnownNotContainsNull(ArrayFilter(IsNotNull)).
         // Strip the wrapper and serialize the inner ArrayFilter as spark_array_compact.
diff --git a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala
index d2813b62b8..e0f8584850 100644
--- a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala
+++ b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala
@@ -59,6 +59,13 @@ trait CometExprShim extends CommonStringExprs with CometExprShim4x {
       inputs: Seq[Attribute],
       binding: Boolean): Option[Expr] = {
     expr match {
+      // RuntimeReplaceable structured-text functions (schema_of_csv/json, json_object_keys,
+      // xpath_*, schema_of_xml) and from_xml/to_xml route through the codegen dispatcher; see
+      // CometExprShim4x.convertStructuredText. Guarded so non-structured-text Invoke/StaticInvoke
+      // nodes still reach their existing handlers below.
+      case e if isStructuredTextDispatch(e) =>
+        convertStructuredText(e, inputs, binding)
+
       case knc: KnownNotContainsNull =>
         // On Spark 4.0, array_compact rewrites to KnownNotContainsNull(ArrayFilter(IsNotNull)).
         // Strip the wrapper and serialize the inner ArrayFilter as spark_array_compact.
diff --git a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala
index d2813b62b8..e0f8584850 100644
--- a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala
+++ b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala
@@ -59,6 +59,13 @@ trait CometExprShim extends CommonStringExprs with CometExprShim4x {
       inputs: Seq[Attribute],
       binding: Boolean): Option[Expr] = {
     expr match {
+      // RuntimeReplaceable structured-text functions (schema_of_csv/json, json_object_keys,
+      // xpath_*, schema_of_xml) and from_xml/to_xml route through the codegen dispatcher; see
+      // CometExprShim4x.convertStructuredText. Guarded so non-structured-text Invoke/StaticInvoke
+      // nodes still reach their existing handlers below.
+      case e if isStructuredTextDispatch(e) =>
+        convertStructuredText(e, inputs, binding)
+
       case knc: KnownNotContainsNull =>
         // On Spark 4.0, array_compact rewrites to KnownNotContainsNull(ArrayFilter(IsNotNull)).
         // Strip the wrapper and serialize the inner ArrayFilter as spark_array_compact.
diff --git a/spark/src/main/spark-4.x/org/apache/comet/shims/CometExprShim4x.scala b/spark/src/main/spark-4.x/org/apache/comet/shims/CometExprShim4x.scala
index f2a1e32a6f..349e34bb70 100644
--- a/spark/src/main/spark-4.x/org/apache/comet/shims/CometExprShim4x.scala
+++ b/spark/src/main/spark-4.x/org/apache/comet/shims/CometExprShim4x.scala
@@ -19,8 +19,13 @@
 
 package org.apache.comet.shims
 
-import org.apache.spark.sql.catalyst.expressions.{Attribute, DayName, Expression, MonthName}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, DayName, Expression, Literal, MonthName, StructsToXml, XmlToStructs}
+import org.apache.spark.sql.catalyst.expressions.csv.SchemaOfCsvEvaluator
+import org.apache.spark.sql.catalyst.expressions.json.{JsonExpressionUtils, SchemaOfJsonEvaluator}
+import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke}
+import org.apache.spark.sql.catalyst.expressions.xml.{XmlExpressionEvalUtils, XPathEvaluator}
 
+import org.apache.comet.serde.CometScalaUDF
 import org.apache.comet.serde.ExprOuterClass.Expr
 import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithFallbackReason, scalarFunctionExprToProtoWithReturnType}
 
@@ -56,4 +61,35 @@ trait CometExprShim4x {
       optExprWithFallbackReason(nameExpr, m, m.child)
     case _ => None
   }
+
+  // Spark 4.x lowers the RuntimeReplaceable structured-text functions to an evaluator-backed
+  // `Invoke` (`schema_of_csv`, `schema_of_json`, `xpath_*`) or `StaticInvoke`
+  // (`json_object_keys`, `schema_of_xml`) before Comet sees the plan, so the original expression
+  // class never reaches the serde map. `from_xml` / `to_xml` stay as plain expressions. None of
+  // these have a native (rust) implementation, so they route through the codegen dispatcher.
+  //
+  // [[isStructuredTextDispatch]] is a cheap structural predicate used as the shim match guard so
+  // the dispatch (which binds, runs `canHandle`, and closure-serializes) only happens once, in
+  // [[convertStructuredText]]. The single `XPathEvaluator` test covers all eight `xpath_*`
+  // functions, which lower to its subclasses.
+  protected def isStructuredTextDispatch(expr: Expression): Boolean = expr match {
+    case _: XmlToStructs | _: StructsToXml => true
+    case i: Invoke if i.functionName == "evaluate" =>
+      i.targetObject match {
+        case Literal(value, _) =>
+          value.isInstanceOf[SchemaOfCsvEvaluator] || value.isInstanceOf[SchemaOfJsonEvaluator] ||
+          value.isInstanceOf[XPathEvaluator]
+        case _ => false
+      }
+    case s: StaticInvoke =>
+      (s.staticObject == classOf[JsonExpressionUtils] && s.functionName == "jsonObjectKeys") ||
+      (s.staticObject == XmlExpressionEvalUtils.getClass && s.functionName == "schemaOfXml")
+    case _ => false
+  }
+
+  protected def convertStructuredText(
+      expr: Expression,
+      inputs: Seq[Attribute],
+      binding: Boolean): Option[Expr] =
+    CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding)
 }
diff --git a/spark/src/test/scala/org/apache/comet/CometStructuredTextSuite.scala b/spark/src/test/scala/org/apache/comet/CometStructuredTextSuite.scala
new file mode 100644
index 0000000000..f5b67a1357
--- /dev/null
+++ b/spark/src/test/scala/org/apache/comet/CometStructuredTextSuite.scala
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet
+
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.internal.SQLConf
+
+import org.apache.comet.CometSparkSessionExtensions.isSpark40Plus
+
+/**
+ * Structured-text functions (CSV / JSON / XPath) that have no native (rust) implementation. They
+ * extend Spark's `CodegenFallback`, so they are wired into the codegen dispatcher via
+ * [[org.apache.comet.serde.CometCsvToStructs]] and friends: a top-level projection stays native
+ * (running Spark's own evaluation inside the Comet kernel) and matches Spark exactly. When the
+ * dispatcher is disabled, they have no native path and fall back to Spark.
+ */
+class CometStructuredTextSuite extends CometTestBase with AdaptiveSparkPlanHelper {
+
+  private def withStringTable(rows: String)(thunk: => Unit): Unit = {
+    withTable("t") {
+      sql("CREATE TABLE t (s STRING) USING parquet")
+      sql(s"INSERT INTO t VALUES $rows")
+      thunk
+    }
+  }
+
+  private val csvRows = "('1,abc'), ('2,def'), (''), (null)"
+  private val jsonRows = "('{\"a\":1,\"b\":2}'), ('{\"x\":true}'), ('{}'), (null)"
+  private val xmlRows = "('<a><b>1</b><b>2</b></a>'), ('<a><b>9</b></a>'), ('<a/>'), (null)"
+
+  // Disable constant folding so functions whose arguments are literals (schema_of_*) actually
+  // execute inside Comet rather than being folded away at optimization time.
+  private def withoutConstantFolding(thunk: => Unit): Unit = {
+    withSQLConf(
+      SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
+        "org.apache.spark.sql.catalyst.optimizer.ConstantFolding") {
+      thunk
+    }
+  }
+
+  test("from_csv") {
+    withStringTable(csvRows) {
+      checkSparkAnswerAndOperator("SELECT from_csv(s, 'a INT, b STRING') FROM t")
+    }
+  }
+
+  test("schema_of_csv") {
+    withoutConstantFolding {
+      checkSparkAnswerAndOperator("SELECT schema_of_csv('1,abc')")
+    }
+  }
+
+  test("schema_of_json") {
+    withoutConstantFolding {
+      checkSparkAnswerAndOperator("SELECT schema_of_json('{\"a\":1,\"b\":\"str\"}')")
+    }
+  }
+
+  test("json_object_keys") {
+    withStringTable(jsonRows) {
+      checkSparkAnswerAndOperator("SELECT json_object_keys(s) FROM t")
+    }
+  }
+
+  test("xpath_boolean") {
+    withStringTable(xmlRows) {
+      checkSparkAnswerAndOperator("SELECT xpath_boolean(s, 'a/b') FROM t")
+    }
+  }
+
+  test("xpath_short / xpath_int / xpath_long") {
+    withStringTable(xmlRows) {
+      checkSparkAnswerAndOperator(
+        "SELECT xpath_short(s, 'sum(a/b)'), xpath_int(s, 'sum(a/b)'), " +
+          "xpath_long(s, 'sum(a/b)') FROM t")
+    }
+  }
+
+  test("xpath_float / xpath_double") {
+    withStringTable(xmlRows) {
+      checkSparkAnswerAndOperator(
+        "SELECT xpath_float(s, 'sum(a/b)'), xpath_double(s, 'sum(a/b)') FROM t")
+    }
+  }
+
+  test("xpath_string") {
+    withStringTable(xmlRows) {
+      checkSparkAnswerAndOperator("SELECT xpath_string(s, 'a/b[1]') FROM t")
+    }
+  }
+
+  test("xpath") {
+    withStringTable(xmlRows) {
+      checkSparkAnswerAndOperator("SELECT xpath(s, 'a/b/text()') FROM t")
+    }
+  }
+
+  // XML SQL functions (from_xml / to_xml / schema_of_xml) were added in Spark 4.0, so these run
+  // only on 4.0+.
+
+  test("from_xml") {
+    assume(isSpark40Plus)
+    withStringTable(xmlRows) {
+      checkSparkAnswerAndOperator("SELECT from_xml(s, 'b ARRAY<INT>') FROM t")
+    }
+  }
+
+  test("to_xml") {
+    assume(isSpark40Plus)
+    withStringTable(csvRows) {
+      checkSparkAnswerAndOperator("SELECT to_xml(named_struct('a', 1, 'b', s)) FROM t")
+    }
+  }
+
+  test("schema_of_xml") {
+    assume(isSpark40Plus)
+    withoutConstantFolding {
+      checkSparkAnswerAndOperator("SELECT schema_of_xml('<a><b>1</b></a>')")
+    }
+  }
+
+  test("from_xml falls back to Spark when codegen dispatcher disabled") {
+    assume(isSpark40Plus)
+    withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "false") {
+      withStringTable(xmlRows) {
+        checkSparkAnswerAndFallbackReason(
+          "SELECT from_xml(s, 'b ARRAY<INT>') FROM t",
+          CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key)
+      }
+    }
+  }
+
+  test("from_csv falls back to Spark when codegen dispatcher disabled") {
+    withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "false") {
+      withStringTable(csvRows) {
+        checkSparkAnswerAndFallbackReason(
+          "SELECT from_csv(s, 'a INT, b STRING') FROM t",
+          CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key)
+      }
+    }
+  }
+
+  test("xpath falls back to Spark when codegen dispatcher disabled") {
+    withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "false") {
+      withStringTable(xmlRows) {
+        checkSparkAnswerAndFallbackReason(
+          "SELECT xpath_int(s, 'sum(a/b)') FROM t",
+          CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key)
+      }
+    }
+  }
+}

From 232b51beaacd7f796f46bd39af82a06e2bece2b8 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 10 Jun 2026 17:15:53 -0600
Subject: [PATCH 2/2] test: migrate structured-text Scala tests to Comet SQL
 file tests

Replace CometStructuredTextSuite with CometSqlFileTestSuite fixtures for the
CSV / JSON / XPath / XML structured-text functions:

- csv/csv.sql, csv/csv_fallback.sql
- json/schema_of_json.sql, json/json_object_keys.sql
- xpath/xpath.sql, xpath/xpath_fallback.sql
- xml/xml.sql, xml/xml_fallback.sql (MinSparkVersion 4.0)

The SQL suite disables constant folding, so all-literal schema_of_* queries run
natively without extra wiring. Fallback coverage uses a per-file Config that
disables the codegen dispatcher with expect_fallback. Drop CometStructuredTextSuite
from the CI workflows since CometSqlFileTestSuite already runs there.
---
 .github/workflows/pr_build_linux.yml          |   1 -
 .github/workflows/pr_build_macos.yml          |   1 -
 .../sql-tests/expressions/csv/csv.sql         |  34 ++++
 .../expressions/csv/csv_fallback.sql          |  29 +++
 .../expressions/json/json_object_keys.sql     |  29 +++
 .../expressions/json/schema_of_json.sql       |  23 +++
 .../sql-tests/expressions/xml/xml.sql         |  38 ++++
 .../expressions/xml/xml_fallback.sql          |  33 ++++
 .../sql-tests/expressions/xpath/xpath.sql     |  41 +++++
 .../expressions/xpath/xpath_fallback.sql      |  31 ++++
 .../comet/CometStructuredTextSuite.scala      | 170 ------------------
 11 files changed, 258 insertions(+), 172 deletions(-)
 create mode 100644 spark/src/test/resources/sql-tests/expressions/csv/csv.sql
 create mode 100644 spark/src/test/resources/sql-tests/expressions/csv/csv_fallback.sql
 create mode 100644 spark/src/test/resources/sql-tests/expressions/json/json_object_keys.sql
 create mode 100644 spark/src/test/resources/sql-tests/expressions/json/schema_of_json.sql
 create mode 100644 spark/src/test/resources/sql-tests/expressions/xml/xml.sql
 create mode 100644 spark/src/test/resources/sql-tests/expressions/xml/xml_fallback.sql
 create mode 100644 spark/src/test/resources/sql-tests/expressions/xpath/xpath.sql
 create mode 100644 spark/src/test/resources/sql-tests/expressions/xpath/xpath_fallback.sql
 delete mode 100644 spark/src/test/scala/org/apache/comet/CometStructuredTextSuite.scala

diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index 10f01dd759..3fbe052aff 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -386,7 +386,6 @@ jobs:
               org.apache.comet.CometCodegenSuite
               org.apache.comet.CometCodegenSourceSuite
               org.apache.comet.CometCodegenHOFSuite
-              org.apache.comet.CometStructuredTextSuite
               org.apache.comet.CometFuzzMathSuite
               org.apache.comet.CometCodegenFuzzSuite
       fail-fast: false
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index 25c609010b..dbdd325848 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -202,7 +202,6 @@ jobs:
               org.apache.comet.CometCodegenSuite
               org.apache.comet.CometCodegenSourceSuite
               org.apache.comet.CometCodegenHOFSuite
-              org.apache.comet.CometStructuredTextSuite
               org.apache.comet.CometFuzzMathSuite
               org.apache.comet.CometCodegenFuzzSuite
 
diff --git a/spark/src/test/resources/sql-tests/expressions/csv/csv.sql b/spark/src/test/resources/sql-tests/expressions/csv/csv.sql
new file mode 100644
index 0000000000..c7738948a8
--- /dev/null
+++ b/spark/src/test/resources/sql-tests/expressions/csv/csv.sql
@@ -0,0 +1,34 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- CSV structured-text functions (from_csv, schema_of_csv). These have no native (rust)
+-- implementation; they extend Spark's CodegenFallback and stay native via the codegen
+-- dispatcher.
+
+statement
+CREATE TABLE test_csv(s STRING) USING parquet
+
+statement
+INSERT INTO test_csv VALUES ('1,abc'), ('2,def'), (''), (NULL)
+
+-- column argument
+query
+SELECT from_csv(s, 'a INT, b STRING') FROM test_csv
+
+-- literal argument
+query
+SELECT schema_of_csv('1,abc')
diff --git a/spark/src/test/resources/sql-tests/expressions/csv/csv_fallback.sql b/spark/src/test/resources/sql-tests/expressions/csv/csv_fallback.sql
new file mode 100644
index 0000000000..427c54c876
--- /dev/null
+++ b/spark/src/test/resources/sql-tests/expressions/csv/csv_fallback.sql
@@ -0,0 +1,29 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- With the codegen dispatcher disabled, from_csv has no native path and falls back to Spark.
+
+-- Config: spark.comet.exec.scalaUDF.codegen.enabled=false
+
+statement
+CREATE TABLE test_csv_fallback(s STRING) USING parquet
+
+statement
+INSERT INTO test_csv_fallback VALUES ('1,abc'), ('2,def'), (''), (NULL)
+
+query expect_fallback(spark.comet.exec.scalaUDF.codegen.enabled)
+SELECT from_csv(s, 'a INT, b STRING') FROM test_csv_fallback
diff --git a/spark/src/test/resources/sql-tests/expressions/json/json_object_keys.sql b/spark/src/test/resources/sql-tests/expressions/json/json_object_keys.sql
new file mode 100644
index 0000000000..128218515b
--- /dev/null
+++ b/spark/src/test/resources/sql-tests/expressions/json/json_object_keys.sql
@@ -0,0 +1,29 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- json_object_keys has no native (rust) implementation; it extends Spark's CodegenFallback and
+-- stays native via the codegen dispatcher.
+
+statement
+CREATE TABLE test_json_object_keys(s STRING) USING parquet
+
+statement
+INSERT INTO test_json_object_keys VALUES
+  ('{"a":1,"b":2}'), ('{"x":true}'), ('{}'), (NULL)
+
+query
+SELECT json_object_keys(s) FROM test_json_object_keys
diff --git a/spark/src/test/resources/sql-tests/expressions/json/schema_of_json.sql b/spark/src/test/resources/sql-tests/expressions/json/schema_of_json.sql
new file mode 100644
index 0000000000..84d99901bd
--- /dev/null
+++ b/spark/src/test/resources/sql-tests/expressions/json/schema_of_json.sql
@@ -0,0 +1,23 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- schema_of_json has no native (rust) implementation; it extends Spark's CodegenFallback and
+-- stays native via the codegen dispatcher. Its argument is a literal, which the suite evaluates
+-- natively because it disables constant folding.
+
+query
+SELECT schema_of_json('{"a":1,"b":"str"}')
diff --git a/spark/src/test/resources/sql-tests/expressions/xml/xml.sql b/spark/src/test/resources/sql-tests/expressions/xml/xml.sql
new file mode 100644
index 0000000000..19cee91f81
--- /dev/null
+++ b/spark/src/test/resources/sql-tests/expressions/xml/xml.sql
@@ -0,0 +1,38 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- XML SQL functions (from_xml, to_xml, schema_of_xml) were added in Spark 4.0. They have no
+-- native (rust) implementation; they stay native via the codegen dispatcher.
+
+-- MinSparkVersion: 4.0
+
+statement
+CREATE TABLE test_xml(s STRING) USING parquet
+
+statement
+INSERT INTO test_xml VALUES
+  ('<a><b>1</b><b>2</b></a>'), ('<a><b>9</b></a>'), ('<a/>'), (NULL)
+
+query
+SELECT from_xml(s, 'b ARRAY<INT>') FROM test_xml
+
+query
+SELECT to_xml(named_struct('a', 1, 'b', s)) FROM test_xml
+
+-- literal argument
+query
+SELECT schema_of_xml('<a><b>1</b></a>')
diff --git a/spark/src/test/resources/sql-tests/expressions/xml/xml_fallback.sql b/spark/src/test/resources/sql-tests/expressions/xml/xml_fallback.sql
new file mode 100644
index 0000000000..6f28a910a1
--- /dev/null
+++ b/spark/src/test/resources/sql-tests/expressions/xml/xml_fallback.sql
@@ -0,0 +1,33 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- With the codegen dispatcher disabled, from_xml has no native path and falls back to Spark.
+-- XML SQL functions were added in Spark 4.0.
+
+-- MinSparkVersion: 4.0
+
+-- Config: spark.comet.exec.scalaUDF.codegen.enabled=false
+
+statement
+CREATE TABLE test_xml_fallback(s STRING) USING parquet
+
+statement
+INSERT INTO test_xml_fallback VALUES
+  ('<a><b>1</b><b>2</b></a>'), ('<a><b>9</b></a>'), ('<a/>'), (NULL)
+
+query expect_fallback(spark.comet.exec.scalaUDF.codegen.enabled)
+SELECT from_xml(s, 'b ARRAY<INT>') FROM test_xml_fallback
diff --git a/spark/src/test/resources/sql-tests/expressions/xpath/xpath.sql b/spark/src/test/resources/sql-tests/expressions/xpath/xpath.sql
new file mode 100644
index 0000000000..08ef429f3a
--- /dev/null
+++ b/spark/src/test/resources/sql-tests/expressions/xpath/xpath.sql
@@ -0,0 +1,41 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- The xpath family has no native (rust) implementation; these extend Spark's CodegenFallback and
+-- stay native via the codegen dispatcher.
+
+statement
+CREATE TABLE test_xpath(s STRING) USING parquet
+
+statement
+INSERT INTO test_xpath VALUES
+  ('<a><b>1</b><b>2</b></a>'), ('<a><b>9</b></a>'), ('<a/>'), (NULL)
+
+query
+SELECT xpath_boolean(s, 'a/b') FROM test_xpath
+
+query
+SELECT xpath_short(s, 'sum(a/b)'), xpath_int(s, 'sum(a/b)'), xpath_long(s, 'sum(a/b)') FROM test_xpath
+
+query
+SELECT xpath_float(s, 'sum(a/b)'), xpath_double(s, 'sum(a/b)') FROM test_xpath
+
+query
+SELECT xpath_string(s, 'a/b[1]') FROM test_xpath
+
+query
+SELECT xpath(s, 'a/b/text()') FROM test_xpath
diff --git a/spark/src/test/resources/sql-tests/expressions/xpath/xpath_fallback.sql b/spark/src/test/resources/sql-tests/expressions/xpath/xpath_fallback.sql
new file mode 100644
index 0000000000..e11aac0fab
--- /dev/null
+++ b/spark/src/test/resources/sql-tests/expressions/xpath/xpath_fallback.sql
@@ -0,0 +1,31 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- With the codegen dispatcher disabled, the xpath family has no native path and falls back to
+-- Spark.
+
+-- Config: spark.comet.exec.scalaUDF.codegen.enabled=false
+
+statement
+CREATE TABLE test_xpath_fallback(s STRING) USING parquet
+
+statement
+INSERT INTO test_xpath_fallback VALUES
+  ('<a><b>1</b><b>2</b></a>'), ('<a><b>9</b></a>'), ('<a/>'), (NULL)
+
+query expect_fallback(spark.comet.exec.scalaUDF.codegen.enabled)
+SELECT xpath_int(s, 'sum(a/b)') FROM test_xpath_fallback
diff --git a/spark/src/test/scala/org/apache/comet/CometStructuredTextSuite.scala b/spark/src/test/scala/org/apache/comet/CometStructuredTextSuite.scala
deleted file mode 100644
index f5b67a1357..0000000000
--- a/spark/src/test/scala/org/apache/comet/CometStructuredTextSuite.scala
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.comet
-
-import org.apache.spark.sql.CometTestBase
-import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
-import org.apache.spark.sql.internal.SQLConf
-
-import org.apache.comet.CometSparkSessionExtensions.isSpark40Plus
-
-/**
- * Structured-text functions (CSV / JSON / XPath) that have no native (rust) implementation. They
- * extend Spark's `CodegenFallback`, so they are wired into the codegen dispatcher via
- * [[org.apache.comet.serde.CometCsvToStructs]] and friends: a top-level projection stays native
- * (running Spark's own evaluation inside the Comet kernel) and matches Spark exactly. When the
- * dispatcher is disabled, they have no native path and fall back to Spark.
- */
-class CometStructuredTextSuite extends CometTestBase with AdaptiveSparkPlanHelper {
-
-  private def withStringTable(rows: String)(thunk: => Unit): Unit = {
-    withTable("t") {
-      sql("CREATE TABLE t (s STRING) USING parquet")
-      sql(s"INSERT INTO t VALUES $rows")
-      thunk
-    }
-  }
-
-  private val csvRows = "('1,abc'), ('2,def'), (''), (null)"
-  private val jsonRows = "('{\"a\":1,\"b\":2}'), ('{\"x\":true}'), ('{}'), (null)"
-  private val xmlRows = "('<a><b>1</b><b>2</b></a>'), ('<a><b>9</b></a>'), ('<a/>'), (null)"
-
-  // Disable constant folding so functions whose arguments are literals (schema_of_*) actually
-  // execute inside Comet rather than being folded away at optimization time.
-  private def withoutConstantFolding(thunk: => Unit): Unit = {
-    withSQLConf(
-      SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
-        "org.apache.spark.sql.catalyst.optimizer.ConstantFolding") {
-      thunk
-    }
-  }
-
-  test("from_csv") {
-    withStringTable(csvRows) {
-      checkSparkAnswerAndOperator("SELECT from_csv(s, 'a INT, b STRING') FROM t")
-    }
-  }
-
-  test("schema_of_csv") {
-    withoutConstantFolding {
-      checkSparkAnswerAndOperator("SELECT schema_of_csv('1,abc')")
-    }
-  }
-
-  test("schema_of_json") {
-    withoutConstantFolding {
-      checkSparkAnswerAndOperator("SELECT schema_of_json('{\"a\":1,\"b\":\"str\"}')")
-    }
-  }
-
-  test("json_object_keys") {
-    withStringTable(jsonRows) {
-      checkSparkAnswerAndOperator("SELECT json_object_keys(s) FROM t")
-    }
-  }
-
-  test("xpath_boolean") {
-    withStringTable(xmlRows) {
-      checkSparkAnswerAndOperator("SELECT xpath_boolean(s, 'a/b') FROM t")
-    }
-  }
-
-  test("xpath_short / xpath_int / xpath_long") {
-    withStringTable(xmlRows) {
-      checkSparkAnswerAndOperator(
-        "SELECT xpath_short(s, 'sum(a/b)'), xpath_int(s, 'sum(a/b)'), " +
-          "xpath_long(s, 'sum(a/b)') FROM t")
-    }
-  }
-
-  test("xpath_float / xpath_double") {
-    withStringTable(xmlRows) {
-      checkSparkAnswerAndOperator(
-        "SELECT xpath_float(s, 'sum(a/b)'), xpath_double(s, 'sum(a/b)') FROM t")
-    }
-  }
-
-  test("xpath_string") {
-    withStringTable(xmlRows) {
-      checkSparkAnswerAndOperator("SELECT xpath_string(s, 'a/b[1]') FROM t")
-    }
-  }
-
-  test("xpath") {
-    withStringTable(xmlRows) {
-      checkSparkAnswerAndOperator("SELECT xpath(s, 'a/b/text()') FROM t")
-    }
-  }
-
-  // XML SQL functions (from_xml / to_xml / schema_of_xml) were added in Spark 4.0, so these run
-  // only on 4.0+.
-
-  test("from_xml") {
-    assume(isSpark40Plus)
-    withStringTable(xmlRows) {
-      checkSparkAnswerAndOperator("SELECT from_xml(s, 'b ARRAY<INT>') FROM t")
-    }
-  }
-
-  test("to_xml") {
-    assume(isSpark40Plus)
-    withStringTable(csvRows) {
-      checkSparkAnswerAndOperator("SELECT to_xml(named_struct('a', 1, 'b', s)) FROM t")
-    }
-  }
-
-  test("schema_of_xml") {
-    assume(isSpark40Plus)
-    withoutConstantFolding {
-      checkSparkAnswerAndOperator("SELECT schema_of_xml('<a><b>1</b></a>')")
-    }
-  }
-
-  test("from_xml falls back to Spark when codegen dispatcher disabled") {
-    assume(isSpark40Plus)
-    withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "false") {
-      withStringTable(xmlRows) {
-        checkSparkAnswerAndFallbackReason(
-          "SELECT from_xml(s, 'b ARRAY<INT>') FROM t",
-          CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key)
-      }
-    }
-  }
-
-  test("from_csv falls back to Spark when codegen dispatcher disabled") {
-    withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "false") {
-      withStringTable(csvRows) {
-        checkSparkAnswerAndFallbackReason(
-          "SELECT from_csv(s, 'a INT, b STRING') FROM t",
-          CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key)
-      }
-    }
-  }
-
-  test("xpath falls back to Spark when codegen dispatcher disabled") {
-    withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "false") {
-      withStringTable(xmlRows) {
-        checkSparkAnswerAndFallbackReason(
-          "SELECT xpath_int(s, 'sum(a/b)') FROM t",
-          CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key)
-      }
-    }
-  }
-}