Skip to content

Commit 094d824

Browse files
committed
Fix
1 parent 8fa388c commit 094d824

3 files changed

Lines changed: 185 additions & 9 deletions

File tree

auron-spark-tests/spark33/src/test/scala/org/apache/spark/sql/AuronInstrSuite.scala

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,17 @@ class AuronInstrSuite extends QueryTest with SparkQueryTestsBase {
2929
)
3030

3131
val df = spark.createDataFrame(data).toDF("str", "substr")
32-
val result = df.selectExpr("instr(str, substr)").collect().map(_.getInt(0))
33-
34-
assert(result(0) == 7, "instr('hello world', 'world') should return 7")
35-
assert(result(1) == 1, "instr('hello world', 'hello') should return 1")
36-
assert(result(2) == 5, "instr('hello world', 'o') should return 5")
37-
assert(result(3) == 0, "instr('hello world', 'z') should return 0")
38-
assert(result(4) == 0, "instr(null, 'test') should return null")
39-
assert(result(5) == 0, "instr('test', null) should return null")
32+
val rows = df.selectExpr("instr(str, substr)").collect()
33+
34+
// Check non-null results
35+
assert(rows(0).getInt(0) == 7, "instr('hello world', 'world') should return 7")
36+
assert(rows(1).getInt(0) == 1, "instr('hello world', 'hello') should return 1")
37+
assert(rows(2).getInt(0) == 5, "instr('hello world', 'o') should return 5")
38+
assert(rows(3).getInt(0) == 0, "instr('hello world', 'z') should return 0")
39+
40+
// Check null results
41+
assert(rows(4).isNullAt(0), "instr(null, 'test') should return null")
42+
assert(rows(5).isNullAt(0), "instr('test', null) should return null")
4043
}
4144

4245
test("test instr function - multiple occurrences") {

native-engine/datafusion-ext-functions/src/spark_instr.rs

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ pub fn spark_instr(args: &[ColumnarValue]) -> Result<ColumnarValue> {
7070
if substr.is_empty() {
7171
Some(0)
7272
} else {
73-
Some(s.find(substr).map(|pos| (pos + 1) as i32).unwrap_or(0))
73+
Some(find_char_position(s, substr))
7474
}
7575
}
7676
}),
@@ -88,6 +88,33 @@ pub fn spark_instr(args: &[ColumnarValue]) -> Result<ColumnarValue> {
8888
}
8989
}
9090

91+
/// Return the 1-based *character* position of the first occurrence of
/// `substr` in `s`, or 0 when `substr` does not occur.
///
/// Spark's `instr` counts characters, not bytes, so the byte offset
/// returned by `str::find` must be converted: every char in the prefix
/// before the match counts as one position, regardless of its UTF-8
/// byte width.
///
/// An empty `substr` yields 0 (the caller also short-circuits this case).
fn find_char_position(s: &str, substr: &str) -> i32 {
    if substr.is_empty() {
        return 0;
    }
    match s.find(substr) {
        // `find` always returns an offset lying on a char boundary, so
        // slicing the prefix cannot panic; counting its chars gives the
        // 0-based character index of the match, hence `+ 1` for 1-based.
        Some(byte_pos) => s[..byte_pos].chars().count() as i32 + 1,
        None => 0,
    }
}
117+
91118
#[cfg(test)]
92119
mod test {
93120
use std::sync::Arc;
@@ -211,4 +238,39 @@ mod test {
211238
);
212239
Ok(())
213240
}
241+
242+
#[test]
fn test_spark_instr_utf8() -> Result<()> {
    // Positions are counted in characters, not bytes:
    // "你好世界".find("世界") is byte offset 6, yet instr must report 3.
    let haystacks = ColumnarValue::Array(Arc::new(StringArray::from_iter(vec![
        Some("你好世界".to_string()),
        Some("hello世界".to_string()),
        Some("test".to_string()),
    ])));
    let needle = ColumnarValue::Scalar(ScalarValue::from("世界"));
    let result = spark_instr(&vec![haystacks, needle])?.into_array(3)?;
    assert_eq!(
        as_int32_array(&result)?.into_iter().collect::<Vec<_>>(),
        vec![Some(3), Some(6), Some(0)]
    );

    // A 4-byte emoji still counts as exactly one character position.
    let haystacks = ColumnarValue::Array(Arc::new(StringArray::from_iter(vec![Some(
        "hello😀world".to_string(),
    )])));
    let needle = ColumnarValue::Scalar(ScalarValue::from("😀"));
    let result = spark_instr(&vec![haystacks, needle])?.into_array(1)?;
    assert_eq!(
        as_int32_array(&result)?.into_iter().collect::<Vec<_>>(),
        vec![Some(6)]
    );

    Ok(())
}
214276
}

spark-extension-shims-spark/src/test/scala/org/apache/auron/AuronFunctionSuite.scala

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -868,6 +868,117 @@ class AuronFunctionSuite extends AuronQueryTest with BaseAuronSQLSuite {
868868
|""".stripMargin
869869
checkSparkAnswerAndOperator(query)
870870
}
871+
872+
test("instr function - basic functionality") {
  withTable("t1") {
    val createTable = """
CREATE TABLE t1(str STRING, substr STRING) USING parquet
"""
    val insertRows = """
INSERT INTO t1 VALUES
('hello world', 'world'),
('hello world', 'hello'),
('hello world', 'o'),
('hello world', 'z'),
(null, 'test'),
('test', null)
"""
    sql(createTable)
    sql(insertRows)

    // instr results (including null propagation for null operands) must
    // match vanilla Spark and run on the native operator.
    checkSparkAnswerAndOperator("SELECT instr(str, substr) FROM t1")
  }
}
891+
892+
test("instr function - empty substring") {
  withTable("t1") {
    val query = "SELECT instr(str, '') FROM t1"
    sql("CREATE TABLE t1(str STRING) USING parquet")
    sql("INSERT INTO t1 VALUES ('hello'), ('world'), ('')")
    // An empty search string is expected to yield 0; the native result
    // must agree with vanilla Spark's answer.
    checkSparkAnswerAndOperator(query)
  }
}
901+
902+
test("instr function - UTF-8 multi-byte characters") {
  withTable("t1") {
    sql("CREATE TABLE t1(str STRING, substr STRING) USING parquet")
    val insertRows = """
INSERT INTO t1 VALUES
('你好世界', '世界'),
('hello世界', '世界'),
('test', '世界'),
('hello😀world', '😀'),
('test😀', '😀')
"""
    sql(insertRows)

    // Positions are counted in characters, not bytes, so multi-byte
    // UTF-8 sequences (CJK, emoji) must each count as a single position.
    checkSparkAnswerAndOperator("SELECT instr(str, substr) FROM t1")
  }
}
918+
919+
test("instr function - with expressions") {
  withTable("t1") {
    sql("CREATE TABLE t1(str STRING, substr STRING) USING parquet")
    sql("INSERT INTO t1 VALUES ('banana', 'a'), ('testtesttest', 'test'), ('abcabcabc', 'abc')")

    // Both operands are STRING columns and each needle occurs several
    // times; instr must report only the first occurrence per row.
    checkSparkAnswerAndOperator("SELECT instr(str, substr) FROM t1")
  }
}
928+
929+
test("instr function - case sensitivity") {
  withTable("t1") {
    sql("CREATE TABLE t1(str STRING, substr STRING) USING parquet")
    val insertRows = """
INSERT INTO t1 VALUES
('Hello', 'hello'),
('HELLO', 'hello'),
('Hello', 'Hello'),
('hElLo', 'hello')
"""
    sql(insertRows)

    // instr performs a case-sensitive match; only the exact-case row hits.
    checkSparkAnswerAndOperator("SELECT instr(str, substr) FROM t1")
  }
}
944+
945+
test("instr function - in filter clause") {
  withTable("t1") {
    sql("CREATE TABLE t1(str STRING, substr STRING) USING parquet")
    val insertRows = """
INSERT INTO t1 VALUES
('hello world', 'world'),
('hello', 'world'),
('testing', 'test'),
('abc', 'def')
"""
    sql(insertRows)

    // instr used as a predicate (WHERE) rather than a projection.
    checkSparkAnswerAndOperator("""
SELECT str FROM t1 WHERE instr(str, substr) > 0
""")
  }
}
962+
963+
test("instr function - with grouping") {
  withTable("t1") {
    sql("CREATE TABLE t1(str STRING, substr STRING) USING parquet")
    val insertRows = """
INSERT INTO t1 VALUES
('test1', 'test'),
('test2', 'test'),
('hello', 'world'),
('testing', 'test')
"""
    sql(insertRows)

    // instr feeding both the WHERE predicate and a grouped aggregation.
    checkSparkAnswerAndOperator("""
SELECT substr, COUNT(*) as cnt
FROM t1
WHERE instr(str, substr) > 0
GROUP BY substr
ORDER BY substr
""")
  }
}
873984
}

0 commit comments

Comments
 (0)