Commit 5c4f344

feat: Better selectTyped behavior
* updated col().`as`() behavior and added single selectTyped() variant
* reverted col().`as`<>() behavior for simplicity. Updated selectTyped to accept any TypedColumn
* updated readme
* removed unnecessary imports
1 parent b7cf42d commit 5c4f344
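
In short: `selectTyped` now accepts any `TypedColumn<out Any, U>`, so columns built with `col(KProperty1)` and columns built with `col("name").`as`<T>()` can be passed to the same call, and a new single-column overload returns a `Dataset<U1>` directly. Below is a minimal sketch of the resulting usage, not taken verbatim from the repository: the `SomeClass` fixture mirrors the tests in this commit, and `withSpark`/`dsOf` are the library's session helpers; variable names are illustrative only.

```kotlin
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions.*
import org.jetbrains.kotlinx.spark.api.*

data class SomeClass(val a: IntArray, val b: Int)

fun main() = withSpark {
    val dataset: Dataset<SomeClass> = dsOf(
        SomeClass(intArrayOf(1, 2, 3), 4),
        SomeClass(intArrayOf(1, 2, 4), 5),
    )

    // New in this commit: a single-column overload that yields Dataset<U1> directly
    val bs: Dataset<Int> = dataset.selectTyped(col(SomeClass::b))

    // TypedColumn<SomeClass, Int> (from a KProperty) and TypedColumn<Any, Int>
    // (from `as`<Int>()) can now be mixed in one selectTyped call
    val pairs: Dataset<Pair<Int, Int>> = dataset.selectTyped(
        col(SomeClass::b),
        col("b").`as`<Int>(),
    )

    pairs.show()
}
```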

File tree

5 files changed: +131 -53 lines changed
  • kotlin-spark-api
    • 2.4/src
      • main/kotlin/org/jetbrains/kotlinx/spark/api
      • test/kotlin/org/jetbrains/kotlinx/spark/api
    • 3.0/src
      • main/kotlin/org/jetbrains/kotlinx/spark/api
      • test/kotlin/org/jetbrains/kotlinx/spark/api


README.md

Lines changed: 3 additions & 0 deletions
@@ -192,6 +192,9 @@ to create `TypedColumn`s and with those a new Dataset from pieces of another usi
 ```kotlin
 val dataset: Dataset<YourClass> = ...
 val newDataset: Dataset<Pair<TypeA, TypeB>> = dataset.selectTyped(col(YourClass::colA), col(YourClass::colB))
+
+// Alternatively, for instance when working with a Dataset<Row>
+val typedDataset: Dataset<Pair<String, Int>> = otherDataset.selectTyped(col("a").`as`<String>(), col("b").`as`<Int>())
 ```
 
 ### Overload resolution ambiguity

kotlin-spark-api/2.4/src/main/kotlin/org/jetbrains/kotlinx/spark/api/ApiV1.kt

Lines changed: 56 additions & 20 deletions
@@ -647,12 +647,19 @@ operator fun Column.get(key: Any): Column = getItem(key)
 fun lit(a: Any) = functions.lit(a)
 
 /**
- * Provides a type hint about the expected return value of this column. This information can
+ * Provides a type hint about the expected return value of this column. This information can
  * be used by operations such as `select` on a [Dataset] to automatically convert the
  * results into the correct JVM types.
+ *
+ * ```
+ * val df: Dataset<Row> = ...
+ * val typedColumn: Dataset<Int> = df.selectTyped( col("a").`as`<Int>() )
+ * ```
  */
+@Suppress("UNCHECKED_CAST")
 inline fun <reified T> Column.`as`(): TypedColumn<Any, T> = `as`(encoder<T>())
 
+
 /**
  * Alias for [Dataset.joinWith] which passes "left" argument
  * and respects the fact that in result of left join right relation is nullable

@@ -809,45 +816,74 @@ fun <T> Dataset<T>.showDS(numRows: Int = 20, truncate: Boolean = true) = apply {
 /**
  * Returns a new Dataset by computing the given [Column] expressions for each element.
  */
+@Suppress("UNCHECKED_CAST")
+inline fun <reified T, reified U1> Dataset<T>.selectTyped(
+    c1: TypedColumn<out Any, U1>,
+): Dataset<U1> = select(c1 as TypedColumn<T, U1>)
+
+/**
+ * Returns a new Dataset by computing the given [Column] expressions for each element.
+ */
+@Suppress("UNCHECKED_CAST")
 inline fun <reified T, reified U1, reified U2> Dataset<T>.selectTyped(
-    c1: TypedColumn<T, U1>,
-    c2: TypedColumn<T, U2>,
+    c1: TypedColumn<out Any, U1>,
+    c2: TypedColumn<out Any, U2>,
 ): Dataset<Pair<U1, U2>> =
-    select(c1, c2).map { Pair(it._1(), it._2()) }
+    select(
+        c1 as TypedColumn<T, U1>,
+        c2 as TypedColumn<T, U2>,
+    ).map { Pair(it._1(), it._2()) }
 
 /**
  * Returns a new Dataset by computing the given [Column] expressions for each element.
  */
+@Suppress("UNCHECKED_CAST")
 inline fun <reified T, reified U1, reified U2, reified U3> Dataset<T>.selectTyped(
-    c1: TypedColumn<T, U1>,
-    c2: TypedColumn<T, U2>,
-    c3: TypedColumn<T, U3>,
+    c1: TypedColumn<out Any, U1>,
+    c2: TypedColumn<out Any, U2>,
+    c3: TypedColumn<out Any, U3>,
 ): Dataset<Triple<U1, U2, U3>> =
-    select(c1, c2, c3).map { Triple(it._1(), it._2(), it._3()) }
+    select(
+        c1 as TypedColumn<T, U1>,
+        c2 as TypedColumn<T, U2>,
+        c3 as TypedColumn<T, U3>,
+    ).map { Triple(it._1(), it._2(), it._3()) }
 
 /**
  * Returns a new Dataset by computing the given [Column] expressions for each element.
  */
+@Suppress("UNCHECKED_CAST")
 inline fun <reified T, reified U1, reified U2, reified U3, reified U4> Dataset<T>.selectTyped(
-    c1: TypedColumn<T, U1>,
-    c2: TypedColumn<T, U2>,
-    c3: TypedColumn<T, U3>,
-    c4: TypedColumn<T, U4>,
+    c1: TypedColumn<out Any, U1>,
+    c2: TypedColumn<out Any, U2>,
+    c3: TypedColumn<out Any, U3>,
+    c4: TypedColumn<out Any, U4>,
 ): Dataset<Arity4<U1, U2, U3, U4>> =
-    select(c1, c2, c3, c4).map { Arity4(it._1(), it._2(), it._3(), it._4()) }
+    select(
+        c1 as TypedColumn<T, U1>,
+        c2 as TypedColumn<T, U2>,
+        c3 as TypedColumn<T, U3>,
+        c4 as TypedColumn<T, U4>,
+    ).map { Arity4(it._1(), it._2(), it._3(), it._4()) }
 
 /**
  * Returns a new Dataset by computing the given [Column] expressions for each element.
  */
+@Suppress("UNCHECKED_CAST")
 inline fun <reified T, reified U1, reified U2, reified U3, reified U4, reified U5> Dataset<T>.selectTyped(
-    c1: TypedColumn<T, U1>,
-    c2: TypedColumn<T, U2>,
-    c3: TypedColumn<T, U3>,
-    c4: TypedColumn<T, U4>,
-    c5: TypedColumn<T, U5>,
+    c1: TypedColumn<out Any, U1>,
+    c2: TypedColumn<out Any, U2>,
+    c3: TypedColumn<out Any, U3>,
+    c4: TypedColumn<out Any, U4>,
+    c5: TypedColumn<out Any, U5>,
 ): Dataset<Arity5<U1, U2, U3, U4, U5>> =
-    select(c1, c2, c3, c4, c5).map { Arity5(it._1(), it._2(), it._3(), it._4(), it._5()) }
-
+    select(
+        c1 as TypedColumn<T, U1>,
+        c2 as TypedColumn<T, U2>,
+        c3 as TypedColumn<T, U3>,
+        c4 as TypedColumn<T, U4>,
+        c5 as TypedColumn<T, U5>,
+    ).map { Arity5(it._1(), it._2(), it._3(), it._4(), it._5()) }
 
 @OptIn(ExperimentalStdlibApi::class)
 inline fun <reified T> schema(map: Map<String, KType> = mapOf()) = schema(typeOf<T>(), map)
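
For context on the casts introduced above (the same pattern appears in the 3.0 module below): `col("name").`as`<U>()` produces a `TypedColumn<Any, U>`, while `Dataset<T>.select` expects a `TypedColumn<T, U>`, so the relaxed `TypedColumn<out Any, U>` parameters are narrowed back with an unchecked cast before delegating to `select`. A small sketch of the two column shapes a caller can now pass; variable names are illustrative and the `SomeClass` fixture is assumed to match the tests:

```kotlin
import org.apache.spark.sql.TypedColumn
import org.apache.spark.sql.functions.*
import org.jetbrains.kotlinx.spark.api.*

data class SomeClass(val a: IntArray, val b: Int)

// Both column shapes now satisfy selectTyped's TypedColumn<out Any, Int> parameters
// when called on a Dataset<SomeClass>:
val byProperty: TypedColumn<SomeClass, Int> = col(SomeClass::b) // statically tied to SomeClass
val byName: TypedColumn<Any, Int> = col("b").`as`<Int>()        // tied to Any, hence the cast inside selectTyped
```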

kotlin-spark-api/2.4/src/test/kotlin/org/jetbrains/kotlinx/spark/api/ApiTest.kt

Lines changed: 8 additions & 6 deletions
@@ -22,7 +22,6 @@ import ch.tutteli.atrium.api.verbs.expect
 import io.kotest.core.spec.style.ShouldSpec
 import io.kotest.matchers.shouldBe
 import org.apache.spark.sql.Dataset
-import org.apache.spark.sql.TypedColumn
 import org.apache.spark.sql.functions.*
 import org.apache.spark.sql.streaming.GroupState
 import org.apache.spark.sql.streaming.GroupStateTimeout

@@ -339,31 +338,34 @@ class ApiTest : ShouldSpec({
     SomeClass(intArrayOf(1, 2, 4), 5),
 )
 
-val typedColumnA: TypedColumn<Any, IntArray> = dataset.col("a").`as`(encoder())
+val newDS1WithAs: Dataset<Int> = dataset.selectTyped(
+    col("b").`as`<Int>(),
+)
+newDS1WithAs.show()
 
-val newDS2 = dataset.selectTyped(
+val newDS2: Dataset<Pair<Int, Int>> = dataset.selectTyped(
     // col(SomeClass::a), NOTE that this doesn't work on 2.4, returnting a data class with an array in it
     col(SomeClass::b),
     col(SomeClass::b),
 )
 newDS2.show()
 
-val newDS3 = dataset.selectTyped(
+val newDS3: Dataset<Triple<Int, Int, Int>> = dataset.selectTyped(
     col(SomeClass::b),
     col(SomeClass::b),
     col(SomeClass::b),
 )
 newDS3.show()
 
-val newDS4 = dataset.selectTyped(
+val newDS4: Dataset<Arity4<Int, Int, Int, Int>> = dataset.selectTyped(
     col(SomeClass::b),
     col(SomeClass::b),
     col(SomeClass::b),
     col(SomeClass::b),
 )
 newDS4.show()
 
-val newDS5 = dataset.selectTyped(
+val newDS5: Dataset<Arity5<Int, Int, Int, Int, Int>> = dataset.selectTyped(
     col(SomeClass::b),
     col(SomeClass::b),
     col(SomeClass::b),
kotlin-spark-api/3.0/src/main/kotlin/org/jetbrains/kotlinx/spark/api/ApiV1.kt

Lines changed: 56 additions & 21 deletions
@@ -651,10 +651,16 @@ operator fun Column.get(key: Any): Column = getItem(key)
 fun lit(a: Any) = functions.lit(a)
 
 /**
- * Provides a type hint about the expected return value of this column. This information can
+ * Provides a type hint about the expected return value of this column. This information can
  * be used by operations such as `select` on a [Dataset] to automatically convert the
  * results into the correct JVM types.
+ *
+ * ```
+ * val df: Dataset<Row> = ...
+ * val typedColumn: Dataset<Int> = df.selectTyped( col("a").`as`<Int>() )
+ * ```
  */
+@Suppress("UNCHECKED_CAST")
 inline fun <reified T> Column.`as`(): TypedColumn<Any, T> = `as`(encoder<T>())
 
 /**

@@ -776,9 +782,8 @@ inline fun <reified T, reified U> Dataset<T>.col(column: KProperty1<T, U>): Type
  * Returns a [Column] based on the given class attribute, not connected to a dataset.
  * ```kotlin
  * val dataset: Dataset<YourClass> = ...
- * val new: Dataset<Tuple2<TypeOfA, TypeOfB>> = dataset.select( col(YourClass::a), col(YourClass::b) )
+ * val new: Dataset<Pair<TypeOfA, TypeOfB>> = dataset.select( col(YourClass::a), col(YourClass::b) )
  * ```
- * TODO: change example to [Pair]s when merged
  */
 @Suppress("UNCHECKED_CAST")
 inline fun <reified T, reified U> col(column: KProperty1<T, U>): TypedColumn<T, U> =

@@ -813,44 +818,74 @@ fun <T> Dataset<T>.showDS(numRows: Int = 20, truncate: Boolean = true) = apply {
 /**
  * Returns a new Dataset by computing the given [Column] expressions for each element.
  */
+@Suppress("UNCHECKED_CAST")
+inline fun <reified T, reified U1> Dataset<T>.selectTyped(
+    c1: TypedColumn<out Any, U1>,
+): Dataset<U1> = select(c1 as TypedColumn<T, U1>)
+
+/**
+ * Returns a new Dataset by computing the given [Column] expressions for each element.
+ */
+@Suppress("UNCHECKED_CAST")
 inline fun <reified T, reified U1, reified U2> Dataset<T>.selectTyped(
-    c1: TypedColumn<T, U1>,
-    c2: TypedColumn<T, U2>,
+    c1: TypedColumn<out Any, U1>,
+    c2: TypedColumn<out Any, U2>,
 ): Dataset<Pair<U1, U2>> =
-    select(c1, c2).map { Pair(it._1(), it._2()) }
+    select(
+        c1 as TypedColumn<T, U1>,
+        c2 as TypedColumn<T, U2>,
+    ).map { Pair(it._1(), it._2()) }
 
 /**
  * Returns a new Dataset by computing the given [Column] expressions for each element.
  */
+@Suppress("UNCHECKED_CAST")
 inline fun <reified T, reified U1, reified U2, reified U3> Dataset<T>.selectTyped(
-    c1: TypedColumn<T, U1>,
-    c2: TypedColumn<T, U2>,
-    c3: TypedColumn<T, U3>,
+    c1: TypedColumn<out Any, U1>,
+    c2: TypedColumn<out Any, U2>,
+    c3: TypedColumn<out Any, U3>,
 ): Dataset<Triple<U1, U2, U3>> =
-    select(c1, c2, c3).map { Triple(it._1(), it._2(), it._3()) }
+    select(
+        c1 as TypedColumn<T, U1>,
+        c2 as TypedColumn<T, U2>,
+        c3 as TypedColumn<T, U3>,
+    ).map { Triple(it._1(), it._2(), it._3()) }
 
 /**
  * Returns a new Dataset by computing the given [Column] expressions for each element.
  */
+@Suppress("UNCHECKED_CAST")
 inline fun <reified T, reified U1, reified U2, reified U3, reified U4> Dataset<T>.selectTyped(
-    c1: TypedColumn<T, U1>,
-    c2: TypedColumn<T, U2>,
-    c3: TypedColumn<T, U3>,
-    c4: TypedColumn<T, U4>,
+    c1: TypedColumn<out Any, U1>,
+    c2: TypedColumn<out Any, U2>,
+    c3: TypedColumn<out Any, U3>,
+    c4: TypedColumn<out Any, U4>,
 ): Dataset<Arity4<U1, U2, U3, U4>> =
-    select(c1, c2, c3, c4).map { Arity4(it._1(), it._2(), it._3(), it._4()) }
+    select(
+        c1 as TypedColumn<T, U1>,
+        c2 as TypedColumn<T, U2>,
+        c3 as TypedColumn<T, U3>,
+        c4 as TypedColumn<T, U4>,
+    ).map { Arity4(it._1(), it._2(), it._3(), it._4()) }
 
 /**
  * Returns a new Dataset by computing the given [Column] expressions for each element.
  */
+@Suppress("UNCHECKED_CAST")
 inline fun <reified T, reified U1, reified U2, reified U3, reified U4, reified U5> Dataset<T>.selectTyped(
-    c1: TypedColumn<T, U1>,
-    c2: TypedColumn<T, U2>,
-    c3: TypedColumn<T, U3>,
-    c4: TypedColumn<T, U4>,
-    c5: TypedColumn<T, U5>,
+    c1: TypedColumn<out Any, U1>,
+    c2: TypedColumn<out Any, U2>,
+    c3: TypedColumn<out Any, U3>,
+    c4: TypedColumn<out Any, U4>,
+    c5: TypedColumn<out Any, U5>,
 ): Dataset<Arity5<U1, U2, U3, U4, U5>> =
-    select(c1, c2, c3, c4, c5).map { Arity5(it._1(), it._2(), it._3(), it._4(), it._5()) }
+    select(
+        c1 as TypedColumn<T, U1>,
+        c2 as TypedColumn<T, U2>,
+        c3 as TypedColumn<T, U3>,
+        c4 as TypedColumn<T, U4>,
+        c5 as TypedColumn<T, U5>,
+    ).map { Arity5(it._1(), it._2(), it._3(), it._4(), it._5()) }
 
 
 @OptIn(ExperimentalStdlibApi::class)

kotlin-spark-api/3.0/src/test/kotlin/org/jetbrains/kotlinx/spark/api/ApiTest.kt

Lines changed: 8 additions & 6 deletions
@@ -22,7 +22,6 @@ import ch.tutteli.atrium.api.verbs.expect
 import io.kotest.core.spec.style.ShouldSpec
 import io.kotest.matchers.shouldBe
 import org.apache.spark.sql.Dataset
-import org.apache.spark.sql.TypedColumn
 import org.apache.spark.sql.functions.*
 import org.apache.spark.sql.streaming.GroupState
 import org.apache.spark.sql.streaming.GroupStateTimeout

@@ -364,30 +363,33 @@ class ApiTest : ShouldSpec({
     SomeClass(intArrayOf(1, 2, 4), 5),
 )
 
-val typedColumnA: TypedColumn<Any, IntArray> = dataset.col("a").`as`(encoder())
+val newDS1WithAs: Dataset<IntArray> = dataset.selectTyped(
+    col("a").`as`<IntArray>(),
+)
+newDS1WithAs.show()
 
-val newDS2 = dataset.selectTyped(
+val newDS2: Dataset<Pair<IntArray, Int>> = dataset.selectTyped(
     col(SomeClass::a), // NOTE: this only works on 3.0, returning a data class with an array in it
     col(SomeClass::b),
 )
 newDS2.show()
 
-val newDS3 = dataset.selectTyped(
+val newDS3: Dataset<Triple<IntArray, Int, Int>> = dataset.selectTyped(
     col(SomeClass::a),
     col(SomeClass::b),
     col(SomeClass::b),
 )
 newDS3.show()
 
-val newDS4 = dataset.selectTyped(
+val newDS4: Dataset<Arity4<IntArray, Int, Int, Int>> = dataset.selectTyped(
     col(SomeClass::a),
     col(SomeClass::b),
     col(SomeClass::b),
     col(SomeClass::b),
 )
 newDS4.show()
 
-val newDS5 = dataset.selectTyped(
+val newDS5: Dataset<Arity5<IntArray, Int, Int, Int, Int>> = dataset.selectTyped(
     col(SomeClass::a),
     col(SomeClass::b),
     col(SomeClass::b),