5 changes: 3 additions & 2 deletions python/docs/source/tutorial/sql/python_data_source.rst
@@ -512,6 +512,7 @@ The following example demonstrates how to implement a basic Data Source using Arrow

from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
from pyspark.sql import SparkSession
from pyspark.sql.pandas.types import to_arrow_schema
import pyarrow as pa

# Define the ArrowBatchDataSource
@@ -534,14 +535,14 @@ The following example demonstrates how to implement a basic Data Source using Arrow
class ArrowBatchDataSourceReader(DataSourceReader):
def __init__(self, schema, options):
self.schema: str = schema
self.arrow_schema = to_arrow_schema(self.schema)

Contributor:

I just noticed that this is an example. Does this example really work?

self.schema: str is a str, but to_arrow_schema accepts a Spark StructType as input, not a str.


Contributor:

Moreover, the to_arrow_schema method is not a public API; I think we should not use it in examples.
cc @HyukjinKwon


Author (@casgie, Feb 10, 2026):


Yes, curiously it worked (tested on Databricks DBR 17.3 with Spark 4.0.0).
We could of course change it to:

def schema(self):
    return StructType([
        StructField("key", IntegerType(), True),
        StructField("value", StringType(), True),
    ])

Regarding your second point: yes, fair. But maybe we can find another way to avoid having users specify the schema twice? Maybe allow using a PyArrow schema as a schema definition in DataSource?


Contributor:


> Maybe allow using a PyArrow schema as a schema definition in DataSource?

LGTM


Author:


I'll try to implement that, then change the PR accordingly.

self.options = options

def read(self, partition):
# Create Arrow Record Batch
keys = pa.array([1, 2, 3, 4, 5], type=pa.int32())
values = pa.array(["one", "two", "three", "four", "five"], type=pa.string())
-        schema = pa.schema([("key", pa.int32()), ("value", pa.string())])
-        record_batch = pa.RecordBatch.from_arrays([keys, values], schema=schema)
+        record_batch = pa.RecordBatch.from_arrays([keys, values], schema=self.arrow_schema)

Contributor:


It seems the line schema = pa.schema([("key", pa.int32()), ("value", pa.string())]) is dropped. Is this expected?


Author:


Yes, that is the core idea of this PR: to avoid specifying both the Arrow schema and the PySpark schema.

yield record_batch

def partitions(self):
13 changes: 12 additions & 1 deletion python/pyspark/sql/datasource.py
@@ -37,6 +37,7 @@
from pyspark.errors import PySparkNotImplementedError

if TYPE_CHECKING:
import pyarrow as pa
from pyarrow import RecordBatch
from pyspark.sql.session import SparkSession

@@ -115,7 +116,7 @@ def name(cls) -> str:
"""
return cls.__name__

def schema(self) -> Union[StructType, str]:
def schema(self) -> Union[StructType, str, "pa.Schema"]:
"""
Returns the schema of the data source.

@@ -142,6 +143,16 @@ def schema(self) -> Union[StructType, str]:

>>> def schema(self):
... return StructType().add("a", "int").add("b", "string")

Returns a PyArrow schema:

>>> def schema(self):
...     return pa.schema([
...         pa.field("a", pa.int64()),
...         pa.field("b", pa.string()),
...     ])
"""
raise PySparkNotImplementedError(
errorClass="NOT_IMPLEMENTED",
9 changes: 8 additions & 1 deletion python/pyspark/sql/worker/create_data_source.py
@@ -27,7 +27,7 @@
write_with_length,
)
from pyspark.sql.datasource import DataSource, CaseInsensitiveDict
from pyspark.sql.types import _parse_datatype_json_string, StructType
from pyspark.sql.pandas.types import from_arrow_schema
from pyspark.sql.types import _parse_datatype_json_string, StructType
from pyspark.sql.worker.utils import worker_run
from pyspark.util import local_connect_and_auth
from pyspark.worker_util import (
@@ -36,6 +36,8 @@
utf8_deserializer,
)

import pyarrow as pa


def _main(infile: IO, outfile: IO) -> None:
"""
@@ -125,6 +127,11 @@ def _main(infile: IO, outfile: IO) -> None:
# Here we cannot use _parse_datatype_string to parse the DDL string schema.
# as it requires an active Spark session.
is_ddl_string = True
if isinstance(schema, pa.Schema):

Contributor:


Sorry, let me be clearer: what I had in mind is to add pa.schema in the example ArrowBatchDataSourceReader.


Contributor:


@allisonwang-db for this change


Author:


Sorry, I don't get what you mean.
Could you please elaborate?


Contributor:


I am wondering whether we can directly pass the pyarrow schema in this example

    class ArrowBatchDataSourceReader(DataSourceReader):
        def __init__(self, pa_schema, options):
            self.pa_schema: pa.Schema = pa_schema

Not sure whether this works, @allisonwang-db is this allowed?

# Convert the Arrow schema to a Spark schema for compatibility,
# as the Python data source API allows a data source to
# return an Arrow schema directly.
schema = from_arrow_schema(schema)
else:
schema = user_specified_schema # type: ignore
