From 4f477be1c41649cd83a3e9c2c51e671918953dbb Mon Sep 17 00:00:00 2001 From: Takuya Ueshin Date: Fri, 6 Feb 2026 15:43:19 -0800 Subject: [PATCH 1/3] Handle an unexpected keyword argument error from read_excel with pandas 3 --- python/pyspark/pandas/namespace.py | 54 +++++++++++++++++++----------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index e2955a5745db9..31542b4d356d1 100644 --- a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -52,6 +52,8 @@ import pyarrow as pa import pyarrow.parquet as pq +from pyspark._globals import _NoValue, _NoValueType +from pyspark.loose_version import LooseVersion from pyspark.sql import functions as F, Column as PySparkColumn from pyspark.sql.functions import pandas_udf from pyspark.sql.types import ( @@ -941,7 +943,7 @@ def read_excel( keep_default_na: bool = True, verbose: bool = False, parse_dates: Union[bool, List, Dict] = False, - date_parser: Optional[Callable] = None, + date_parser: Union[Optional[Callable], _NoValueType] = _NoValue, thousands: Optional[str] = None, comment: Optional[str] = None, skipfooter: int = 0, @@ -1137,6 +1139,36 @@ def read_excel( 2 None NaN """ + kwargs = dict( + header=header, + names=names, + index_col=index_col, + usecols=usecols, + dtype=dtype, + engine=engine, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + na_values=na_values, + keep_default_na=keep_default_na, + verbose=verbose, + parse_dates=parse_dates, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + **kwds, + ) + + if LooseVersion(pd.__version__) < "3.0.0": + if date_parser is not _NoValue: + kwargs["date_parser"] = date_parser + else: + if date_parser is not _NoValue: + raise ValueError( + "The 'date_parser' keyword is not supported in pandas 3.0.0 and later." + ) + def pd_read_excel( io_or_bin: Any, sn: Union[str, int, List[Union[str, int]], None], @@ -1145,26 +1177,8 @@ def pd_read_excel( return pd.read_excel( # type: ignore[call-overload, misc] io=BytesIO(io_or_bin) if isinstance(io_or_bin, (bytes, bytearray)) else io_or_bin, sheet_name=sn, - header=header, - names=names, - index_col=index_col, - usecols=usecols, - dtype=dtype, - engine=engine, - converters=converters, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, nrows=nr, - na_values=na_values, - keep_default_na=keep_default_na, - verbose=verbose, - parse_dates=parse_dates, - date_parser=date_parser, - thousands=thousands, - comment=comment, - skipfooter=skipfooter, - **kwds, + **kwargs, ) if not isinstance(io, str): From 6db2ebf2e3772d8d960f8064297596f1c48c2364 Mon Sep 17 00:00:00 2001 From: Takuya Ueshin Date: Fri, 6 Feb 2026 15:59:48 -0800 Subject: [PATCH 2/3] Fix. --- python/pyspark/pandas/namespace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index 31542b4d356d1..5ce91f9afa881 100644 --- a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -1165,7 +1165,7 @@ def read_excel( kwargs["date_parser"] = date_parser else: if date_parser is not _NoValue: - raise ValueError( + raise TypeError( "The 'date_parser' keyword is not supported in pandas 3.0.0 and later." ) From 0dba95ca80a6c8a32cbd2f3f0d2d1ff775bf08ff Mon Sep 17 00:00:00 2001 From: Takuya Ueshin Date: Fri, 6 Feb 2026 17:41:55 -0800 Subject: [PATCH 3/3] Fix. --- python/pyspark/pandas/namespace.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index 5ce91f9afa881..fd87fddb2678b 100644 --- a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -1165,16 +1165,14 @@ def read_excel( kwargs["date_parser"] = date_parser else: if date_parser is not _NoValue: - raise TypeError( - "The 'date_parser' keyword is not supported in pandas 3.0.0 and later." - ) + raise TypeError("The 'date_parser' keyword is not supported in pandas 3.0.0 and later.") def pd_read_excel( io_or_bin: Any, sn: Union[str, int, List[Union[str, int]], None], nr: Optional[int] = None, ) -> pd.DataFrame: - return pd.read_excel( # type: ignore[call-overload, misc] + return pd.read_excel( # type: ignore[return-value] io=BytesIO(io_or_bin) if isinstance(io_or_bin, (bytes, bytearray)) else io_or_bin, sheet_name=sn, nrows=nr,