diff --git a/python/pyspark/pandas/indexes/datetimes.py b/python/pyspark/pandas/indexes/datetimes.py
index 772cb585f242b..cd90e49dc7eed 100644
--- a/python/pyspark/pandas/indexes/datetimes.py
+++ b/python/pyspark/pandas/indexes/datetimes.py
@@ -24,6 +24,7 @@
 from pandas.tseries.offsets import DateOffset
 
 from pyspark._globals import _NoValue
+from pyspark.loose_version import LooseVersion
 from pyspark import pandas as ps
 from pyspark.pandas import DataFrame
 from pyspark.pandas.indexes.base import Index
@@ -109,8 +110,8 @@ def __new__(
         cls,
         data=None,
         freq=_NoValue,
-        normalize=False,
-        closed=None,
+        normalize=_NoValue,
+        closed=_NoValue,
         ambiguous="raise",
         dayfirst=False,
         yearfirst=False,
@@ -118,30 +119,8 @@ def __new__(
         copy=False,
         name=None,
     ) -> "DatetimeIndex":
-        if closed is not None:
-            warnings.warn(
-                "The 'closed' keyword in DatetimeIndex construction is deprecated "
-                "and will be removed in a future version.",
-                FutureWarning,
-            )
-        if normalize is not None:
-            warnings.warn(
-                "The 'normalize' keyword in DatetimeIndex construction is deprecated "
-                "and will be removed in a future version.",
-                FutureWarning,
-            )
-        if not is_hashable(name):
-            raise TypeError("Index.name must be a hashable type")
-
-        if isinstance(data, (Series, Index)):
-            if dtype is None:
-                dtype = "datetime64[ns]"
-            return cast(DatetimeIndex, Index(data, dtype=dtype, copy=copy, name=name))
-
         kwargs = dict(
             data=data,
-            normalize=normalize,
-            closed=closed,
             ambiguous=ambiguous,
             dayfirst=dayfirst,
             yearfirst=yearfirst,
@@ -152,6 +131,39 @@ def __new__(
         if freq is not _NoValue:
             kwargs["freq"] = freq
 
+        if LooseVersion(pd.__version__) < "3.0.0":
+            if normalize is not _NoValue:
+                warnings.warn(
+                    "The 'normalize' keyword in DatetimeIndex construction is deprecated "
+                    "and will be removed in a future version.",
+                    FutureWarning,
+                )
+                kwargs["normalize"] = normalize
+            else:
+                kwargs["normalize"] = False
+            if closed is not _NoValue:
+                warnings.warn(
+                    "The 'closed' keyword in DatetimeIndex construction is deprecated "
+                    "and will be removed in a future version.",
+                    FutureWarning,
+                )
+                kwargs["closed"] = closed
+        else:
+            if normalize is not _NoValue:
+                raise TypeError(
+                    "The 'normalize' keyword is not supported in pandas 3.0.0 and later."
+                )
+            if closed is not _NoValue:
+                raise TypeError("The 'closed' keyword is not supported in pandas 3.0.0 and later.")
+
+        if not is_hashable(name):
+            raise TypeError("Index.name must be a hashable type")
+
+        if isinstance(data, (Series, Index)):
+            if dtype is None:
+                dtype = "datetime64[ns]"
+            return cast(DatetimeIndex, Index(data, dtype=dtype, copy=copy, name=name))
+
         return cast(DatetimeIndex, ps.from_pandas(pd.DatetimeIndex(**kwargs)))
 
     def __getattr__(self, item: str) -> Any:
diff --git a/python/pyspark/pandas/indexes/timedelta.py b/python/pyspark/pandas/indexes/timedelta.py
index 2138226d480fc..112d2bda0688d 100644
--- a/python/pyspark/pandas/indexes/timedelta.py
+++ b/python/pyspark/pandas/indexes/timedelta.py
@@ -133,10 +133,10 @@ def __new__(
                 kwargs["closed"] = closed
         else:
             if unit is not _NoValue:
-                raise ValueError("The 'unit' keyword is not supported in pandas 3.0.0 and later.")
+                raise TypeError("The 'unit' keyword is not supported in pandas 3.0.0 and later.")
             if closed is not _NoValue:
-                raise ValueError("The 'closed' keyword is not supported in pandas 3.0.0 and later.")
+                raise TypeError("The 'closed' keyword is not supported in pandas 3.0.0 and later.")
 
         return cast(TimedeltaIndex, ps.from_pandas(pd.TimedeltaIndex(**kwargs)))
 
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index e2955a5745db9..fc49794358117 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -52,6 +52,8 @@
 import pyarrow as pa
 import pyarrow.parquet as pq
 
+from pyspark._globals import _NoValue, _NoValueType
+from pyspark.loose_version import LooseVersion
 from pyspark.sql import functions as F, Column as PySparkColumn
 from pyspark.sql.functions import pandas_udf
 from pyspark.sql.types import (
@@ -1595,7 +1597,7 @@ def to_datetime(
     errors: str = "raise",
     format: Optional[str] = None,
     unit: Optional[str] = None,
-    infer_datetime_format: bool = False,
+    infer_datetime_format: Union[bool, _NoValueType] = _NoValue,
     origin: str = "unix",
 ):
     """
@@ -1735,19 +1737,29 @@ def to_datetime(
         "microseconds": "us",
     }
 
+    kwargs = dict(
+        errors=errors,
+        format=format,
+        unit=unit,
+        origin=origin,
+    )
+
+    if LooseVersion(pd.__version__) < "3.0.0":
+        kwargs["infer_datetime_format"] = (
+            infer_datetime_format if infer_datetime_format is not _NoValue else False
+        )
+    else:
+        if infer_datetime_format is not _NoValue:
+            raise TypeError(
+                "The 'infer_datetime_format' keyword is not supported in pandas 3.0.0 and later."
+            )
+
     def pandas_to_datetime(
         pser_or_pdf: Union[pd.DataFrame, pd.Series], cols: Optional[List[str]] = None
     ) -> Series[np.datetime64]:
         if isinstance(pser_or_pdf, pd.DataFrame):
             pser_or_pdf = pser_or_pdf[cols]
-        return pd.to_datetime(
-            pser_or_pdf,
-            errors=errors,
-            format=format,
-            unit=unit,
-            infer_datetime_format=infer_datetime_format,
-            origin=origin,
-        )
+        return pd.to_datetime(pser_or_pdf, **kwargs)
 
     if isinstance(arg, Series):
         return arg.pandas_on_spark.transform_batch(pandas_to_datetime)
@@ -1762,14 +1774,7 @@ def pandas_to_datetime(
         psdf = arg[list_cols]
         return psdf.pandas_on_spark.transform_batch(pandas_to_datetime, list_cols)
 
-    return pd.to_datetime(
-        arg,
-        errors=errors,
-        format=format,
-        unit=unit,
-        infer_datetime_format=infer_datetime_format,
-        origin=origin,
-    )
+    return pd.to_datetime(arg, **kwargs)
 
 
 def date_range(
diff --git a/python/pyspark/pandas/tests/series/test_conversion.py b/python/pyspark/pandas/tests/series/test_conversion.py
index fa7ddba913e8b..2fa038c48d3fd 100644
--- a/python/pyspark/pandas/tests/series/test_conversion.py
+++ b/python/pyspark/pandas/tests/series/test_conversion.py
@@ -18,6 +18,7 @@
 
 import pandas as pd
 
+from pyspark.loose_version import LooseVersion
 from pyspark import pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
@@ -43,10 +44,12 @@ def test_to_datetime(self):
         pser = pd.Series(["3/11/2000", "3/12/2000", "3/13/2000"] * 100)
         psser = ps.from_pandas(pser)
 
-        self.assert_eq(
-            pd.to_datetime(pser, infer_datetime_format=True),
-            ps.to_datetime(psser, infer_datetime_format=True),
-        )
+        self.assert_eq(pd.to_datetime(pser), ps.to_datetime(psser))
+        if LooseVersion(pd.__version__) < "3.0.0":
+            self.assert_eq(
+                pd.to_datetime(pser, infer_datetime_format=True),
+                ps.to_datetime(psser, infer_datetime_format=True),
+            )
 
     def test_to_list(self):
         self.assert_eq(self.psser.tolist(), self.pser.tolist())
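
Note on the pattern above (not part of the patch): all four files apply the same version gate. A keyword that pandas 3.0.0 removed gets a _NoValue sentinel default so that "not passed" is distinguishable from any real value; on pandas < 3.0.0 the keyword is forwarded with the historical FutureWarning, and on pandas >= 3.0.0 passing it raises TypeError, matching what Python itself raises for an unexpected keyword argument. A minimal standalone sketch of the idea, using packaging.version.Version as a stand-in for pyspark.loose_version.LooseVersion and an illustrative _UNSET sentinel and helper name (neither appears in the patch):

    import warnings

    import pandas as pd
    from packaging.version import Version  # stand-in for pyspark.loose_version.LooseVersion

    _UNSET = object()  # sentinel: distinguishes "keyword not passed" from any real value


    def resolve_removed_kwarg(name, value, default):
        """Gate a keyword that pandas 3.0.0 removed.

        Returns the value to forward on pandas < 3.0.0; raises TypeError when the
        caller passed the keyword on pandas >= 3.0.0.
        """
        if Version(pd.__version__) < Version("3.0.0"):
            if value is not _UNSET:
                warnings.warn(
                    f"The {name!r} keyword is deprecated "
                    "and will be removed in a future version.",
                    FutureWarning,
                )
                return value
            return default
        if value is not _UNSET:
            raise TypeError(f"The {name!r} keyword is not supported in pandas 3.0.0 and later.")
        return default


    # Example: mirrors how DatetimeIndex.__new__ treats 'normalize' in this diff.
    normalize = resolve_removed_kwarg("normalize", _UNSET, default=False)

Switching from ValueError to TypeError in timedelta.py follows the same convention: an unsupported keyword is a calling-signature error, not a bad value.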