From 01051ced8e7eb40910f859fddcbb4f96bf77fc86 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 08:04:22 +0000 Subject: [PATCH 01/11] [PYTHON][TEST] Add upstream test for Arrow array to pandas conversion with default args Add test_pyarrow_arrow_to_pandas_default.py to monitor PyArrow's to_pandas() behavior across all major Arrow types with golden file comparison, ensuring PySpark's assumptions remain valid across versions. Co-authored-by: Isaac --- ...golden_pyarrow_arrow_to_pandas_default.csv | 75 ++++ .../golden_pyarrow_arrow_to_pandas_default.md | 76 ++++ .../test_pyarrow_arrow_to_pandas_default.py | 376 ++++++++++++++++++ 3 files changed, 527 insertions(+) create mode 100644 python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv create mode 100644 python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md create mode 100644 python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv new file mode 100644 index 0000000000000..2fd187cd65920 --- /dev/null +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv @@ -0,0 +1,75 @@ +source to_pandas() +int8:standard [0, 1, -1, 127, -128]@Series[int8] +int8:nullable [0.0, 1.0, nan]@Series[float64] +int16:standard [0, 1, -1, 32767, -32768]@Series[int16] +int16:nullable [0.0, 1.0, nan]@Series[float64] +int32:standard [0, 1, -1, 2147483647, -2147483648]@Series[int32] +int32:nullable [0.0, 1.0, nan]@Series[float64] +int64:standard [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] +int64:nullable [0.0, 1.0, nan]@Series[float64] +uint8:standard [0, 1, 255]@Series[uint8] +uint8:nullable [0.0, 1.0, nan]@Series[float64] +uint16:standard [0, 1, 65535]@Series[uint16] +uint16:nullable [0.0, 1.0, nan]@Series[float64] +uint32:standard [0, 1, 4294967295]@Series[uint32] +uint32:nullable [0.0, 1.0, nan]@Series[float64] +uint64:standard [0, 1, 18446744073709551615]@Series[uint64] +uint64:nullable [0.0, 1.0, nan]@Series[float64] +float32:standard [0.0, 1.5, -1.5]@Series[float32] +float32:nullable [0.0, 1.5, nan]@Series[float32] +float64:standard [0.0, 1.5, -1.5]@Series[float64] +float64:nullable [0.0, 1.5, nan]@Series[float64] +float64:special [nan, inf, -inf]@Series[float64] +bool:standard [True, False, True]@Series[bool] +bool:nullable [True, False, None]@Series[object] +string:standard ['hello', 'world', '']@Series[object] +string:nullable ['hello', None, 'world']@Series[object] +large_string:standard ['hello', 'world']@Series[object] +large_string:nullable ['hello', None]@Series[object] +binary:standard [b'hello', b'world']@Series[object] +binary:nullable [b'hello', None]@Series[object] +large_binary:standard [b'hello', b'world']@Series[object] +large_binary:nullable [b'hello', None]@Series[object] +decimal128:standard [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] +decimal128:nullable [Decimal('1.23'), None, Decimal('4.56')]@Series[object] +date32:standard [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] +date32:nullable [datetime.date(2024, 1, 1), None]@Series[object] +date64:standard [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] +date64:nullable [datetime.date(2024, 1, 1), None]@Series[object] +timestamp[s]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] +timestamp[s]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] +timestamp[ms]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] +timestamp[ms]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] +timestamp[us]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] +timestamp[us]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] +timestamp[ns]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] +timestamp[ns]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] +timestamp[us,tz=UTC]:standard [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] +timestamp[us,tz=UTC]:nullable [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] +duration[s]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] +duration[s]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] +duration[ms]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] +duration[ms]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] +duration[us]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] +duration[us]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] +duration[ns]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] +duration[ns]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] +time32[s]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time32[s]:nullable [datetime.time(12, 30), None]@Series[object] +time32[ms]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time32[ms]:nullable [datetime.time(12, 30), None]@Series[object] +time64[us]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time64[us]:nullable [datetime.time(12, 30), None]@Series[object] +time64[ns]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time64[ns]:nullable [datetime.time(12, 30), None]@Series[object] +null:standard [None, None, None]@Series[object] +list:standard [array([1, 2]), array([3, 4, 5])]@Series[object] +list:nullable [array([1, 2]), None, array([3])]@Series[object] +list:standard [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] +large_list:standard [array([1, 2]), array([3, 4])]@Series[object] +fixed_size_list[3]:standard [array([1, 2, 3]), array([4, 5, 6])]@Series[object] +struct:standard [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] +struct:nullable [{'x': 1, 'y': 'a'}, None]@Series[object] +map:standard [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] +dictionary:standard ['a', 'b', 'a', 'b']@Series[category] +dictionary:nullable ['a', 'b', nan, 'a']@Series[category] diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md new file mode 100644 index 0000000000000..9df631af77d6e --- /dev/null +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md @@ -0,0 +1,76 @@ +| source | to_pandas() | +|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| int8:standard | [0, 1, -1, 127, -128]@Series[int8] | +| int8:nullable | [0.0, 1.0, nan]@Series[float64] | +| int16:standard | [0, 1, -1, 32767, -32768]@Series[int16] | +| int16:nullable | [0.0, 1.0, nan]@Series[float64] | +| int32:standard | [0, 1, -1, 2147483647, -2147483648]@Series[int32] | +| int32:nullable | [0.0, 1.0, nan]@Series[float64] | +| int64:standard | [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] | +| int64:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint8:standard | [0, 1, 255]@Series[uint8] | +| uint8:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint16:standard | [0, 1, 65535]@Series[uint16] | +| uint16:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint32:standard | [0, 1, 4294967295]@Series[uint32] | +| uint32:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint64:standard | [0, 1, 18446744073709551615]@Series[uint64] | +| uint64:nullable | [0.0, 1.0, nan]@Series[float64] | +| float32:standard | [0.0, 1.5, -1.5]@Series[float32] | +| float32:nullable | [0.0, 1.5, nan]@Series[float32] | +| float64:standard | [0.0, 1.5, -1.5]@Series[float64] | +| float64:nullable | [0.0, 1.5, nan]@Series[float64] | +| float64:special | [nan, inf, -inf]@Series[float64] | +| bool:standard | [True, False, True]@Series[bool] | +| bool:nullable | [True, False, None]@Series[object] | +| string:standard | ['hello', 'world', '']@Series[object] | +| string:nullable | ['hello', None, 'world']@Series[object] | +| large_string:standard | ['hello', 'world']@Series[object] | +| large_string:nullable | ['hello', None]@Series[object] | +| binary:standard | [b'hello', b'world']@Series[object] | +| binary:nullable | [b'hello', None]@Series[object] | +| large_binary:standard | [b'hello', b'world']@Series[object] | +| large_binary:nullable | [b'hello', None]@Series[object] | +| decimal128:standard | [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] | +| decimal128:nullable | [Decimal('1.23'), None, Decimal('4.56')]@Series[object] | +| date32:standard | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | +| date32:nullable | [datetime.date(2024, 1, 1), None]@Series[object] | +| date64:standard | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | +| date64:nullable | [datetime.date(2024, 1, 1), None]@Series[object] | +| timestamp[s]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] | +| timestamp[s]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] | +| timestamp[ms]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] | +| timestamp[ms]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] | +| timestamp[us]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] | +| timestamp[us]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] | +| timestamp[ns]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] | +| timestamp[ns]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] | +| timestamp[us,tz=UTC]:standard | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] | +| timestamp[us,tz=UTC]:nullable | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] | +| duration[s]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] | +| duration[s]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] | +| duration[ms]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] | +| duration[ms]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] | +| duration[us]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] | +| duration[us]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] | +| duration[ns]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] | +| duration[ns]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] | +| time32[s]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time32[s]:nullable | [datetime.time(12, 30), None]@Series[object] | +| time32[ms]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time32[ms]:nullable | [datetime.time(12, 30), None]@Series[object] | +| time64[us]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time64[us]:nullable | [datetime.time(12, 30), None]@Series[object] | +| time64[ns]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time64[ns]:nullable | [datetime.time(12, 30), None]@Series[object] | +| null:standard | [None, None, None]@Series[object] | +| list:standard | [array([1, 2]), array([3, 4, 5])]@Series[object] | +| list:nullable | [array([1, 2]), None, array([3])]@Series[object] | +| list:standard | [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] | +| large_list:standard | [array([1, 2]), array([3, 4])]@Series[object] | +| fixed_size_list[3]:standard | [array([1, 2, 3]), array([4, 5, 6])]@Series[object] | +| struct:standard | [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] | +| struct:nullable | [{'x': 1, 'y': 'a'}, None]@Series[object] | +| map:standard | [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] | +| dictionary:standard | ['a', 'b', 'a', 'b']@Series[category] | +| dictionary:nullable | ['a', 'b', nan, 'a']@Series[category] | \ No newline at end of file diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py new file mode 100644 index 0000000000000..c921c7f171071 --- /dev/null +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -0,0 +1,376 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Tests for PyArrow Array.to_pandas() with default arguments using golden file comparison. + +This test monitors the behavior of PyArrow's to_pandas() conversion to ensure +PySpark's assumptions about PyArrow behavior remain valid across versions. + +The test covers conversion of all major Arrow types to pandas/numpy with default +arguments (no types_mapper, no self_destruct, etc.), tracking: +- Which numpy/pandas dtype each Arrow type maps to +- How null values are handled (NaN, None, NaT, etc.) +- Whether values are preserved correctly after conversion + +## Golden File Cell Format + +Each cell uses the value@type format: +- numpy ndarray: "python_list_repr@ndarray[dtype]" +- pandas Series: "python_list_repr@Series[dtype]" +- pandas Categorical: "python_list_repr@Categorical[dtype]" +- Error: "ERR@ExceptionClassName" + +Values are formatted via tolist() for stable, Python-native representation. + +## Regenerating Golden Files + +Set SPARK_GENERATE_GOLDEN_FILES=1 before running: + + SPARK_GENERATE_GOLDEN_FILES=1 python -m pytest \\ + python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py + +## PyArrow Version Compatibility + +The golden files capture behavior for a specific PyArrow version. +Regenerate when upgrading PyArrow, as to_pandas() behavior may change. +""" + +import datetime +import inspect +import os +import unittest +from collections import OrderedDict +from decimal import Decimal + +from pyspark.testing.utils import ( + have_pyarrow, + have_pandas, + have_numpy, + pyarrow_requirement_message, + pandas_requirement_message, + numpy_requirement_message, +) +from pyspark.testing.goldenutils import GoldenFileTestMixin + + +@unittest.skipIf( + not have_pyarrow or not have_pandas or not have_numpy, + pyarrow_requirement_message or pandas_requirement_message or numpy_requirement_message, +) +class PyArrowArrayToPandasDefaultTests(GoldenFileTestMixin, unittest.TestCase): + """ + Tests pa.Array.to_pandas() with default arguments via golden file comparison. + + Covers all major Arrow types: integers, floats, bool, string, binary, + decimal, date, timestamp, duration, time, null, and nested types. + Each type is tested both without and with null values. + """ + + @staticmethod + def _repr_result(result, max_len=0): + """ + Format to_pandas() result for golden file comparison. + + Uses tolist() for stable Python-native value representation that does + not depend on numpy's string formatting, which can vary across versions. + + Returns + ------- + str + "python_list_repr@result_class[dtype]" + e.g. "[0, 1, -1, 127, -128]@ndarray[int8]" + """ + import numpy as np + import pandas as pd + + if isinstance(result, np.ndarray): + v_str = str(result.tolist()) + type_str = f"ndarray[{str(result.dtype)}]" + elif isinstance(result, pd.Series): + v_str = str(result.tolist()) + type_str = f"Series[{str(result.dtype)}]" + elif isinstance(result, pd.Categorical): + v_str = str(result.tolist()) + type_str = f"Categorical[{str(result.dtype)}]" + else: + v_str = str(result) + type_str = type(result).__name__ + + v_str = v_str.replace("\n", " ").replace("\r", " ").replace("\t", " ") + if max_len > 0: + v_str = v_str[:max_len] + return f"{v_str}@{type_str}" + + def _build_source_arrays(self): + """Build an ordered dict of named source PyArrow arrays for testing.""" + import pyarrow as pa + + sources = OrderedDict() + + # ===================================================================== + # Integer types + # ===================================================================== + for bits, pa_type in [ + (8, pa.int8()), + (16, pa.int16()), + (32, pa.int32()), + (64, pa.int64()), + ]: + max_val = 2 ** (bits - 1) - 1 + min_val = -(2 ** (bits - 1)) + sources[f"int{bits}:standard"] = pa.array( + [0, 1, -1, max_val, min_val], pa_type + ) + sources[f"int{bits}:nullable"] = pa.array([0, 1, None], pa_type) + + for bits, pa_type in [ + (8, pa.uint8()), + (16, pa.uint16()), + (32, pa.uint32()), + (64, pa.uint64()), + ]: + max_val = 2**bits - 1 + sources[f"uint{bits}:standard"] = pa.array([0, 1, max_val], pa_type) + sources[f"uint{bits}:nullable"] = pa.array([0, 1, None], pa_type) + + # ===================================================================== + # Float types + # ===================================================================== + sources["float32:standard"] = pa.array([0.0, 1.5, -1.5], pa.float32()) + sources["float32:nullable"] = pa.array([0.0, 1.5, None], pa.float32()) + sources["float64:standard"] = pa.array([0.0, 1.5, -1.5], pa.float64()) + sources["float64:nullable"] = pa.array([0.0, 1.5, None], pa.float64()) + sources["float64:special"] = pa.array( + [float("nan"), float("inf"), float("-inf")], pa.float64() + ) + + # ===================================================================== + # Boolean + # ===================================================================== + sources["bool:standard"] = pa.array([True, False, True], pa.bool_()) + sources["bool:nullable"] = pa.array([True, False, None], pa.bool_()) + + # ===================================================================== + # String types + # ===================================================================== + sources["string:standard"] = pa.array( + ["hello", "world", ""], pa.string() + ) + sources["string:nullable"] = pa.array( + ["hello", None, "world"], pa.string() + ) + sources["large_string:standard"] = pa.array( + ["hello", "world"], pa.large_string() + ) + sources["large_string:nullable"] = pa.array( + ["hello", None], pa.large_string() + ) + + # ===================================================================== + # Binary types + # ===================================================================== + sources["binary:standard"] = pa.array( + [b"hello", b"world"], pa.binary() + ) + sources["binary:nullable"] = pa.array([b"hello", None], pa.binary()) + sources["large_binary:standard"] = pa.array( + [b"hello", b"world"], pa.large_binary() + ) + sources["large_binary:nullable"] = pa.array( + [b"hello", None], pa.large_binary() + ) + + # ===================================================================== + # Decimal + # ===================================================================== + sources["decimal128:standard"] = pa.array( + [Decimal("1.23"), Decimal("4.56"), Decimal("-7.89")], + pa.decimal128(5, 2), + ) + sources["decimal128:nullable"] = pa.array( + [Decimal("1.23"), None, Decimal("4.56")], pa.decimal128(5, 2) + ) + + # ===================================================================== + # Date types + # ===================================================================== + d1 = datetime.date(2024, 1, 1) + d2 = datetime.date(2024, 6, 15) + sources["date32:standard"] = pa.array([d1, d2], pa.date32()) + sources["date32:nullable"] = pa.array([d1, None], pa.date32()) + sources["date64:standard"] = pa.array([d1, d2], pa.date64()) + sources["date64:nullable"] = pa.array([d1, None], pa.date64()) + + # ===================================================================== + # Timestamp types + # ===================================================================== + dt1 = datetime.datetime(2024, 1, 1, 12, 0, 0) + dt2 = datetime.datetime(2024, 6, 15, 18, 30, 0) + for unit in ["s", "ms", "us", "ns"]: + sources[f"timestamp[{unit}]:standard"] = pa.array( + [dt1, dt2], pa.timestamp(unit) + ) + sources[f"timestamp[{unit}]:nullable"] = pa.array( + [dt1, None], pa.timestamp(unit) + ) + # Timestamp with timezone + sources["timestamp[us,tz=UTC]:standard"] = pa.array( + [dt1, dt2], pa.timestamp("us", tz="UTC") + ) + sources["timestamp[us,tz=UTC]:nullable"] = pa.array( + [dt1, None], pa.timestamp("us", tz="UTC") + ) + + # ===================================================================== + # Duration types + # ===================================================================== + td1 = datetime.timedelta(days=1) + td2 = datetime.timedelta(hours=2, minutes=30) + for unit in ["s", "ms", "us", "ns"]: + sources[f"duration[{unit}]:standard"] = pa.array( + [td1, td2], pa.duration(unit) + ) + sources[f"duration[{unit}]:nullable"] = pa.array( + [td1, None], pa.duration(unit) + ) + + # ===================================================================== + # Time types + # ===================================================================== + t1 = datetime.time(12, 30, 0) + t2 = datetime.time(18, 45, 30) + sources["time32[s]:standard"] = pa.array([t1, t2], pa.time32("s")) + sources["time32[s]:nullable"] = pa.array([t1, None], pa.time32("s")) + sources["time32[ms]:standard"] = pa.array([t1, t2], pa.time32("ms")) + sources["time32[ms]:nullable"] = pa.array([t1, None], pa.time32("ms")) + sources["time64[us]:standard"] = pa.array([t1, t2], pa.time64("us")) + sources["time64[us]:nullable"] = pa.array([t1, None], pa.time64("us")) + sources["time64[ns]:standard"] = pa.array([t1, t2], pa.time64("ns")) + sources["time64[ns]:nullable"] = pa.array([t1, None], pa.time64("ns")) + + # ===================================================================== + # Null type + # ===================================================================== + sources["null:standard"] = pa.array([None, None, None], pa.null()) + + # ===================================================================== + # Nested types + # ===================================================================== + sources["list:standard"] = pa.array( + [[1, 2], [3, 4, 5]], pa.list_(pa.int64()) + ) + sources["list:nullable"] = pa.array( + [[1, 2], None, [3]], pa.list_(pa.int64()) + ) + sources["list:standard"] = pa.array( + [["a", "b"], ["c"]], pa.list_(pa.string()) + ) + sources["large_list:standard"] = pa.array( + [[1, 2], [3, 4]], pa.large_list(pa.int64()) + ) + sources["fixed_size_list[3]:standard"] = pa.array( + [[1, 2, 3], [4, 5, 6]], pa.list_(pa.int64(), 3) + ) + sources["struct:standard"] = pa.array( + [{"x": 1, "y": "a"}, {"x": 2, "y": "b"}], + pa.struct([("x", pa.int64()), ("y", pa.string())]), + ) + sources["struct:nullable"] = pa.array( + [{"x": 1, "y": "a"}, None], + pa.struct([("x", pa.int64()), ("y", pa.string())]), + ) + sources["map:standard"] = pa.array( + [[("a", 1), ("b", 2)], [("c", 3)]], + pa.map_(pa.string(), pa.int64()), + ) + + # ===================================================================== + # Dictionary type + # ===================================================================== + sources["dictionary:standard"] = ( + pa.DictionaryArray.from_arrays( + pa.array([0, 1, 0, 1], pa.int32()), + pa.array(["a", "b"], pa.string()), + ) + ) + sources["dictionary:nullable"] = ( + pa.DictionaryArray.from_arrays( + pa.array([0, 1, None, 0], pa.int32()), + pa.array(["a", "b"], pa.string()), + ) + ) + + return sources + + def test_to_pandas_default(self): + """Test pa.Array.to_pandas() with default arguments against golden file.""" + sources = self._build_source_arrays() + + generating = self.is_generating_golden() + test_dir = os.path.dirname(inspect.getfile(type(self))) + golden_csv = os.path.join( + test_dir, "golden_pyarrow_arrow_to_pandas_default.csv" + ) + golden_md = os.path.join( + test_dir, "golden_pyarrow_arrow_to_pandas_default.md" + ) + + golden = None + if not generating: + golden = self.load_golden_csv(golden_csv) + + errors = [] + results = OrderedDict() + + for name, arr in sources.items(): + try: + result = arr.to_pandas() + cell = self._repr_result(result, max_len=0) + except Exception as e: + cell = f"ERR@{type(e).__name__}" + results[name] = cell + + if not generating and golden is not None: + expected = golden.loc[name, "to_pandas()"] + if expected != cell: + errors.append( + f"{name}: expected '{expected}', got '{cell}'" + ) + + if generating: + import pandas as pd + + index = pd.Index(list(sources.keys()), name="source") + df = pd.DataFrame( + {"to_pandas()": [results[k] for k in sources]}, + index=index, + ) + self.save_golden(df, golden_csv, golden_md) + else: + self.assertEqual( + len(errors), + 0, + f"\n{len(errors)} golden file mismatches:\n" + + "\n".join(errors), + ) + + +if __name__ == "__main__": + from pyspark.testing import main + + main() From 3ffc406fbda684ad0af0e6fdb3d46b257feae14e Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 08:15:47 +0000 Subject: [PATCH 02/11] Move compare_or_generate_golden_matrix to GoldenFileTestMixin for reuse Co-authored-by: Isaac --- python/pyspark/testing/goldenutils.py | 82 ++++++++++++++++++- .../pyarrow/test_pyarrow_array_cast.py | 80 ------------------ .../test_pyarrow_arrow_to_pandas_default.py | 60 ++++---------- 3 files changed, 95 insertions(+), 127 deletions(-) diff --git a/python/pyspark/testing/goldenutils.py b/python/pyspark/testing/goldenutils.py index f18add2201db8..edadbdc77896f 100644 --- a/python/pyspark/testing/goldenutils.py +++ b/python/pyspark/testing/goldenutils.py @@ -15,7 +15,8 @@ # limitations under the License. # -from typing import Any, Optional +from typing import Any, Callable, List, Optional +import inspect import os import time @@ -350,6 +351,85 @@ def repr_value(cls, value: Any, max_len: int = 32) -> str: return cls.repr_python_value(value, max_len) + def compare_or_generate_golden_matrix( + self, + row_names: List[str], + col_names: List[str], + compute_cell: Callable[[str, str], str], + golden_file_prefix: str, + index_name: str = "source \\ target", + overrides: Optional[dict[tuple[str, str], str]] = None, + ) -> None: + """ + Run a matrix of computations and compare against (or generate) a golden file. + + 1. If SPARK_GENERATE_GOLDEN_FILES=1, compute every cell, build a + DataFrame, and save it as the new golden CSV / Markdown file. + 2. Otherwise, load the existing golden file and assert that every cell + matches the freshly computed value. + + Parameters + ---------- + row_names : list[str] + Ordered row labels (becomes the DataFrame index). + col_names : list[str] + Ordered column labels. + compute_cell : (row_name, col_name) -> str + Function that computes the string result for one cell. + golden_file_prefix : str + Prefix for the golden CSV/MD files (without extension). + Files are placed in the same directory as the concrete test file. + index_name : str, default "source \\ target" + Name for the index column in the golden file. + overrides : dict[(row, col) -> str], optional + Version-specific expected values that take precedence over the golden + file. Use this to document known behavioral differences across + library versions (e.g. PyArrow 18 vs 22) directly in the test code, + so that the same golden file works for multiple versions. + """ + generating = self.is_generating_golden() + + test_dir = os.path.dirname(inspect.getfile(type(self))) + golden_csv = os.path.join(test_dir, f"{golden_file_prefix}.csv") + golden_md = os.path.join(test_dir, f"{golden_file_prefix}.md") + + golden = None + if not generating: + golden = self.load_golden_csv(golden_csv) + + errors = [] + results = {} + + for row_name in row_names: + for col_name in col_names: + result = compute_cell(row_name, col_name) + results[(row_name, col_name)] = result + + if not generating: + if overrides and (row_name, col_name) in overrides: + expected = overrides[(row_name, col_name)] + else: + expected = golden.loc[row_name, col_name] + if expected != result: + errors.append( + f"{row_name} -> {col_name}: expected '{expected}', got '{result}'" + ) + + if generating: + import pandas as pd + + index = pd.Index(row_names, name=index_name) + df = pd.DataFrame(index=index) + for col_name in col_names: + df[col_name] = [results[(row, col_name)] for row in row_names] + self.save_golden(df, golden_csv, golden_md) + else: + self.assertEqual( + len(errors), + 0, + f"\n{len(errors)} golden file mismatches:\n" + "\n".join(errors), + ) + @staticmethod def clean_result(result: str) -> str: """Clean result string by removing newlines and extra whitespace.""" diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py index 6c48e4e9461a9..ef041fd2cb476 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py @@ -55,12 +55,10 @@ | pa.array(floats, pa.float16()) natively | requires numpy | requires numpy | native | """ -import inspect import os import platform import unittest from decimal import Decimal -from typing import Callable, List, Optional from pyspark.loose_version import LooseVersion from pyspark.testing.utils import ( @@ -134,84 +132,6 @@ def _try_cast(self, src_arr, tgt_type, safe=True): except Exception as e: return f"ERR@{type(e).__name__}" - def compare_or_generate_golden_matrix( - self, - row_names: List[str], - col_names: List[str], - compute_cell: Callable[[str, str], str], - golden_file_prefix: str, - index_name: str = "source \\ target", - overrides: Optional[dict[tuple[str, str], str]] = None, - ) -> None: - """ - Run a matrix of computations and compare against (or generate) a golden file. - - 1. If SPARK_GENERATE_GOLDEN_FILES=1, compute every cell, build a - DataFrame, and save it as the new golden CSV / Markdown file. - 2. Otherwise, load the existing golden file and assert that every cell - matches the freshly computed value. - - Parameters - ---------- - row_names : list[str] - Ordered row labels (becomes the DataFrame index). - col_names : list[str] - Ordered column labels. - compute_cell : (row_name, col_name) -> str - Function that computes the string result for one cell. - golden_file_prefix : str - Prefix for the golden CSV/MD files (without extension). - Files are placed in the same directory as the concrete test file. - index_name : str, default "source \\ target" - Name for the index column in the golden file. - overrides : dict[(row, col) -> str], optional - Version-specific expected values that take precedence over the golden - file. Use this to document known behavioral differences across - library versions (e.g. PyArrow 18 vs 22) directly in the test code, - so that the same golden file works for multiple versions. - """ - generating = self.is_generating_golden() - - test_dir = os.path.dirname(inspect.getfile(type(self))) - golden_csv = os.path.join(test_dir, f"{golden_file_prefix}.csv") - golden_md = os.path.join(test_dir, f"{golden_file_prefix}.md") - - golden = None - if not generating: - golden = self.load_golden_csv(golden_csv) - - errors = [] - results = {} - - for row_name in row_names: - for col_name in col_names: - result = compute_cell(row_name, col_name) - results[(row_name, col_name)] = result - - if not generating: - if overrides and (row_name, col_name) in overrides: - expected = overrides[(row_name, col_name)] - else: - expected = golden.loc[row_name, col_name] - if expected != result: - errors.append( - f"{row_name} -> {col_name}: expected '{expected}', got '{result}'" - ) - - if generating: - import pandas as pd - - index = pd.Index(row_names, name=index_name) - df = pd.DataFrame(index=index) - for col_name in col_names: - df[col_name] = [results[(row, col_name)] for row in row_names] - self.save_golden(df, golden_csv, golden_md) - else: - self.assertEqual( - len(errors), - 0, - f"\n{len(errors)} golden file mismatches:\n" + "\n".join(errors), - ) # ============================================================ diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py index c921c7f171071..d35df256c5f7c 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -51,8 +51,6 @@ """ import datetime -import inspect -import os import unittest from collections import OrderedDict from decimal import Decimal @@ -320,54 +318,24 @@ def _build_source_arrays(self): def test_to_pandas_default(self): """Test pa.Array.to_pandas() with default arguments against golden file.""" sources = self._build_source_arrays() + row_names = list(sources.keys()) + col_names = ["to_pandas()"] - generating = self.is_generating_golden() - test_dir = os.path.dirname(inspect.getfile(type(self))) - golden_csv = os.path.join( - test_dir, "golden_pyarrow_arrow_to_pandas_default.csv" - ) - golden_md = os.path.join( - test_dir, "golden_pyarrow_arrow_to_pandas_default.md" - ) - - golden = None - if not generating: - golden = self.load_golden_csv(golden_csv) - - errors = [] - results = OrderedDict() - - for name, arr in sources.items(): + def compute_cell(row_name, col_name): + arr = sources[row_name] try: result = arr.to_pandas() - cell = self._repr_result(result, max_len=0) + return self._repr_result(result, max_len=0) except Exception as e: - cell = f"ERR@{type(e).__name__}" - results[name] = cell - - if not generating and golden is not None: - expected = golden.loc[name, "to_pandas()"] - if expected != cell: - errors.append( - f"{name}: expected '{expected}', got '{cell}'" - ) - - if generating: - import pandas as pd - - index = pd.Index(list(sources.keys()), name="source") - df = pd.DataFrame( - {"to_pandas()": [results[k] for k in sources]}, - index=index, - ) - self.save_golden(df, golden_csv, golden_md) - else: - self.assertEqual( - len(errors), - 0, - f"\n{len(errors)} golden file mismatches:\n" - + "\n".join(errors), - ) + return f"ERR@{type(e).__name__}" + + self.compare_or_generate_golden_matrix( + row_names=row_names, + col_names=col_names, + compute_cell=compute_cell, + golden_file_prefix="golden_pyarrow_arrow_to_pandas_default", + index_name="source", + ) if __name__ == "__main__": From 3034cdbaed95c20428de61740c4f4bb04f198b03 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 08:18:53 +0000 Subject: [PATCH 03/11] Add deeply nested type cases for arrow to pandas default test Add list, list, list, struct, struct, struct, map with list/struct/map values to cover deep nesting. Co-authored-by: Isaac --- ...golden_pyarrow_arrow_to_pandas_default.csv | 9 + .../golden_pyarrow_arrow_to_pandas_default.md | 161 +++++++++--------- .../test_pyarrow_arrow_to_pandas_default.py | 45 +++++ 3 files changed, 139 insertions(+), 76 deletions(-) diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv index 2fd187cd65920..d7b23fa11aedf 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv @@ -71,5 +71,14 @@ fixed_size_list[3]:standard [array([1, 2, 3]), array([4, 5, 6])]@Series[o struct:standard [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] struct:nullable [{'x': 1, 'y': 'a'}, None]@Series[object] map:standard [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] +list>:standard [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object] +list:standard [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object] +list>:standard [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object] +struct:standard [{'outer': {'inner': 1}}, {'outer': {'inner': 2}}]@Series[object] +struct>:standard [{'items': array([1, 2, 3])}, {'items': array([4, 5])}]@Series[object] +struct>:standard [{'mapping': [('a', 1)]}, {'mapping': [('b', 2)]}]@Series[object] +map>:standard [[('a', array([1, 2])), ('b', array([3]))], [('c', array([4, 5, 6]))]]@Series[object] +map:standard [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@Series[object] +map>:standard [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object] dictionary:standard ['a', 'b', 'a', 'b']@Series[category] dictionary:nullable ['a', 'b', nan, 'a']@Series[category] diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md index 9df631af77d6e..9c7f17c344d8e 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md @@ -1,76 +1,85 @@ -| source | to_pandas() | -|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| -| int8:standard | [0, 1, -1, 127, -128]@Series[int8] | -| int8:nullable | [0.0, 1.0, nan]@Series[float64] | -| int16:standard | [0, 1, -1, 32767, -32768]@Series[int16] | -| int16:nullable | [0.0, 1.0, nan]@Series[float64] | -| int32:standard | [0, 1, -1, 2147483647, -2147483648]@Series[int32] | -| int32:nullable | [0.0, 1.0, nan]@Series[float64] | -| int64:standard | [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] | -| int64:nullable | [0.0, 1.0, nan]@Series[float64] | -| uint8:standard | [0, 1, 255]@Series[uint8] | -| uint8:nullable | [0.0, 1.0, nan]@Series[float64] | -| uint16:standard | [0, 1, 65535]@Series[uint16] | -| uint16:nullable | [0.0, 1.0, nan]@Series[float64] | -| uint32:standard | [0, 1, 4294967295]@Series[uint32] | -| uint32:nullable | [0.0, 1.0, nan]@Series[float64] | -| uint64:standard | [0, 1, 18446744073709551615]@Series[uint64] | -| uint64:nullable | [0.0, 1.0, nan]@Series[float64] | -| float32:standard | [0.0, 1.5, -1.5]@Series[float32] | -| float32:nullable | [0.0, 1.5, nan]@Series[float32] | -| float64:standard | [0.0, 1.5, -1.5]@Series[float64] | -| float64:nullable | [0.0, 1.5, nan]@Series[float64] | -| float64:special | [nan, inf, -inf]@Series[float64] | -| bool:standard | [True, False, True]@Series[bool] | -| bool:nullable | [True, False, None]@Series[object] | -| string:standard | ['hello', 'world', '']@Series[object] | -| string:nullable | ['hello', None, 'world']@Series[object] | -| large_string:standard | ['hello', 'world']@Series[object] | -| large_string:nullable | ['hello', None]@Series[object] | -| binary:standard | [b'hello', b'world']@Series[object] | -| binary:nullable | [b'hello', None]@Series[object] | -| large_binary:standard | [b'hello', b'world']@Series[object] | -| large_binary:nullable | [b'hello', None]@Series[object] | -| decimal128:standard | [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] | -| decimal128:nullable | [Decimal('1.23'), None, Decimal('4.56')]@Series[object] | -| date32:standard | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | -| date32:nullable | [datetime.date(2024, 1, 1), None]@Series[object] | -| date64:standard | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | -| date64:nullable | [datetime.date(2024, 1, 1), None]@Series[object] | -| timestamp[s]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] | -| timestamp[s]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] | -| timestamp[ms]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] | -| timestamp[ms]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] | -| timestamp[us]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] | -| timestamp[us]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] | -| timestamp[ns]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] | -| timestamp[ns]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] | -| timestamp[us,tz=UTC]:standard | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] | -| timestamp[us,tz=UTC]:nullable | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] | -| duration[s]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] | -| duration[s]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] | -| duration[ms]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] | -| duration[ms]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] | -| duration[us]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] | -| duration[us]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] | -| duration[ns]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] | -| duration[ns]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] | -| time32[s]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | -| time32[s]:nullable | [datetime.time(12, 30), None]@Series[object] | -| time32[ms]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | -| time32[ms]:nullable | [datetime.time(12, 30), None]@Series[object] | -| time64[us]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | -| time64[us]:nullable | [datetime.time(12, 30), None]@Series[object] | -| time64[ns]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | -| time64[ns]:nullable | [datetime.time(12, 30), None]@Series[object] | -| null:standard | [None, None, None]@Series[object] | -| list:standard | [array([1, 2]), array([3, 4, 5])]@Series[object] | -| list:nullable | [array([1, 2]), None, array([3])]@Series[object] | -| list:standard | [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] | -| large_list:standard | [array([1, 2]), array([3, 4])]@Series[object] | -| fixed_size_list[3]:standard | [array([1, 2, 3]), array([4, 5, 6])]@Series[object] | -| struct:standard | [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] | -| struct:nullable | [{'x': 1, 'y': 'a'}, None]@Series[object] | -| map:standard | [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] | -| dictionary:standard | ['a', 'b', 'a', 'b']@Series[category] | -| dictionary:nullable | ['a', 'b', nan, 'a']@Series[category] | \ No newline at end of file +| source | to_pandas() | +|----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| int8:standard | [0, 1, -1, 127, -128]@Series[int8] | +| int8:nullable | [0.0, 1.0, nan]@Series[float64] | +| int16:standard | [0, 1, -1, 32767, -32768]@Series[int16] | +| int16:nullable | [0.0, 1.0, nan]@Series[float64] | +| int32:standard | [0, 1, -1, 2147483647, -2147483648]@Series[int32] | +| int32:nullable | [0.0, 1.0, nan]@Series[float64] | +| int64:standard | [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] | +| int64:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint8:standard | [0, 1, 255]@Series[uint8] | +| uint8:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint16:standard | [0, 1, 65535]@Series[uint16] | +| uint16:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint32:standard | [0, 1, 4294967295]@Series[uint32] | +| uint32:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint64:standard | [0, 1, 18446744073709551615]@Series[uint64] | +| uint64:nullable | [0.0, 1.0, nan]@Series[float64] | +| float32:standard | [0.0, 1.5, -1.5]@Series[float32] | +| float32:nullable | [0.0, 1.5, nan]@Series[float32] | +| float64:standard | [0.0, 1.5, -1.5]@Series[float64] | +| float64:nullable | [0.0, 1.5, nan]@Series[float64] | +| float64:special | [nan, inf, -inf]@Series[float64] | +| bool:standard | [True, False, True]@Series[bool] | +| bool:nullable | [True, False, None]@Series[object] | +| string:standard | ['hello', 'world', '']@Series[object] | +| string:nullable | ['hello', None, 'world']@Series[object] | +| large_string:standard | ['hello', 'world']@Series[object] | +| large_string:nullable | ['hello', None]@Series[object] | +| binary:standard | [b'hello', b'world']@Series[object] | +| binary:nullable | [b'hello', None]@Series[object] | +| large_binary:standard | [b'hello', b'world']@Series[object] | +| large_binary:nullable | [b'hello', None]@Series[object] | +| decimal128:standard | [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] | +| decimal128:nullable | [Decimal('1.23'), None, Decimal('4.56')]@Series[object] | +| date32:standard | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | +| date32:nullable | [datetime.date(2024, 1, 1), None]@Series[object] | +| date64:standard | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | +| date64:nullable | [datetime.date(2024, 1, 1), None]@Series[object] | +| timestamp[s]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] | +| timestamp[s]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] | +| timestamp[ms]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] | +| timestamp[ms]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] | +| timestamp[us]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] | +| timestamp[us]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] | +| timestamp[ns]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] | +| timestamp[ns]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] | +| timestamp[us,tz=UTC]:standard | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] | +| timestamp[us,tz=UTC]:nullable | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] | +| duration[s]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] | +| duration[s]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] | +| duration[ms]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] | +| duration[ms]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] | +| duration[us]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] | +| duration[us]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] | +| duration[ns]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] | +| duration[ns]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] | +| time32[s]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time32[s]:nullable | [datetime.time(12, 30), None]@Series[object] | +| time32[ms]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time32[ms]:nullable | [datetime.time(12, 30), None]@Series[object] | +| time64[us]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time64[us]:nullable | [datetime.time(12, 30), None]@Series[object] | +| time64[ns]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time64[ns]:nullable | [datetime.time(12, 30), None]@Series[object] | +| null:standard | [None, None, None]@Series[object] | +| list:standard | [array([1, 2]), array([3, 4, 5])]@Series[object] | +| list:nullable | [array([1, 2]), None, array([3])]@Series[object] | +| list:standard | [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] | +| large_list:standard | [array([1, 2]), array([3, 4])]@Series[object] | +| fixed_size_list[3]:standard | [array([1, 2, 3]), array([4, 5, 6])]@Series[object] | +| struct:standard | [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] | +| struct:nullable | [{'x': 1, 'y': 'a'}, None]@Series[object] | +| map:standard | [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] | +| list>:standard | [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object] | +| list:standard | [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object] | +| list>:standard | [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object] | +| struct:standard | [{'outer': {'inner': 1}}, {'outer': {'inner': 2}}]@Series[object] | +| struct>:standard | [{'items': array([1, 2, 3])}, {'items': array([4, 5])}]@Series[object] | +| struct>:standard | [{'mapping': [('a', 1)]}, {'mapping': [('b', 2)]}]@Series[object] | +| map>:standard | [[('a', array([1, 2])), ('b', array([3]))], [('c', array([4, 5, 6]))]]@Series[object] | +| map:standard | [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@Series[object] | +| map>:standard | [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object] | +| dictionary:standard | ['a', 'b', 'a', 'b']@Series[category] | +| dictionary:nullable | ['a', 'b', nan, 'a']@Series[category] | \ No newline at end of file diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py index d35df256c5f7c..c92f0e3e86138 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -296,6 +296,51 @@ def _build_source_arrays(self): [[("a", 1), ("b", 2)], [("c", 3)]], pa.map_(pa.string(), pa.int64()), ) + # list of list (nested list) + sources["list>:standard"] = pa.array( + [[[1, 2], [3]], [[4, 5, 6]]], + pa.list_(pa.list_(pa.int64())), + ) + # list of struct + sources["list:standard"] = pa.array( + [[{"x": 1}, {"x": 2}], [{"x": 3}]], + pa.list_(pa.struct([("x", pa.int64())])), + ) + # list of map + sources["list>:standard"] = pa.array( + [[[("a", 1)], [("b", 2)]], [[("c", 3)]]], + pa.list_(pa.map_(pa.string(), pa.int64())), + ) + # struct of struct + sources["struct:standard"] = pa.array( + [{"outer": {"inner": 1}}, {"outer": {"inner": 2}}], + pa.struct([("outer", pa.struct([("inner", pa.int64())]))]), + ) + # struct of list + sources["struct>:standard"] = pa.array( + [{"items": [1, 2, 3]}, {"items": [4, 5]}], + pa.struct([("items", pa.list_(pa.int64()))]), + ) + # struct of map + sources["struct>:standard"] = pa.array( + [{"mapping": [("a", 1)]}, {"mapping": [("b", 2)]}], + pa.struct([("mapping", pa.map_(pa.string(), pa.int64()))]), + ) + # map with list values + sources["map>:standard"] = pa.array( + [[("a", [1, 2]), ("b", [3])], [("c", [4, 5, 6])]], + pa.map_(pa.string(), pa.list_(pa.int64())), + ) + # map with struct values + sources["map:standard"] = pa.array( + [[("a", {"v": 1}), ("b", {"v": 2})], [("c", {"v": 3})]], + pa.map_(pa.string(), pa.struct([("v", pa.int64())])), + ) + # map of map (map with map values) + sources["map>:standard"] = pa.array( + [[("a", [("x", 1)]), ("b", [("y", 2)])], [("c", [("z", 3)])]], + pa.map_(pa.string(), pa.map_(pa.string(), pa.int64())), + ) # ===================================================================== # Dictionary type From 019b08ecd9c10fa11781d8b1c66bffc6ba3d7664 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 08:23:03 +0000 Subject: [PATCH 04/11] Add empty array cases for all data types in arrow to pandas default test Co-authored-by: Isaac --- ...golden_pyarrow_arrow_to_pandas_default.csv | 38 ++++++++++++++++ .../golden_pyarrow_arrow_to_pandas_default.md | 40 ++++++++++++++++- .../test_pyarrow_arrow_to_pandas_default.py | 45 +++++++++++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv index d7b23fa11aedf..25c5a061b41d5 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv @@ -1,76 +1,113 @@ source to_pandas() int8:standard [0, 1, -1, 127, -128]@Series[int8] int8:nullable [0.0, 1.0, nan]@Series[float64] +int8:empty []@Series[int8] int16:standard [0, 1, -1, 32767, -32768]@Series[int16] int16:nullable [0.0, 1.0, nan]@Series[float64] +int16:empty []@Series[int16] int32:standard [0, 1, -1, 2147483647, -2147483648]@Series[int32] int32:nullable [0.0, 1.0, nan]@Series[float64] +int32:empty []@Series[int32] int64:standard [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] int64:nullable [0.0, 1.0, nan]@Series[float64] +int64:empty []@Series[int64] uint8:standard [0, 1, 255]@Series[uint8] uint8:nullable [0.0, 1.0, nan]@Series[float64] +uint8:empty []@Series[uint8] uint16:standard [0, 1, 65535]@Series[uint16] uint16:nullable [0.0, 1.0, nan]@Series[float64] +uint16:empty []@Series[uint16] uint32:standard [0, 1, 4294967295]@Series[uint32] uint32:nullable [0.0, 1.0, nan]@Series[float64] +uint32:empty []@Series[uint32] uint64:standard [0, 1, 18446744073709551615]@Series[uint64] uint64:nullable [0.0, 1.0, nan]@Series[float64] +uint64:empty []@Series[uint64] float32:standard [0.0, 1.5, -1.5]@Series[float32] float32:nullable [0.0, 1.5, nan]@Series[float32] +float32:empty []@Series[float32] float64:standard [0.0, 1.5, -1.5]@Series[float64] float64:nullable [0.0, 1.5, nan]@Series[float64] float64:special [nan, inf, -inf]@Series[float64] +float64:empty []@Series[float64] bool:standard [True, False, True]@Series[bool] bool:nullable [True, False, None]@Series[object] +bool:empty []@Series[bool] string:standard ['hello', 'world', '']@Series[object] string:nullable ['hello', None, 'world']@Series[object] +string:empty []@Series[object] large_string:standard ['hello', 'world']@Series[object] large_string:nullable ['hello', None]@Series[object] +large_string:empty []@Series[object] binary:standard [b'hello', b'world']@Series[object] binary:nullable [b'hello', None]@Series[object] +binary:empty []@Series[object] large_binary:standard [b'hello', b'world']@Series[object] large_binary:nullable [b'hello', None]@Series[object] +large_binary:empty []@Series[object] decimal128:standard [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] decimal128:nullable [Decimal('1.23'), None, Decimal('4.56')]@Series[object] +decimal128:empty []@Series[object] date32:standard [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] date32:nullable [datetime.date(2024, 1, 1), None]@Series[object] +date32:empty []@Series[object] date64:standard [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] date64:nullable [datetime.date(2024, 1, 1), None]@Series[object] +date64:empty []@Series[object] timestamp[s]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] timestamp[s]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] +timestamp[s]:empty []@Series[datetime64[s]] timestamp[ms]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] timestamp[ms]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] +timestamp[ms]:empty []@Series[datetime64[ms]] timestamp[us]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] timestamp[us]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] +timestamp[us]:empty []@Series[datetime64[us]] timestamp[ns]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] timestamp[ns]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] +timestamp[ns]:empty []@Series[datetime64[ns]] timestamp[us,tz=UTC]:standard [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] timestamp[us,tz=UTC]:nullable [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] +timestamp[us,tz=UTC]:empty []@Series[datetime64[us, UTC]] duration[s]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] duration[s]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] +duration[s]:empty []@Series[timedelta64[s]] duration[ms]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] duration[ms]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] +duration[ms]:empty []@Series[timedelta64[ms]] duration[us]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] duration[us]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] +duration[us]:empty []@Series[timedelta64[us]] duration[ns]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] duration[ns]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] +duration[ns]:empty []@Series[timedelta64[ns]] time32[s]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] time32[s]:nullable [datetime.time(12, 30), None]@Series[object] +time32[s]:empty []@Series[object] time32[ms]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] time32[ms]:nullable [datetime.time(12, 30), None]@Series[object] +time32[ms]:empty []@Series[object] time64[us]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] time64[us]:nullable [datetime.time(12, 30), None]@Series[object] +time64[us]:empty []@Series[object] time64[ns]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] time64[ns]:nullable [datetime.time(12, 30), None]@Series[object] +time64[ns]:empty []@Series[object] null:standard [None, None, None]@Series[object] +null:empty []@Series[object] list:standard [array([1, 2]), array([3, 4, 5])]@Series[object] list:nullable [array([1, 2]), None, array([3])]@Series[object] +list:empty []@Series[object] list:standard [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] large_list:standard [array([1, 2]), array([3, 4])]@Series[object] +large_list:empty []@Series[object] fixed_size_list[3]:standard [array([1, 2, 3]), array([4, 5, 6])]@Series[object] +fixed_size_list[3]:empty []@Series[object] struct:standard [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] struct:nullable [{'x': 1, 'y': 'a'}, None]@Series[object] +struct:empty []@Series[object] map:standard [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] +map:empty []@Series[object] list>:standard [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object] list:standard [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object] list>:standard [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object] @@ -82,3 +119,4 @@ map:standard [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3} map>:standard [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object] dictionary:standard ['a', 'b', 'a', 'b']@Series[category] dictionary:nullable ['a', 'b', nan, 'a']@Series[category] +dictionary:empty []@Series[category] diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md index 9c7f17c344d8e..1c9e207b92dcf 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md @@ -2,76 +2,113 @@ |----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| | int8:standard | [0, 1, -1, 127, -128]@Series[int8] | | int8:nullable | [0.0, 1.0, nan]@Series[float64] | +| int8:empty | []@Series[int8] | | int16:standard | [0, 1, -1, 32767, -32768]@Series[int16] | | int16:nullable | [0.0, 1.0, nan]@Series[float64] | +| int16:empty | []@Series[int16] | | int32:standard | [0, 1, -1, 2147483647, -2147483648]@Series[int32] | | int32:nullable | [0.0, 1.0, nan]@Series[float64] | +| int32:empty | []@Series[int32] | | int64:standard | [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] | | int64:nullable | [0.0, 1.0, nan]@Series[float64] | +| int64:empty | []@Series[int64] | | uint8:standard | [0, 1, 255]@Series[uint8] | | uint8:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint8:empty | []@Series[uint8] | | uint16:standard | [0, 1, 65535]@Series[uint16] | | uint16:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint16:empty | []@Series[uint16] | | uint32:standard | [0, 1, 4294967295]@Series[uint32] | | uint32:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint32:empty | []@Series[uint32] | | uint64:standard | [0, 1, 18446744073709551615]@Series[uint64] | | uint64:nullable | [0.0, 1.0, nan]@Series[float64] | +| uint64:empty | []@Series[uint64] | | float32:standard | [0.0, 1.5, -1.5]@Series[float32] | | float32:nullable | [0.0, 1.5, nan]@Series[float32] | +| float32:empty | []@Series[float32] | | float64:standard | [0.0, 1.5, -1.5]@Series[float64] | | float64:nullable | [0.0, 1.5, nan]@Series[float64] | | float64:special | [nan, inf, -inf]@Series[float64] | +| float64:empty | []@Series[float64] | | bool:standard | [True, False, True]@Series[bool] | | bool:nullable | [True, False, None]@Series[object] | +| bool:empty | []@Series[bool] | | string:standard | ['hello', 'world', '']@Series[object] | | string:nullable | ['hello', None, 'world']@Series[object] | +| string:empty | []@Series[object] | | large_string:standard | ['hello', 'world']@Series[object] | | large_string:nullable | ['hello', None]@Series[object] | +| large_string:empty | []@Series[object] | | binary:standard | [b'hello', b'world']@Series[object] | | binary:nullable | [b'hello', None]@Series[object] | +| binary:empty | []@Series[object] | | large_binary:standard | [b'hello', b'world']@Series[object] | | large_binary:nullable | [b'hello', None]@Series[object] | +| large_binary:empty | []@Series[object] | | decimal128:standard | [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] | | decimal128:nullable | [Decimal('1.23'), None, Decimal('4.56')]@Series[object] | +| decimal128:empty | []@Series[object] | | date32:standard | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | | date32:nullable | [datetime.date(2024, 1, 1), None]@Series[object] | +| date32:empty | []@Series[object] | | date64:standard | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | | date64:nullable | [datetime.date(2024, 1, 1), None]@Series[object] | +| date64:empty | []@Series[object] | | timestamp[s]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] | | timestamp[s]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] | +| timestamp[s]:empty | []@Series[datetime64[s]] | | timestamp[ms]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] | | timestamp[ms]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] | +| timestamp[ms]:empty | []@Series[datetime64[ms]] | | timestamp[us]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] | | timestamp[us]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] | +| timestamp[us]:empty | []@Series[datetime64[us]] | | timestamp[ns]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] | | timestamp[ns]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] | +| timestamp[ns]:empty | []@Series[datetime64[ns]] | | timestamp[us,tz=UTC]:standard | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] | | timestamp[us,tz=UTC]:nullable | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] | +| timestamp[us,tz=UTC]:empty | []@Series[datetime64[us, UTC]] | | duration[s]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] | | duration[s]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] | +| duration[s]:empty | []@Series[timedelta64[s]] | | duration[ms]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] | | duration[ms]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] | +| duration[ms]:empty | []@Series[timedelta64[ms]] | | duration[us]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] | | duration[us]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] | +| duration[us]:empty | []@Series[timedelta64[us]] | | duration[ns]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] | | duration[ns]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] | +| duration[ns]:empty | []@Series[timedelta64[ns]] | | time32[s]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | | time32[s]:nullable | [datetime.time(12, 30), None]@Series[object] | +| time32[s]:empty | []@Series[object] | | time32[ms]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | | time32[ms]:nullable | [datetime.time(12, 30), None]@Series[object] | +| time32[ms]:empty | []@Series[object] | | time64[us]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | | time64[us]:nullable | [datetime.time(12, 30), None]@Series[object] | +| time64[us]:empty | []@Series[object] | | time64[ns]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | | time64[ns]:nullable | [datetime.time(12, 30), None]@Series[object] | +| time64[ns]:empty | []@Series[object] | | null:standard | [None, None, None]@Series[object] | +| null:empty | []@Series[object] | | list:standard | [array([1, 2]), array([3, 4, 5])]@Series[object] | | list:nullable | [array([1, 2]), None, array([3])]@Series[object] | +| list:empty | []@Series[object] | | list:standard | [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] | | large_list:standard | [array([1, 2]), array([3, 4])]@Series[object] | +| large_list:empty | []@Series[object] | | fixed_size_list[3]:standard | [array([1, 2, 3]), array([4, 5, 6])]@Series[object] | +| fixed_size_list[3]:empty | []@Series[object] | | struct:standard | [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] | | struct:nullable | [{'x': 1, 'y': 'a'}, None]@Series[object] | +| struct:empty | []@Series[object] | | map:standard | [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] | +| map:empty | []@Series[object] | | list>:standard | [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object] | | list:standard | [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object] | | list>:standard | [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object] | @@ -82,4 +119,5 @@ | map:standard | [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@Series[object] | | map>:standard | [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object] | | dictionary:standard | ['a', 'b', 'a', 'b']@Series[category] | -| dictionary:nullable | ['a', 'b', nan, 'a']@Series[category] | \ No newline at end of file +| dictionary:nullable | ['a', 'b', nan, 'a']@Series[category] | +| dictionary:empty | []@Series[category] | \ No newline at end of file diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py index c92f0e3e86138..efe01ecd6d233 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -135,6 +135,7 @@ def _build_source_arrays(self): [0, 1, -1, max_val, min_val], pa_type ) sources[f"int{bits}:nullable"] = pa.array([0, 1, None], pa_type) + sources[f"int{bits}:empty"] = pa.array([], pa_type) for bits, pa_type in [ (8, pa.uint8()), @@ -145,23 +146,27 @@ def _build_source_arrays(self): max_val = 2**bits - 1 sources[f"uint{bits}:standard"] = pa.array([0, 1, max_val], pa_type) sources[f"uint{bits}:nullable"] = pa.array([0, 1, None], pa_type) + sources[f"uint{bits}:empty"] = pa.array([], pa_type) # ===================================================================== # Float types # ===================================================================== sources["float32:standard"] = pa.array([0.0, 1.5, -1.5], pa.float32()) sources["float32:nullable"] = pa.array([0.0, 1.5, None], pa.float32()) + sources["float32:empty"] = pa.array([], pa.float32()) sources["float64:standard"] = pa.array([0.0, 1.5, -1.5], pa.float64()) sources["float64:nullable"] = pa.array([0.0, 1.5, None], pa.float64()) sources["float64:special"] = pa.array( [float("nan"), float("inf"), float("-inf")], pa.float64() ) + sources["float64:empty"] = pa.array([], pa.float64()) # ===================================================================== # Boolean # ===================================================================== sources["bool:standard"] = pa.array([True, False, True], pa.bool_()) sources["bool:nullable"] = pa.array([True, False, None], pa.bool_()) + sources["bool:empty"] = pa.array([], pa.bool_()) # ===================================================================== # String types @@ -172,12 +177,14 @@ def _build_source_arrays(self): sources["string:nullable"] = pa.array( ["hello", None, "world"], pa.string() ) + sources["string:empty"] = pa.array([], pa.string()) sources["large_string:standard"] = pa.array( ["hello", "world"], pa.large_string() ) sources["large_string:nullable"] = pa.array( ["hello", None], pa.large_string() ) + sources["large_string:empty"] = pa.array([], pa.large_string()) # ===================================================================== # Binary types @@ -186,12 +193,14 @@ def _build_source_arrays(self): [b"hello", b"world"], pa.binary() ) sources["binary:nullable"] = pa.array([b"hello", None], pa.binary()) + sources["binary:empty"] = pa.array([], pa.binary()) sources["large_binary:standard"] = pa.array( [b"hello", b"world"], pa.large_binary() ) sources["large_binary:nullable"] = pa.array( [b"hello", None], pa.large_binary() ) + sources["large_binary:empty"] = pa.array([], pa.large_binary()) # ===================================================================== # Decimal @@ -203,6 +212,7 @@ def _build_source_arrays(self): sources["decimal128:nullable"] = pa.array( [Decimal("1.23"), None, Decimal("4.56")], pa.decimal128(5, 2) ) + sources["decimal128:empty"] = pa.array([], pa.decimal128(5, 2)) # ===================================================================== # Date types @@ -211,8 +221,10 @@ def _build_source_arrays(self): d2 = datetime.date(2024, 6, 15) sources["date32:standard"] = pa.array([d1, d2], pa.date32()) sources["date32:nullable"] = pa.array([d1, None], pa.date32()) + sources["date32:empty"] = pa.array([], pa.date32()) sources["date64:standard"] = pa.array([d1, d2], pa.date64()) sources["date64:nullable"] = pa.array([d1, None], pa.date64()) + sources["date64:empty"] = pa.array([], pa.date64()) # ===================================================================== # Timestamp types @@ -226,6 +238,9 @@ def _build_source_arrays(self): sources[f"timestamp[{unit}]:nullable"] = pa.array( [dt1, None], pa.timestamp(unit) ) + sources[f"timestamp[{unit}]:empty"] = pa.array( + [], pa.timestamp(unit) + ) # Timestamp with timezone sources["timestamp[us,tz=UTC]:standard"] = pa.array( [dt1, dt2], pa.timestamp("us", tz="UTC") @@ -233,6 +248,9 @@ def _build_source_arrays(self): sources["timestamp[us,tz=UTC]:nullable"] = pa.array( [dt1, None], pa.timestamp("us", tz="UTC") ) + sources["timestamp[us,tz=UTC]:empty"] = pa.array( + [], pa.timestamp("us", tz="UTC") + ) # ===================================================================== # Duration types @@ -246,6 +264,9 @@ def _build_source_arrays(self): sources[f"duration[{unit}]:nullable"] = pa.array( [td1, None], pa.duration(unit) ) + sources[f"duration[{unit}]:empty"] = pa.array( + [], pa.duration(unit) + ) # ===================================================================== # Time types @@ -254,17 +275,22 @@ def _build_source_arrays(self): t2 = datetime.time(18, 45, 30) sources["time32[s]:standard"] = pa.array([t1, t2], pa.time32("s")) sources["time32[s]:nullable"] = pa.array([t1, None], pa.time32("s")) + sources["time32[s]:empty"] = pa.array([], pa.time32("s")) sources["time32[ms]:standard"] = pa.array([t1, t2], pa.time32("ms")) sources["time32[ms]:nullable"] = pa.array([t1, None], pa.time32("ms")) + sources["time32[ms]:empty"] = pa.array([], pa.time32("ms")) sources["time64[us]:standard"] = pa.array([t1, t2], pa.time64("us")) sources["time64[us]:nullable"] = pa.array([t1, None], pa.time64("us")) + sources["time64[us]:empty"] = pa.array([], pa.time64("us")) sources["time64[ns]:standard"] = pa.array([t1, t2], pa.time64("ns")) sources["time64[ns]:nullable"] = pa.array([t1, None], pa.time64("ns")) + sources["time64[ns]:empty"] = pa.array([], pa.time64("ns")) # ===================================================================== # Null type # ===================================================================== sources["null:standard"] = pa.array([None, None, None], pa.null()) + sources["null:empty"] = pa.array([], pa.null()) # ===================================================================== # Nested types @@ -275,15 +301,22 @@ def _build_source_arrays(self): sources["list:nullable"] = pa.array( [[1, 2], None, [3]], pa.list_(pa.int64()) ) + sources["list:empty"] = pa.array([], pa.list_(pa.int64())) sources["list:standard"] = pa.array( [["a", "b"], ["c"]], pa.list_(pa.string()) ) sources["large_list:standard"] = pa.array( [[1, 2], [3, 4]], pa.large_list(pa.int64()) ) + sources["large_list:empty"] = pa.array( + [], pa.large_list(pa.int64()) + ) sources["fixed_size_list[3]:standard"] = pa.array( [[1, 2, 3], [4, 5, 6]], pa.list_(pa.int64(), 3) ) + sources["fixed_size_list[3]:empty"] = pa.array( + [], pa.list_(pa.int64(), 3) + ) sources["struct:standard"] = pa.array( [{"x": 1, "y": "a"}, {"x": 2, "y": "b"}], pa.struct([("x", pa.int64()), ("y", pa.string())]), @@ -292,10 +325,16 @@ def _build_source_arrays(self): [{"x": 1, "y": "a"}, None], pa.struct([("x", pa.int64()), ("y", pa.string())]), ) + sources["struct:empty"] = pa.array( + [], pa.struct([("x", pa.int64()), ("y", pa.string())]) + ) sources["map:standard"] = pa.array( [[("a", 1), ("b", 2)], [("c", 3)]], pa.map_(pa.string(), pa.int64()), ) + sources["map:empty"] = pa.array( + [], pa.map_(pa.string(), pa.int64()) + ) # list of list (nested list) sources["list>:standard"] = pa.array( [[[1, 2], [3]], [[4, 5, 6]]], @@ -357,6 +396,12 @@ def _build_source_arrays(self): pa.array(["a", "b"], pa.string()), ) ) + sources["dictionary:empty"] = ( + pa.DictionaryArray.from_arrays( + pa.array([], pa.int32()), + pa.array([], pa.string()), + ) + ) return sources From e9c770c1b33011ca12802f7f15f148efa09f2593 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 08:30:07 +0000 Subject: [PATCH 05/11] Keep compare_or_generate_golden_matrix in test files; reuse repr_value Move compare_or_generate_golden_matrix back to test files instead of the shared mixin. Add repr_pandas_series_value to GoldenFileTestMixin so repr_value handles pd.Series, removing the need for a custom _repr_result. Co-authored-by: Isaac --- python/pyspark/testing/goldenutils.py | 95 ++++--------------- .../pyarrow/test_pyarrow_array_cast.py | 80 ++++++++++++++++ .../test_pyarrow_arrow_to_pandas_default.py | 90 ++++++++++++------ 3 files changed, 160 insertions(+), 105 deletions(-) diff --git a/python/pyspark/testing/goldenutils.py b/python/pyspark/testing/goldenutils.py index edadbdc77896f..b1e4af599cd16 100644 --- a/python/pyspark/testing/goldenutils.py +++ b/python/pyspark/testing/goldenutils.py @@ -15,8 +15,7 @@ # limitations under the License. # -from typing import Any, Callable, List, Optional -import inspect +from typing import Any, Optional import os import time @@ -346,89 +345,37 @@ def repr_value(cls, value: Any, max_len: int = 32) -> str: if have_pandas and isinstance(value, pd.DataFrame): return cls.repr_pandas_value(value, max_len) + if have_pandas and isinstance(value, pd.Series): + return cls.repr_pandas_series_value(value, max_len) if have_numpy and isinstance(value, np.ndarray): return cls.repr_numpy_value(value, max_len) return cls.repr_python_value(value, max_len) - def compare_or_generate_golden_matrix( - self, - row_names: List[str], - col_names: List[str], - compute_cell: Callable[[str, str], str], - golden_file_prefix: str, - index_name: str = "source \\ target", - overrides: Optional[dict[tuple[str, str], str]] = None, - ) -> None: + @classmethod + def repr_pandas_series_value(cls, value: Any, max_len: int = 32) -> str: """ - Run a matrix of computations and compare against (or generate) a golden file. + Format a pandas Series for golden file. - 1. If SPARK_GENERATE_GOLDEN_FILES=1, compute every cell, build a - DataFrame, and save it as the new golden CSV / Markdown file. - 2. Otherwise, load the existing golden file and assert that every cell - matches the freshly computed value. + Uses tolist() for stable Python-native representation that does not + depend on numpy's string formatting, which can vary across versions. Parameters ---------- - row_names : list[str] - Ordered row labels (becomes the DataFrame index). - col_names : list[str] - Ordered column labels. - compute_cell : (row_name, col_name) -> str - Function that computes the string result for one cell. - golden_file_prefix : str - Prefix for the golden CSV/MD files (without extension). - Files are placed in the same directory as the concrete test file. - index_name : str, default "source \\ target" - Name for the index column in the golden file. - overrides : dict[(row, col) -> str], optional - Version-specific expected values that take precedence over the golden - file. Use this to document known behavioral differences across - library versions (e.g. PyArrow 18 vs 22) directly in the test code, - so that the same golden file works for multiple versions. + value : pd.Series + The pandas Series to represent. + max_len : int, default 32 + Maximum length for the value string portion. 0 means no limit. + + Returns + ------- + str + "python_list_repr@Series[dtype]" """ - generating = self.is_generating_golden() - - test_dir = os.path.dirname(inspect.getfile(type(self))) - golden_csv = os.path.join(test_dir, f"{golden_file_prefix}.csv") - golden_md = os.path.join(test_dir, f"{golden_file_prefix}.md") - - golden = None - if not generating: - golden = self.load_golden_csv(golden_csv) - - errors = [] - results = {} - - for row_name in row_names: - for col_name in col_names: - result = compute_cell(row_name, col_name) - results[(row_name, col_name)] = result - - if not generating: - if overrides and (row_name, col_name) in overrides: - expected = overrides[(row_name, col_name)] - else: - expected = golden.loc[row_name, col_name] - if expected != result: - errors.append( - f"{row_name} -> {col_name}: expected '{expected}', got '{result}'" - ) - - if generating: - import pandas as pd - - index = pd.Index(row_names, name=index_name) - df = pd.DataFrame(index=index) - for col_name in col_names: - df[col_name] = [results[(row, col_name)] for row in row_names] - self.save_golden(df, golden_csv, golden_md) - else: - self.assertEqual( - len(errors), - 0, - f"\n{len(errors)} golden file mismatches:\n" + "\n".join(errors), - ) + v_str = str(value.tolist()).replace("\n", " ") + if max_len > 0: + v_str = v_str[:max_len] + return f"{v_str}@Series[{str(value.dtype)}]" @staticmethod def clean_result(result: str) -> str: diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py index ef041fd2cb476..6c48e4e9461a9 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py @@ -55,10 +55,12 @@ | pa.array(floats, pa.float16()) natively | requires numpy | requires numpy | native | """ +import inspect import os import platform import unittest from decimal import Decimal +from typing import Callable, List, Optional from pyspark.loose_version import LooseVersion from pyspark.testing.utils import ( @@ -132,6 +134,84 @@ def _try_cast(self, src_arr, tgt_type, safe=True): except Exception as e: return f"ERR@{type(e).__name__}" + def compare_or_generate_golden_matrix( + self, + row_names: List[str], + col_names: List[str], + compute_cell: Callable[[str, str], str], + golden_file_prefix: str, + index_name: str = "source \\ target", + overrides: Optional[dict[tuple[str, str], str]] = None, + ) -> None: + """ + Run a matrix of computations and compare against (or generate) a golden file. + + 1. If SPARK_GENERATE_GOLDEN_FILES=1, compute every cell, build a + DataFrame, and save it as the new golden CSV / Markdown file. + 2. Otherwise, load the existing golden file and assert that every cell + matches the freshly computed value. + + Parameters + ---------- + row_names : list[str] + Ordered row labels (becomes the DataFrame index). + col_names : list[str] + Ordered column labels. + compute_cell : (row_name, col_name) -> str + Function that computes the string result for one cell. + golden_file_prefix : str + Prefix for the golden CSV/MD files (without extension). + Files are placed in the same directory as the concrete test file. + index_name : str, default "source \\ target" + Name for the index column in the golden file. + overrides : dict[(row, col) -> str], optional + Version-specific expected values that take precedence over the golden + file. Use this to document known behavioral differences across + library versions (e.g. PyArrow 18 vs 22) directly in the test code, + so that the same golden file works for multiple versions. + """ + generating = self.is_generating_golden() + + test_dir = os.path.dirname(inspect.getfile(type(self))) + golden_csv = os.path.join(test_dir, f"{golden_file_prefix}.csv") + golden_md = os.path.join(test_dir, f"{golden_file_prefix}.md") + + golden = None + if not generating: + golden = self.load_golden_csv(golden_csv) + + errors = [] + results = {} + + for row_name in row_names: + for col_name in col_names: + result = compute_cell(row_name, col_name) + results[(row_name, col_name)] = result + + if not generating: + if overrides and (row_name, col_name) in overrides: + expected = overrides[(row_name, col_name)] + else: + expected = golden.loc[row_name, col_name] + if expected != result: + errors.append( + f"{row_name} -> {col_name}: expected '{expected}', got '{result}'" + ) + + if generating: + import pandas as pd + + index = pd.Index(row_names, name=index_name) + df = pd.DataFrame(index=index) + for col_name in col_names: + df[col_name] = [results[(row, col_name)] for row in row_names] + self.save_golden(df, golden_csv, golden_md) + else: + self.assertEqual( + len(errors), + 0, + f"\n{len(errors)} golden file mismatches:\n" + "\n".join(errors), + ) # ============================================================ diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py index efe01ecd6d233..89d8b035978cd 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -51,9 +51,12 @@ """ import datetime +import inspect +import os import unittest from collections import OrderedDict from decimal import Decimal +from typing import Callable, List, Optional from pyspark.testing.utils import ( have_pyarrow, @@ -79,40 +82,65 @@ class PyArrowArrayToPandasDefaultTests(GoldenFileTestMixin, unittest.TestCase): Each type is tested both without and with null values. """ - @staticmethod - def _repr_result(result, max_len=0): + def compare_or_generate_golden_matrix( + self, + row_names: List[str], + col_names: List[str], + compute_cell: Callable[[str, str], str], + golden_file_prefix: str, + index_name: str = "source \\ target", + overrides: Optional[dict[tuple[str, str], str]] = None, + ) -> None: """ - Format to_pandas() result for golden file comparison. + Run a matrix of computations and compare against (or generate) a golden file. - Uses tolist() for stable Python-native value representation that does - not depend on numpy's string formatting, which can vary across versions. - - Returns - ------- - str - "python_list_repr@result_class[dtype]" - e.g. "[0, 1, -1, 127, -128]@ndarray[int8]" + 1. If SPARK_GENERATE_GOLDEN_FILES=1, compute every cell, build a + DataFrame, and save it as the new golden CSV / Markdown file. + 2. Otherwise, load the existing golden file and assert that every cell + matches the freshly computed value. """ - import numpy as np - import pandas as pd - - if isinstance(result, np.ndarray): - v_str = str(result.tolist()) - type_str = f"ndarray[{str(result.dtype)}]" - elif isinstance(result, pd.Series): - v_str = str(result.tolist()) - type_str = f"Series[{str(result.dtype)}]" - elif isinstance(result, pd.Categorical): - v_str = str(result.tolist()) - type_str = f"Categorical[{str(result.dtype)}]" + generating = self.is_generating_golden() + + test_dir = os.path.dirname(inspect.getfile(type(self))) + golden_csv = os.path.join(test_dir, f"{golden_file_prefix}.csv") + golden_md = os.path.join(test_dir, f"{golden_file_prefix}.md") + + golden = None + if not generating: + golden = self.load_golden_csv(golden_csv) + + errors = [] + results = {} + + for row_name in row_names: + for col_name in col_names: + result = compute_cell(row_name, col_name) + results[(row_name, col_name)] = result + + if not generating: + if overrides and (row_name, col_name) in overrides: + expected = overrides[(row_name, col_name)] + else: + expected = golden.loc[row_name, col_name] + if expected != result: + errors.append( + f"{row_name} -> {col_name}: expected '{expected}', got '{result}'" + ) + + if generating: + import pandas as pd + + index = pd.Index(row_names, name=index_name) + df = pd.DataFrame(index=index) + for col_name in col_names: + df[col_name] = [results[(row, col_name)] for row in row_names] + self.save_golden(df, golden_csv, golden_md) else: - v_str = str(result) - type_str = type(result).__name__ - - v_str = v_str.replace("\n", " ").replace("\r", " ").replace("\t", " ") - if max_len > 0: - v_str = v_str[:max_len] - return f"{v_str}@{type_str}" + self.assertEqual( + len(errors), + 0, + f"\n{len(errors)} golden file mismatches:\n" + "\n".join(errors), + ) def _build_source_arrays(self): """Build an ordered dict of named source PyArrow arrays for testing.""" @@ -415,7 +443,7 @@ def compute_cell(row_name, col_name): arr = sources[row_name] try: result = arr.to_pandas() - return self._repr_result(result, max_len=0) + return self.repr_value(result, max_len=0) except Exception as e: return f"ERR@{type(e).__name__}" From c4eca54bcefdf168f6c3a26be30481f2a86af96c Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 08:39:30 +0000 Subject: [PATCH 06/11] Register test in modules.py; replace OrderedDict with dict Co-authored-by: Isaac --- dev/sparktestsupport/modules.py | 1 + .../upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 070d5ef890b20..04f8fbca922e6 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -510,6 +510,7 @@ def __hash__(self): # unittests for upstream projects "pyspark.tests.upstream.pyarrow.test_pyarrow_array_cast", "pyspark.tests.upstream.pyarrow.test_pyarrow_array_type_inference", + "pyspark.tests.upstream.pyarrow.test_pyarrow_arrow_to_pandas_default", "pyspark.tests.upstream.pyarrow.test_pyarrow_ignore_timezone", "pyspark.tests.upstream.pyarrow.test_pyarrow_scalar_type_coercion", "pyspark.tests.upstream.pyarrow.test_pyarrow_scalar_type_inference", diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py index 89d8b035978cd..575bdbbaa2f22 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -54,7 +54,6 @@ import inspect import os import unittest -from collections import OrderedDict from decimal import Decimal from typing import Callable, List, Optional @@ -146,7 +145,7 @@ def _build_source_arrays(self): """Build an ordered dict of named source PyArrow arrays for testing.""" import pyarrow as pa - sources = OrderedDict() + sources = {} # ===================================================================== # Integer types From e3ff5c7865bb0dc2012684bb75703abd8ddeaf04 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 08:46:00 +0000 Subject: [PATCH 07/11] Rename index to 'case'; add 'input' column for source arrow array Co-authored-by: Isaac --- ...golden_pyarrow_arrow_to_pandas_default.csv | 244 ++++++++--------- .../golden_pyarrow_arrow_to_pandas_default.md | 246 +++++++++--------- .../test_pyarrow_arrow_to_pandas_default.py | 17 +- 3 files changed, 255 insertions(+), 252 deletions(-) diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv index 25c5a061b41d5..62d467114b928 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv @@ -1,122 +1,122 @@ -source to_pandas() -int8:standard [0, 1, -1, 127, -128]@Series[int8] -int8:nullable [0.0, 1.0, nan]@Series[float64] -int8:empty []@Series[int8] -int16:standard [0, 1, -1, 32767, -32768]@Series[int16] -int16:nullable [0.0, 1.0, nan]@Series[float64] -int16:empty []@Series[int16] -int32:standard [0, 1, -1, 2147483647, -2147483648]@Series[int32] -int32:nullable [0.0, 1.0, nan]@Series[float64] -int32:empty []@Series[int32] -int64:standard [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] -int64:nullable [0.0, 1.0, nan]@Series[float64] -int64:empty []@Series[int64] -uint8:standard [0, 1, 255]@Series[uint8] -uint8:nullable [0.0, 1.0, nan]@Series[float64] -uint8:empty []@Series[uint8] -uint16:standard [0, 1, 65535]@Series[uint16] -uint16:nullable [0.0, 1.0, nan]@Series[float64] -uint16:empty []@Series[uint16] -uint32:standard [0, 1, 4294967295]@Series[uint32] -uint32:nullable [0.0, 1.0, nan]@Series[float64] -uint32:empty []@Series[uint32] -uint64:standard [0, 1, 18446744073709551615]@Series[uint64] -uint64:nullable [0.0, 1.0, nan]@Series[float64] -uint64:empty []@Series[uint64] -float32:standard [0.0, 1.5, -1.5]@Series[float32] -float32:nullable [0.0, 1.5, nan]@Series[float32] -float32:empty []@Series[float32] -float64:standard [0.0, 1.5, -1.5]@Series[float64] -float64:nullable [0.0, 1.5, nan]@Series[float64] -float64:special [nan, inf, -inf]@Series[float64] -float64:empty []@Series[float64] -bool:standard [True, False, True]@Series[bool] -bool:nullable [True, False, None]@Series[object] -bool:empty []@Series[bool] -string:standard ['hello', 'world', '']@Series[object] -string:nullable ['hello', None, 'world']@Series[object] -string:empty []@Series[object] -large_string:standard ['hello', 'world']@Series[object] -large_string:nullable ['hello', None]@Series[object] -large_string:empty []@Series[object] -binary:standard [b'hello', b'world']@Series[object] -binary:nullable [b'hello', None]@Series[object] -binary:empty []@Series[object] -large_binary:standard [b'hello', b'world']@Series[object] -large_binary:nullable [b'hello', None]@Series[object] -large_binary:empty []@Series[object] -decimal128:standard [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] -decimal128:nullable [Decimal('1.23'), None, Decimal('4.56')]@Series[object] -decimal128:empty []@Series[object] -date32:standard [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] -date32:nullable [datetime.date(2024, 1, 1), None]@Series[object] -date32:empty []@Series[object] -date64:standard [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] -date64:nullable [datetime.date(2024, 1, 1), None]@Series[object] -date64:empty []@Series[object] -timestamp[s]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] -timestamp[s]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] -timestamp[s]:empty []@Series[datetime64[s]] -timestamp[ms]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] -timestamp[ms]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] -timestamp[ms]:empty []@Series[datetime64[ms]] -timestamp[us]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] -timestamp[us]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] -timestamp[us]:empty []@Series[datetime64[us]] -timestamp[ns]:standard [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] -timestamp[ns]:nullable [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] -timestamp[ns]:empty []@Series[datetime64[ns]] -timestamp[us,tz=UTC]:standard [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] -timestamp[us,tz=UTC]:nullable [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] -timestamp[us,tz=UTC]:empty []@Series[datetime64[us, UTC]] -duration[s]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] -duration[s]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] -duration[s]:empty []@Series[timedelta64[s]] -duration[ms]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] -duration[ms]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] -duration[ms]:empty []@Series[timedelta64[ms]] -duration[us]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] -duration[us]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] -duration[us]:empty []@Series[timedelta64[us]] -duration[ns]:standard [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] -duration[ns]:nullable [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] -duration[ns]:empty []@Series[timedelta64[ns]] -time32[s]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] -time32[s]:nullable [datetime.time(12, 30), None]@Series[object] -time32[s]:empty []@Series[object] -time32[ms]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] -time32[ms]:nullable [datetime.time(12, 30), None]@Series[object] -time32[ms]:empty []@Series[object] -time64[us]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] -time64[us]:nullable [datetime.time(12, 30), None]@Series[object] -time64[us]:empty []@Series[object] -time64[ns]:standard [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] -time64[ns]:nullable [datetime.time(12, 30), None]@Series[object] -time64[ns]:empty []@Series[object] -null:standard [None, None, None]@Series[object] -null:empty []@Series[object] -list:standard [array([1, 2]), array([3, 4, 5])]@Series[object] -list:nullable [array([1, 2]), None, array([3])]@Series[object] -list:empty []@Series[object] -list:standard [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] -large_list:standard [array([1, 2]), array([3, 4])]@Series[object] -large_list:empty []@Series[object] -fixed_size_list[3]:standard [array([1, 2, 3]), array([4, 5, 6])]@Series[object] -fixed_size_list[3]:empty []@Series[object] -struct:standard [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] -struct:nullable [{'x': 1, 'y': 'a'}, None]@Series[object] -struct:empty []@Series[object] -map:standard [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] -map:empty []@Series[object] -list>:standard [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object] -list:standard [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object] -list>:standard [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object] -struct:standard [{'outer': {'inner': 1}}, {'outer': {'inner': 2}}]@Series[object] -struct>:standard [{'items': array([1, 2, 3])}, {'items': array([4, 5])}]@Series[object] -struct>:standard [{'mapping': [('a', 1)]}, {'mapping': [('b', 2)]}]@Series[object] -map>:standard [[('a', array([1, 2])), ('b', array([3]))], [('c', array([4, 5, 6]))]]@Series[object] -map:standard [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@Series[object] -map>:standard [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object] -dictionary:standard ['a', 'b', 'a', 'b']@Series[category] -dictionary:nullable ['a', 'b', nan, 'a']@Series[category] -dictionary:empty []@Series[category] +case input to_pandas() +int8:standard [0, 1, -1, 127, -128]@int8 [0, 1, -1, 127, -128]@Series[int8] +int8:nullable [0, 1, None]@int8 [0.0, 1.0, nan]@Series[float64] +int8:empty []@int8 []@Series[int8] +int16:standard [0, 1, -1, 32767, -32768]@int16 [0, 1, -1, 32767, -32768]@Series[int16] +int16:nullable [0, 1, None]@int16 [0.0, 1.0, nan]@Series[float64] +int16:empty []@int16 []@Series[int16] +int32:standard [0, 1, -1, 2147483647, -2147483648]@int32 [0, 1, -1, 2147483647, -2147483648]@Series[int32] +int32:nullable [0, 1, None]@int32 [0.0, 1.0, nan]@Series[float64] +int32:empty []@int32 []@Series[int32] +int64:standard [0, 1, -1, 9223372036854775807, -9223372036854775808]@int64 [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] +int64:nullable [0, 1, None]@int64 [0.0, 1.0, nan]@Series[float64] +int64:empty []@int64 []@Series[int64] +uint8:standard [0, 1, 255]@uint8 [0, 1, 255]@Series[uint8] +uint8:nullable [0, 1, None]@uint8 [0.0, 1.0, nan]@Series[float64] +uint8:empty []@uint8 []@Series[uint8] +uint16:standard [0, 1, 65535]@uint16 [0, 1, 65535]@Series[uint16] +uint16:nullable [0, 1, None]@uint16 [0.0, 1.0, nan]@Series[float64] +uint16:empty []@uint16 []@Series[uint16] +uint32:standard [0, 1, 4294967295]@uint32 [0, 1, 4294967295]@Series[uint32] +uint32:nullable [0, 1, None]@uint32 [0.0, 1.0, nan]@Series[float64] +uint32:empty []@uint32 []@Series[uint32] +uint64:standard [0, 1, 18446744073709551615]@uint64 [0, 1, 18446744073709551615]@Series[uint64] +uint64:nullable [0, 1, None]@uint64 [0.0, 1.0, nan]@Series[float64] +uint64:empty []@uint64 []@Series[uint64] +float32:standard [0.0, 1.5, -1.5]@float32 [0.0, 1.5, -1.5]@Series[float32] +float32:nullable [0.0, 1.5, None]@float32 [0.0, 1.5, nan]@Series[float32] +float32:empty []@float32 []@Series[float32] +float64:standard [0.0, 1.5, -1.5]@float64 [0.0, 1.5, -1.5]@Series[float64] +float64:nullable [0.0, 1.5, None]@float64 [0.0, 1.5, nan]@Series[float64] +float64:special [nan, inf, -inf]@float64 [nan, inf, -inf]@Series[float64] +float64:empty []@float64 []@Series[float64] +bool:standard [True, False, True]@bool [True, False, True]@Series[bool] +bool:nullable [True, False, None]@bool [True, False, None]@Series[object] +bool:empty []@bool []@Series[bool] +string:standard [hello, world, ]@string ['hello', 'world', '']@Series[object] +string:nullable [hello, None, world]@string ['hello', None, 'world']@Series[object] +string:empty []@string []@Series[object] +large_string:standard [hello, world]@large_string ['hello', 'world']@Series[object] +large_string:nullable [hello, None]@large_string ['hello', None]@Series[object] +large_string:empty []@large_string []@Series[object] +binary:standard [b'hello', b'world']@binary [b'hello', b'world']@Series[object] +binary:nullable [b'hello', None]@binary [b'hello', None]@Series[object] +binary:empty []@binary []@Series[object] +large_binary:standard [b'hello', b'world']@large_binary [b'hello', b'world']@Series[object] +large_binary:nullable [b'hello', None]@large_binary [b'hello', None]@Series[object] +large_binary:empty []@large_binary []@Series[object] +decimal128:standard [1.23, 4.56, -7.89]@decimal128(5, 2) [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] +decimal128:nullable [1.23, None, 4.56]@decimal128(5, 2) [Decimal('1.23'), None, Decimal('4.56')]@Series[object] +decimal128:empty []@decimal128(5, 2) []@Series[object] +date32:standard [2024-01-01, 2024-06-15]@date32[day] [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] +date32:nullable [2024-01-01, None]@date32[day] [datetime.date(2024, 1, 1), None]@Series[object] +date32:empty []@date32[day] []@Series[object] +date64:standard [2024-01-01, 2024-06-15]@date64[ms] [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] +date64:nullable [2024-01-01, None]@date64[ms] [datetime.date(2024, 1, 1), None]@Series[object] +date64:empty []@date64[ms] []@Series[object] +timestamp[s]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[s] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] +timestamp[s]:nullable [2024-01-01 12:00:00, None]@timestamp[s] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] +timestamp[s]:empty []@timestamp[s] []@Series[datetime64[s]] +timestamp[ms]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[ms] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] +timestamp[ms]:nullable [2024-01-01 12:00:00, None]@timestamp[ms] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] +timestamp[ms]:empty []@timestamp[ms] []@Series[datetime64[ms]] +timestamp[us]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[us] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] +timestamp[us]:nullable [2024-01-01 12:00:00, None]@timestamp[us] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] +timestamp[us]:empty []@timestamp[us] []@Series[datetime64[us]] +timestamp[ns]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[ns] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] +timestamp[ns]:nullable [2024-01-01 12:00:00, None]@timestamp[ns] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] +timestamp[ns]:empty []@timestamp[ns] []@Series[datetime64[ns]] +timestamp[us,tz=UTC]:standard [2024-01-01 12:00:00+00:00, 2024-06-15 18:30:00+00:00]@timestamp[us, tz=UTC] [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] +timestamp[us,tz=UTC]:nullable [2024-01-01 12:00:00+00:00, None]@timestamp[us, tz=UTC] [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] +timestamp[us,tz=UTC]:empty []@timestamp[us, tz=UTC] []@Series[datetime64[us, UTC]] +duration[s]:standard [1 day, 0:00:00, 2:30:00]@duration[s] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] +duration[s]:nullable [1 day, 0:00:00, None]@duration[s] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] +duration[s]:empty []@duration[s] []@Series[timedelta64[s]] +duration[ms]:standard [1 day, 0:00:00, 2:30:00]@duration[ms] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] +duration[ms]:nullable [1 day, 0:00:00, None]@duration[ms] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] +duration[ms]:empty []@duration[ms] []@Series[timedelta64[ms]] +duration[us]:standard [1 day, 0:00:00, 2:30:00]@duration[us] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] +duration[us]:nullable [1 day, 0:00:00, None]@duration[us] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] +duration[us]:empty []@duration[us] []@Series[timedelta64[us]] +duration[ns]:standard [1 days 00:00:00, 0 days 02:30:00]@duration[ns] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] +duration[ns]:nullable [1 days 00:00:00, None]@duration[ns] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] +duration[ns]:empty []@duration[ns] []@Series[timedelta64[ns]] +time32[s]:standard [12:30:00, 18:45:30]@time32[s] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time32[s]:nullable [12:30:00, None]@time32[s] [datetime.time(12, 30), None]@Series[object] +time32[s]:empty []@time32[s] []@Series[object] +time32[ms]:standard [12:30:00, 18:45:30]@time32[ms] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time32[ms]:nullable [12:30:00, None]@time32[ms] [datetime.time(12, 30), None]@Series[object] +time32[ms]:empty []@time32[ms] []@Series[object] +time64[us]:standard [12:30:00, 18:45:30]@time64[us] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time64[us]:nullable [12:30:00, None]@time64[us] [datetime.time(12, 30), None]@Series[object] +time64[us]:empty []@time64[us] []@Series[object] +time64[ns]:standard [12:30:00, 18:45:30]@time64[ns] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time64[ns]:nullable [12:30:00, None]@time64[ns] [datetime.time(12, 30), None]@Series[object] +time64[ns]:empty []@time64[ns] []@Series[object] +null:standard [None, None, None]@null [None, None, None]@Series[object] +null:empty []@null []@Series[object] +list:standard [[1, 2], [3, 4, 5]]@list [array([1, 2]), array([3, 4, 5])]@Series[object] +list:nullable [[1, 2], None, [3]]@list [array([1, 2]), None, array([3])]@Series[object] +list:empty []@list []@Series[object] +list:standard [['a', 'b'], ['c']]@list [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] +large_list:standard [[1, 2], [3, 4]]@large_list [array([1, 2]), array([3, 4])]@Series[object] +large_list:empty []@large_list []@Series[object] +fixed_size_list[3]:standard [[1, 2, 3], [4, 5, 6]]@fixed_size_list[3] [array([1, 2, 3]), array([4, 5, 6])]@Series[object] +fixed_size_list[3]:empty []@fixed_size_list[3] []@Series[object] +struct:standard [[('x', 1), ('y', 'a')], [('x', 2), ('y', 'b')]]@struct [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] +struct:nullable [[('x', 1), ('y', 'a')], None]@struct [{'x': 1, 'y': 'a'}, None]@Series[object] +struct:empty []@struct []@Series[object] +map:standard [[('a', 1), ('b', 2)], [('c', 3)]]@map [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] +map:empty []@map []@Series[object] +list>:standard [[[1, 2], [3]], [[4, 5, 6]]]@list> [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object] +list:standard [[{'x': 1}, {'x': 2}], [{'x': 3}]]@list> [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object] +list>:standard [[[('a', 1)], [('b', 2)]], [[('c', 3)]]]@list> [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object] +struct:standard [[('outer', {'inner': 1})], [('outer', {'inner': 2})]]@struct> [{'outer': {'inner': 1}}, {'outer': {'inner': 2}}]@Series[object] +struct>:standard [[('items', [1, 2, 3])], [('items', [4, 5])]]@struct> [{'items': array([1, 2, 3])}, {'items': array([4, 5])}]@Series[object] +struct>:standard [[('mapping', [('a', 1)])], [('mapping', [('b', 2)])]]@struct> [{'mapping': [('a', 1)]}, {'mapping': [('b', 2)]}]@Series[object] +map>:standard [[('a', [1, 2]), ('b', [3])], [('c', [4, 5, 6])]]@map> [[('a', array([1, 2])), ('b', array([3]))], [('c', array([4, 5, 6]))]]@Series[object] +map:standard [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@map> [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@Series[object] +map>:standard [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@map> [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object] +dictionary:standard [a, b, a, b]@dictionary ['a', 'b', 'a', 'b']@Series[category] +dictionary:nullable [a, b, None, a]@dictionary ['a', 'b', nan, 'a']@Series[category] +dictionary:empty []@dictionary []@Series[category] diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md index 1c9e207b92dcf..b31c9349acaee 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md @@ -1,123 +1,123 @@ -| source | to_pandas() | -|----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| -| int8:standard | [0, 1, -1, 127, -128]@Series[int8] | -| int8:nullable | [0.0, 1.0, nan]@Series[float64] | -| int8:empty | []@Series[int8] | -| int16:standard | [0, 1, -1, 32767, -32768]@Series[int16] | -| int16:nullable | [0.0, 1.0, nan]@Series[float64] | -| int16:empty | []@Series[int16] | -| int32:standard | [0, 1, -1, 2147483647, -2147483648]@Series[int32] | -| int32:nullable | [0.0, 1.0, nan]@Series[float64] | -| int32:empty | []@Series[int32] | -| int64:standard | [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] | -| int64:nullable | [0.0, 1.0, nan]@Series[float64] | -| int64:empty | []@Series[int64] | -| uint8:standard | [0, 1, 255]@Series[uint8] | -| uint8:nullable | [0.0, 1.0, nan]@Series[float64] | -| uint8:empty | []@Series[uint8] | -| uint16:standard | [0, 1, 65535]@Series[uint16] | -| uint16:nullable | [0.0, 1.0, nan]@Series[float64] | -| uint16:empty | []@Series[uint16] | -| uint32:standard | [0, 1, 4294967295]@Series[uint32] | -| uint32:nullable | [0.0, 1.0, nan]@Series[float64] | -| uint32:empty | []@Series[uint32] | -| uint64:standard | [0, 1, 18446744073709551615]@Series[uint64] | -| uint64:nullable | [0.0, 1.0, nan]@Series[float64] | -| uint64:empty | []@Series[uint64] | -| float32:standard | [0.0, 1.5, -1.5]@Series[float32] | -| float32:nullable | [0.0, 1.5, nan]@Series[float32] | -| float32:empty | []@Series[float32] | -| float64:standard | [0.0, 1.5, -1.5]@Series[float64] | -| float64:nullable | [0.0, 1.5, nan]@Series[float64] | -| float64:special | [nan, inf, -inf]@Series[float64] | -| float64:empty | []@Series[float64] | -| bool:standard | [True, False, True]@Series[bool] | -| bool:nullable | [True, False, None]@Series[object] | -| bool:empty | []@Series[bool] | -| string:standard | ['hello', 'world', '']@Series[object] | -| string:nullable | ['hello', None, 'world']@Series[object] | -| string:empty | []@Series[object] | -| large_string:standard | ['hello', 'world']@Series[object] | -| large_string:nullable | ['hello', None]@Series[object] | -| large_string:empty | []@Series[object] | -| binary:standard | [b'hello', b'world']@Series[object] | -| binary:nullable | [b'hello', None]@Series[object] | -| binary:empty | []@Series[object] | -| large_binary:standard | [b'hello', b'world']@Series[object] | -| large_binary:nullable | [b'hello', None]@Series[object] | -| large_binary:empty | []@Series[object] | -| decimal128:standard | [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] | -| decimal128:nullable | [Decimal('1.23'), None, Decimal('4.56')]@Series[object] | -| decimal128:empty | []@Series[object] | -| date32:standard | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | -| date32:nullable | [datetime.date(2024, 1, 1), None]@Series[object] | -| date32:empty | []@Series[object] | -| date64:standard | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | -| date64:nullable | [datetime.date(2024, 1, 1), None]@Series[object] | -| date64:empty | []@Series[object] | -| timestamp[s]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] | -| timestamp[s]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] | -| timestamp[s]:empty | []@Series[datetime64[s]] | -| timestamp[ms]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] | -| timestamp[ms]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] | -| timestamp[ms]:empty | []@Series[datetime64[ms]] | -| timestamp[us]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] | -| timestamp[us]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] | -| timestamp[us]:empty | []@Series[datetime64[us]] | -| timestamp[ns]:standard | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] | -| timestamp[ns]:nullable | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] | -| timestamp[ns]:empty | []@Series[datetime64[ns]] | -| timestamp[us,tz=UTC]:standard | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] | -| timestamp[us,tz=UTC]:nullable | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] | -| timestamp[us,tz=UTC]:empty | []@Series[datetime64[us, UTC]] | -| duration[s]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] | -| duration[s]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] | -| duration[s]:empty | []@Series[timedelta64[s]] | -| duration[ms]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] | -| duration[ms]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] | -| duration[ms]:empty | []@Series[timedelta64[ms]] | -| duration[us]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] | -| duration[us]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] | -| duration[us]:empty | []@Series[timedelta64[us]] | -| duration[ns]:standard | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] | -| duration[ns]:nullable | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] | -| duration[ns]:empty | []@Series[timedelta64[ns]] | -| time32[s]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | -| time32[s]:nullable | [datetime.time(12, 30), None]@Series[object] | -| time32[s]:empty | []@Series[object] | -| time32[ms]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | -| time32[ms]:nullable | [datetime.time(12, 30), None]@Series[object] | -| time32[ms]:empty | []@Series[object] | -| time64[us]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | -| time64[us]:nullable | [datetime.time(12, 30), None]@Series[object] | -| time64[us]:empty | []@Series[object] | -| time64[ns]:standard | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | -| time64[ns]:nullable | [datetime.time(12, 30), None]@Series[object] | -| time64[ns]:empty | []@Series[object] | -| null:standard | [None, None, None]@Series[object] | -| null:empty | []@Series[object] | -| list:standard | [array([1, 2]), array([3, 4, 5])]@Series[object] | -| list:nullable | [array([1, 2]), None, array([3])]@Series[object] | -| list:empty | []@Series[object] | -| list:standard | [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] | -| large_list:standard | [array([1, 2]), array([3, 4])]@Series[object] | -| large_list:empty | []@Series[object] | -| fixed_size_list[3]:standard | [array([1, 2, 3]), array([4, 5, 6])]@Series[object] | -| fixed_size_list[3]:empty | []@Series[object] | -| struct:standard | [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] | -| struct:nullable | [{'x': 1, 'y': 'a'}, None]@Series[object] | -| struct:empty | []@Series[object] | -| map:standard | [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] | -| map:empty | []@Series[object] | -| list>:standard | [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object] | -| list:standard | [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object] | -| list>:standard | [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object] | -| struct:standard | [{'outer': {'inner': 1}}, {'outer': {'inner': 2}}]@Series[object] | -| struct>:standard | [{'items': array([1, 2, 3])}, {'items': array([4, 5])}]@Series[object] | -| struct>:standard | [{'mapping': [('a', 1)]}, {'mapping': [('b', 2)]}]@Series[object] | -| map>:standard | [[('a', array([1, 2])), ('b', array([3]))], [('c', array([4, 5, 6]))]]@Series[object] | -| map:standard | [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@Series[object] | -| map>:standard | [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object] | -| dictionary:standard | ['a', 'b', 'a', 'b']@Series[category] | -| dictionary:nullable | ['a', 'b', nan, 'a']@Series[category] | -| dictionary:empty | []@Series[category] | \ No newline at end of file +| case | input | to_pandas() | +|----------------------------------------|-----------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| int8:standard | [0, 1, -1, 127, -128]@int8 | [0, 1, -1, 127, -128]@Series[int8] | +| int8:nullable | [0, 1, None]@int8 | [0.0, 1.0, nan]@Series[float64] | +| int8:empty | []@int8 | []@Series[int8] | +| int16:standard | [0, 1, -1, 32767, -32768]@int16 | [0, 1, -1, 32767, -32768]@Series[int16] | +| int16:nullable | [0, 1, None]@int16 | [0.0, 1.0, nan]@Series[float64] | +| int16:empty | []@int16 | []@Series[int16] | +| int32:standard | [0, 1, -1, 2147483647, -2147483648]@int32 | [0, 1, -1, 2147483647, -2147483648]@Series[int32] | +| int32:nullable | [0, 1, None]@int32 | [0.0, 1.0, nan]@Series[float64] | +| int32:empty | []@int32 | []@Series[int32] | +| int64:standard | [0, 1, -1, 9223372036854775807, -9223372036854775808]@int64 | [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] | +| int64:nullable | [0, 1, None]@int64 | [0.0, 1.0, nan]@Series[float64] | +| int64:empty | []@int64 | []@Series[int64] | +| uint8:standard | [0, 1, 255]@uint8 | [0, 1, 255]@Series[uint8] | +| uint8:nullable | [0, 1, None]@uint8 | [0.0, 1.0, nan]@Series[float64] | +| uint8:empty | []@uint8 | []@Series[uint8] | +| uint16:standard | [0, 1, 65535]@uint16 | [0, 1, 65535]@Series[uint16] | +| uint16:nullable | [0, 1, None]@uint16 | [0.0, 1.0, nan]@Series[float64] | +| uint16:empty | []@uint16 | []@Series[uint16] | +| uint32:standard | [0, 1, 4294967295]@uint32 | [0, 1, 4294967295]@Series[uint32] | +| uint32:nullable | [0, 1, None]@uint32 | [0.0, 1.0, nan]@Series[float64] | +| uint32:empty | []@uint32 | []@Series[uint32] | +| uint64:standard | [0, 1, 18446744073709551615]@uint64 | [0, 1, 18446744073709551615]@Series[uint64] | +| uint64:nullable | [0, 1, None]@uint64 | [0.0, 1.0, nan]@Series[float64] | +| uint64:empty | []@uint64 | []@Series[uint64] | +| float32:standard | [0.0, 1.5, -1.5]@float32 | [0.0, 1.5, -1.5]@Series[float32] | +| float32:nullable | [0.0, 1.5, None]@float32 | [0.0, 1.5, nan]@Series[float32] | +| float32:empty | []@float32 | []@Series[float32] | +| float64:standard | [0.0, 1.5, -1.5]@float64 | [0.0, 1.5, -1.5]@Series[float64] | +| float64:nullable | [0.0, 1.5, None]@float64 | [0.0, 1.5, nan]@Series[float64] | +| float64:special | [nan, inf, -inf]@float64 | [nan, inf, -inf]@Series[float64] | +| float64:empty | []@float64 | []@Series[float64] | +| bool:standard | [True, False, True]@bool | [True, False, True]@Series[bool] | +| bool:nullable | [True, False, None]@bool | [True, False, None]@Series[object] | +| bool:empty | []@bool | []@Series[bool] | +| string:standard | [hello, world, ]@string | ['hello', 'world', '']@Series[object] | +| string:nullable | [hello, None, world]@string | ['hello', None, 'world']@Series[object] | +| string:empty | []@string | []@Series[object] | +| large_string:standard | [hello, world]@large_string | ['hello', 'world']@Series[object] | +| large_string:nullable | [hello, None]@large_string | ['hello', None]@Series[object] | +| large_string:empty | []@large_string | []@Series[object] | +| binary:standard | [b'hello', b'world']@binary | [b'hello', b'world']@Series[object] | +| binary:nullable | [b'hello', None]@binary | [b'hello', None]@Series[object] | +| binary:empty | []@binary | []@Series[object] | +| large_binary:standard | [b'hello', b'world']@large_binary | [b'hello', b'world']@Series[object] | +| large_binary:nullable | [b'hello', None]@large_binary | [b'hello', None]@Series[object] | +| large_binary:empty | []@large_binary | []@Series[object] | +| decimal128:standard | [1.23, 4.56, -7.89]@decimal128(5, 2) | [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] | +| decimal128:nullable | [1.23, None, 4.56]@decimal128(5, 2) | [Decimal('1.23'), None, Decimal('4.56')]@Series[object] | +| decimal128:empty | []@decimal128(5, 2) | []@Series[object] | +| date32:standard | [2024-01-01, 2024-06-15]@date32[day] | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | +| date32:nullable | [2024-01-01, None]@date32[day] | [datetime.date(2024, 1, 1), None]@Series[object] | +| date32:empty | []@date32[day] | []@Series[object] | +| date64:standard | [2024-01-01, 2024-06-15]@date64[ms] | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | +| date64:nullable | [2024-01-01, None]@date64[ms] | [datetime.date(2024, 1, 1), None]@Series[object] | +| date64:empty | []@date64[ms] | []@Series[object] | +| timestamp[s]:standard | [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[s] | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] | +| timestamp[s]:nullable | [2024-01-01 12:00:00, None]@timestamp[s] | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] | +| timestamp[s]:empty | []@timestamp[s] | []@Series[datetime64[s]] | +| timestamp[ms]:standard | [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[ms] | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] | +| timestamp[ms]:nullable | [2024-01-01 12:00:00, None]@timestamp[ms] | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] | +| timestamp[ms]:empty | []@timestamp[ms] | []@Series[datetime64[ms]] | +| timestamp[us]:standard | [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[us] | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] | +| timestamp[us]:nullable | [2024-01-01 12:00:00, None]@timestamp[us] | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] | +| timestamp[us]:empty | []@timestamp[us] | []@Series[datetime64[us]] | +| timestamp[ns]:standard | [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[ns] | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] | +| timestamp[ns]:nullable | [2024-01-01 12:00:00, None]@timestamp[ns] | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] | +| timestamp[ns]:empty | []@timestamp[ns] | []@Series[datetime64[ns]] | +| timestamp[us,tz=UTC]:standard | [2024-01-01 12:00:00+00:00, 2024-06-15 18:30:00+00:00]@timestamp[us, tz=UTC] | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] | +| timestamp[us,tz=UTC]:nullable | [2024-01-01 12:00:00+00:00, None]@timestamp[us, tz=UTC] | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] | +| timestamp[us,tz=UTC]:empty | []@timestamp[us, tz=UTC] | []@Series[datetime64[us, UTC]] | +| duration[s]:standard | [1 day, 0:00:00, 2:30:00]@duration[s] | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] | +| duration[s]:nullable | [1 day, 0:00:00, None]@duration[s] | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] | +| duration[s]:empty | []@duration[s] | []@Series[timedelta64[s]] | +| duration[ms]:standard | [1 day, 0:00:00, 2:30:00]@duration[ms] | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] | +| duration[ms]:nullable | [1 day, 0:00:00, None]@duration[ms] | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] | +| duration[ms]:empty | []@duration[ms] | []@Series[timedelta64[ms]] | +| duration[us]:standard | [1 day, 0:00:00, 2:30:00]@duration[us] | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] | +| duration[us]:nullable | [1 day, 0:00:00, None]@duration[us] | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] | +| duration[us]:empty | []@duration[us] | []@Series[timedelta64[us]] | +| duration[ns]:standard | [1 days 00:00:00, 0 days 02:30:00]@duration[ns] | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] | +| duration[ns]:nullable | [1 days 00:00:00, None]@duration[ns] | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] | +| duration[ns]:empty | []@duration[ns] | []@Series[timedelta64[ns]] | +| time32[s]:standard | [12:30:00, 18:45:30]@time32[s] | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time32[s]:nullable | [12:30:00, None]@time32[s] | [datetime.time(12, 30), None]@Series[object] | +| time32[s]:empty | []@time32[s] | []@Series[object] | +| time32[ms]:standard | [12:30:00, 18:45:30]@time32[ms] | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time32[ms]:nullable | [12:30:00, None]@time32[ms] | [datetime.time(12, 30), None]@Series[object] | +| time32[ms]:empty | []@time32[ms] | []@Series[object] | +| time64[us]:standard | [12:30:00, 18:45:30]@time64[us] | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time64[us]:nullable | [12:30:00, None]@time64[us] | [datetime.time(12, 30), None]@Series[object] | +| time64[us]:empty | []@time64[us] | []@Series[object] | +| time64[ns]:standard | [12:30:00, 18:45:30]@time64[ns] | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time64[ns]:nullable | [12:30:00, None]@time64[ns] | [datetime.time(12, 30), None]@Series[object] | +| time64[ns]:empty | []@time64[ns] | []@Series[object] | +| null:standard | [None, None, None]@null | [None, None, None]@Series[object] | +| null:empty | []@null | []@Series[object] | +| list:standard | [[1, 2], [3, 4, 5]]@list | [array([1, 2]), array([3, 4, 5])]@Series[object] | +| list:nullable | [[1, 2], None, [3]]@list | [array([1, 2]), None, array([3])]@Series[object] | +| list:empty | []@list | []@Series[object] | +| list:standard | [['a', 'b'], ['c']]@list | [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] | +| large_list:standard | [[1, 2], [3, 4]]@large_list | [array([1, 2]), array([3, 4])]@Series[object] | +| large_list:empty | []@large_list | []@Series[object] | +| fixed_size_list[3]:standard | [[1, 2, 3], [4, 5, 6]]@fixed_size_list[3] | [array([1, 2, 3]), array([4, 5, 6])]@Series[object] | +| fixed_size_list[3]:empty | []@fixed_size_list[3] | []@Series[object] | +| struct:standard | [[('x', 1), ('y', 'a')], [('x', 2), ('y', 'b')]]@struct | [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] | +| struct:nullable | [[('x', 1), ('y', 'a')], None]@struct | [{'x': 1, 'y': 'a'}, None]@Series[object] | +| struct:empty | []@struct | []@Series[object] | +| map:standard | [[('a', 1), ('b', 2)], [('c', 3)]]@map | [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] | +| map:empty | []@map | []@Series[object] | +| list>:standard | [[[1, 2], [3]], [[4, 5, 6]]]@list> | [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object] | +| list:standard | [[{'x': 1}, {'x': 2}], [{'x': 3}]]@list> | [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object] | +| list>:standard | [[[('a', 1)], [('b', 2)]], [[('c', 3)]]]@list> | [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object] | +| struct:standard | [[('outer', {'inner': 1})], [('outer', {'inner': 2})]]@struct> | [{'outer': {'inner': 1}}, {'outer': {'inner': 2}}]@Series[object] | +| struct>:standard | [[('items', [1, 2, 3])], [('items', [4, 5])]]@struct> | [{'items': array([1, 2, 3])}, {'items': array([4, 5])}]@Series[object] | +| struct>:standard | [[('mapping', [('a', 1)])], [('mapping', [('b', 2)])]]@struct> | [{'mapping': [('a', 1)]}, {'mapping': [('b', 2)]}]@Series[object] | +| map>:standard | [[('a', [1, 2]), ('b', [3])], [('c', [4, 5, 6])]]@map> | [[('a', array([1, 2])), ('b', array([3]))], [('c', array([4, 5, 6]))]]@Series[object] | +| map:standard | [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@map> | [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@Series[object] | +| map>:standard | [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@map> | [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object] | +| dictionary:standard | [a, b, a, b]@dictionary | ['a', 'b', 'a', 'b']@Series[category] | +| dictionary:nullable | [a, b, None, a]@dictionary | ['a', 'b', nan, 'a']@Series[category] | +| dictionary:empty | []@dictionary | []@Series[category] | \ No newline at end of file diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py index 575bdbbaa2f22..7e072c5057b53 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -436,22 +436,25 @@ def test_to_pandas_default(self): """Test pa.Array.to_pandas() with default arguments against golden file.""" sources = self._build_source_arrays() row_names = list(sources.keys()) - col_names = ["to_pandas()"] + col_names = ["input", "to_pandas()"] def compute_cell(row_name, col_name): arr = sources[row_name] - try: - result = arr.to_pandas() - return self.repr_value(result, max_len=0) - except Exception as e: - return f"ERR@{type(e).__name__}" + if col_name == "input": + return self.repr_value(arr, max_len=0) + else: + try: + result = arr.to_pandas() + return self.repr_value(result, max_len=0) + except Exception as e: + return f"ERR@{type(e).__name__}" self.compare_or_generate_golden_matrix( row_names=row_names, col_names=col_names, compute_cell=compute_cell, golden_file_prefix="golden_pyarrow_arrow_to_pandas_default", - index_name="source", + index_name="case", ) From 0b54db0069ef382a3f3bc7728d7d9d8e3841332d Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 08:48:15 +0000 Subject: [PATCH 08/11] Rename 'input' column to 'value' in golden file Co-authored-by: Isaac --- .../pyarrow/golden_pyarrow_arrow_to_pandas_default.csv | 2 +- .../pyarrow/golden_pyarrow_arrow_to_pandas_default.md | 2 +- .../upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv index 62d467114b928..0309e1dd9cf25 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv @@ -1,4 +1,4 @@ -case input to_pandas() +case value to_pandas() int8:standard [0, 1, -1, 127, -128]@int8 [0, 1, -1, 127, -128]@Series[int8] int8:nullable [0, 1, None]@int8 [0.0, 1.0, nan]@Series[float64] int8:empty []@int8 []@Series[int8] diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md index b31c9349acaee..100ded3daa7be 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md @@ -1,4 +1,4 @@ -| case | input | to_pandas() | +| case | value | to_pandas() | |----------------------------------------|-----------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| | int8:standard | [0, 1, -1, 127, -128]@int8 | [0, 1, -1, 127, -128]@Series[int8] | | int8:nullable | [0, 1, None]@int8 | [0.0, 1.0, nan]@Series[float64] | diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py index 7e072c5057b53..62f46ce4c95c9 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -436,11 +436,11 @@ def test_to_pandas_default(self): """Test pa.Array.to_pandas() with default arguments against golden file.""" sources = self._build_source_arrays() row_names = list(sources.keys()) - col_names = ["input", "to_pandas()"] + col_names = ["value", "to_pandas()"] def compute_cell(row_name, col_name): arr = sources[row_name] - if col_name == "input": + if col_name == "value": return self.repr_value(arr, max_len=0) else: try: From 5739e098c7201faab00e18121319237c1d182aba Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 08:49:18 +0000 Subject: [PATCH 09/11] Rename columns to 'case', 'pyarrow array', 'pandas series' Co-authored-by: Isaac --- .../pyarrow/golden_pyarrow_arrow_to_pandas_default.csv | 2 +- .../pyarrow/golden_pyarrow_arrow_to_pandas_default.md | 2 +- .../upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv index 0309e1dd9cf25..fb08f60f284b1 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv @@ -1,4 +1,4 @@ -case value to_pandas() +case pyarrow array pandas series int8:standard [0, 1, -1, 127, -128]@int8 [0, 1, -1, 127, -128]@Series[int8] int8:nullable [0, 1, None]@int8 [0.0, 1.0, nan]@Series[float64] int8:empty []@int8 []@Series[int8] diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md index 100ded3daa7be..a9d74b9265682 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md @@ -1,4 +1,4 @@ -| case | value | to_pandas() | +| case | pyarrow array | pandas series | |----------------------------------------|-----------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| | int8:standard | [0, 1, -1, 127, -128]@int8 | [0, 1, -1, 127, -128]@Series[int8] | | int8:nullable | [0, 1, None]@int8 | [0.0, 1.0, nan]@Series[float64] | diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py index 62f46ce4c95c9..fa1e14198bee9 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -436,11 +436,11 @@ def test_to_pandas_default(self): """Test pa.Array.to_pandas() with default arguments against golden file.""" sources = self._build_source_arrays() row_names = list(sources.keys()) - col_names = ["value", "to_pandas()"] + col_names = ["pyarrow array", "pandas series"] def compute_cell(row_name, col_name): arr = sources[row_name] - if col_name == "value": + if col_name == "pyarrow array": return self.repr_value(arr, max_len=0) else: try: From 633e0a780f6d575169579eba18f89a7672083b2f Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 08:49:45 +0000 Subject: [PATCH 10/11] Rename index column from 'case' to 'test case' Co-authored-by: Isaac --- .../upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv | 2 +- .../upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md | 2 +- .../upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv index fb08f60f284b1..13450e5634edf 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.csv @@ -1,4 +1,4 @@ -case pyarrow array pandas series +test case pyarrow array pandas series int8:standard [0, 1, -1, 127, -128]@int8 [0, 1, -1, 127, -128]@Series[int8] int8:nullable [0, 1, None]@int8 [0.0, 1.0, nan]@Series[float64] int8:empty []@int8 []@Series[int8] diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md index a9d74b9265682..04debdc77e03b 100644 --- a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md @@ -1,4 +1,4 @@ -| case | pyarrow array | pandas series | +| test case | pyarrow array | pandas series | |----------------------------------------|-----------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| | int8:standard | [0, 1, -1, 127, -128]@int8 | [0, 1, -1, 127, -128]@Series[int8] | | int8:nullable | [0, 1, None]@int8 | [0.0, 1.0, nan]@Series[float64] | diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py index fa1e14198bee9..f1f7da8b06597 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -454,7 +454,7 @@ def compute_cell(row_name, col_name): col_names=col_names, compute_cell=compute_cell, golden_file_prefix="golden_pyarrow_arrow_to_pandas_default", - index_name="case", + index_name="test case", ) From 93de2388daa24248c04b1721835c53733ff63100 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Mar 2026 10:34:02 +0000 Subject: [PATCH 11/11] Reformat with ruff Co-authored-by: Isaac --- .../test_pyarrow_arrow_to_pandas_default.py | 112 +++++------------- 1 file changed, 31 insertions(+), 81 deletions(-) diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py index f1f7da8b06597..8caac302f2e6c 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -158,9 +158,7 @@ def _build_source_arrays(self): ]: max_val = 2 ** (bits - 1) - 1 min_val = -(2 ** (bits - 1)) - sources[f"int{bits}:standard"] = pa.array( - [0, 1, -1, max_val, min_val], pa_type - ) + sources[f"int{bits}:standard"] = pa.array([0, 1, -1, max_val, min_val], pa_type) sources[f"int{bits}:nullable"] = pa.array([0, 1, None], pa_type) sources[f"int{bits}:empty"] = pa.array([], pa_type) @@ -198,35 +196,21 @@ def _build_source_arrays(self): # ===================================================================== # String types # ===================================================================== - sources["string:standard"] = pa.array( - ["hello", "world", ""], pa.string() - ) - sources["string:nullable"] = pa.array( - ["hello", None, "world"], pa.string() - ) + sources["string:standard"] = pa.array(["hello", "world", ""], pa.string()) + sources["string:nullable"] = pa.array(["hello", None, "world"], pa.string()) sources["string:empty"] = pa.array([], pa.string()) - sources["large_string:standard"] = pa.array( - ["hello", "world"], pa.large_string() - ) - sources["large_string:nullable"] = pa.array( - ["hello", None], pa.large_string() - ) + sources["large_string:standard"] = pa.array(["hello", "world"], pa.large_string()) + sources["large_string:nullable"] = pa.array(["hello", None], pa.large_string()) sources["large_string:empty"] = pa.array([], pa.large_string()) # ===================================================================== # Binary types # ===================================================================== - sources["binary:standard"] = pa.array( - [b"hello", b"world"], pa.binary() - ) + sources["binary:standard"] = pa.array([b"hello", b"world"], pa.binary()) sources["binary:nullable"] = pa.array([b"hello", None], pa.binary()) sources["binary:empty"] = pa.array([], pa.binary()) - sources["large_binary:standard"] = pa.array( - [b"hello", b"world"], pa.large_binary() - ) - sources["large_binary:nullable"] = pa.array( - [b"hello", None], pa.large_binary() - ) + sources["large_binary:standard"] = pa.array([b"hello", b"world"], pa.large_binary()) + sources["large_binary:nullable"] = pa.array([b"hello", None], pa.large_binary()) sources["large_binary:empty"] = pa.array([], pa.large_binary()) # ===================================================================== @@ -259,15 +243,9 @@ def _build_source_arrays(self): dt1 = datetime.datetime(2024, 1, 1, 12, 0, 0) dt2 = datetime.datetime(2024, 6, 15, 18, 30, 0) for unit in ["s", "ms", "us", "ns"]: - sources[f"timestamp[{unit}]:standard"] = pa.array( - [dt1, dt2], pa.timestamp(unit) - ) - sources[f"timestamp[{unit}]:nullable"] = pa.array( - [dt1, None], pa.timestamp(unit) - ) - sources[f"timestamp[{unit}]:empty"] = pa.array( - [], pa.timestamp(unit) - ) + sources[f"timestamp[{unit}]:standard"] = pa.array([dt1, dt2], pa.timestamp(unit)) + sources[f"timestamp[{unit}]:nullable"] = pa.array([dt1, None], pa.timestamp(unit)) + sources[f"timestamp[{unit}]:empty"] = pa.array([], pa.timestamp(unit)) # Timestamp with timezone sources["timestamp[us,tz=UTC]:standard"] = pa.array( [dt1, dt2], pa.timestamp("us", tz="UTC") @@ -275,9 +253,7 @@ def _build_source_arrays(self): sources["timestamp[us,tz=UTC]:nullable"] = pa.array( [dt1, None], pa.timestamp("us", tz="UTC") ) - sources["timestamp[us,tz=UTC]:empty"] = pa.array( - [], pa.timestamp("us", tz="UTC") - ) + sources["timestamp[us,tz=UTC]:empty"] = pa.array([], pa.timestamp("us", tz="UTC")) # ===================================================================== # Duration types @@ -285,15 +261,9 @@ def _build_source_arrays(self): td1 = datetime.timedelta(days=1) td2 = datetime.timedelta(hours=2, minutes=30) for unit in ["s", "ms", "us", "ns"]: - sources[f"duration[{unit}]:standard"] = pa.array( - [td1, td2], pa.duration(unit) - ) - sources[f"duration[{unit}]:nullable"] = pa.array( - [td1, None], pa.duration(unit) - ) - sources[f"duration[{unit}]:empty"] = pa.array( - [], pa.duration(unit) - ) + sources[f"duration[{unit}]:standard"] = pa.array([td1, td2], pa.duration(unit)) + sources[f"duration[{unit}]:nullable"] = pa.array([td1, None], pa.duration(unit)) + sources[f"duration[{unit}]:empty"] = pa.array([], pa.duration(unit)) # ===================================================================== # Time types @@ -322,28 +292,18 @@ def _build_source_arrays(self): # ===================================================================== # Nested types # ===================================================================== - sources["list:standard"] = pa.array( - [[1, 2], [3, 4, 5]], pa.list_(pa.int64()) - ) - sources["list:nullable"] = pa.array( - [[1, 2], None, [3]], pa.list_(pa.int64()) - ) + sources["list:standard"] = pa.array([[1, 2], [3, 4, 5]], pa.list_(pa.int64())) + sources["list:nullable"] = pa.array([[1, 2], None, [3]], pa.list_(pa.int64())) sources["list:empty"] = pa.array([], pa.list_(pa.int64())) - sources["list:standard"] = pa.array( - [["a", "b"], ["c"]], pa.list_(pa.string()) - ) + sources["list:standard"] = pa.array([["a", "b"], ["c"]], pa.list_(pa.string())) sources["large_list:standard"] = pa.array( [[1, 2], [3, 4]], pa.large_list(pa.int64()) ) - sources["large_list:empty"] = pa.array( - [], pa.large_list(pa.int64()) - ) + sources["large_list:empty"] = pa.array([], pa.large_list(pa.int64())) sources["fixed_size_list[3]:standard"] = pa.array( [[1, 2, 3], [4, 5, 6]], pa.list_(pa.int64(), 3) ) - sources["fixed_size_list[3]:empty"] = pa.array( - [], pa.list_(pa.int64(), 3) - ) + sources["fixed_size_list[3]:empty"] = pa.array([], pa.list_(pa.int64(), 3)) sources["struct:standard"] = pa.array( [{"x": 1, "y": "a"}, {"x": 2, "y": "b"}], pa.struct([("x", pa.int64()), ("y", pa.string())]), @@ -352,16 +312,12 @@ def _build_source_arrays(self): [{"x": 1, "y": "a"}, None], pa.struct([("x", pa.int64()), ("y", pa.string())]), ) - sources["struct:empty"] = pa.array( - [], pa.struct([("x", pa.int64()), ("y", pa.string())]) - ) + sources["struct:empty"] = pa.array([], pa.struct([("x", pa.int64()), ("y", pa.string())])) sources["map:standard"] = pa.array( [[("a", 1), ("b", 2)], [("c", 3)]], pa.map_(pa.string(), pa.int64()), ) - sources["map:empty"] = pa.array( - [], pa.map_(pa.string(), pa.int64()) - ) + sources["map:empty"] = pa.array([], pa.map_(pa.string(), pa.int64())) # list of list (nested list) sources["list>:standard"] = pa.array( [[[1, 2], [3]], [[4, 5, 6]]], @@ -411,23 +367,17 @@ def _build_source_arrays(self): # ===================================================================== # Dictionary type # ===================================================================== - sources["dictionary:standard"] = ( - pa.DictionaryArray.from_arrays( - pa.array([0, 1, 0, 1], pa.int32()), - pa.array(["a", "b"], pa.string()), - ) + sources["dictionary:standard"] = pa.DictionaryArray.from_arrays( + pa.array([0, 1, 0, 1], pa.int32()), + pa.array(["a", "b"], pa.string()), ) - sources["dictionary:nullable"] = ( - pa.DictionaryArray.from_arrays( - pa.array([0, 1, None, 0], pa.int32()), - pa.array(["a", "b"], pa.string()), - ) + sources["dictionary:nullable"] = pa.DictionaryArray.from_arrays( + pa.array([0, 1, None, 0], pa.int32()), + pa.array(["a", "b"], pa.string()), ) - sources["dictionary:empty"] = ( - pa.DictionaryArray.from_arrays( - pa.array([], pa.int32()), - pa.array([], pa.string()), - ) + sources["dictionary:empty"] = pa.DictionaryArray.from_arrays( + pa.array([], pa.int32()), + pa.array([], pa.string()), ) return sources