@classmethod
def repr_pandas_series_value(cls, value: Any, max_len: int = 32) -> str:
    """
    Render a pandas Series as a single golden-file cell.

    The Series is converted with ``tolist()`` and the resulting Python list
    is stringified. Every newline in that text is replaced by a single
    space so the cell stays on one line.

    Parameters
    ----------
    value : pd.Series
        The pandas Series to render.
    max_len : int, default 32
        Upper bound on the length of the value portion of the cell.
        Truncation is applied only when this is greater than 0.

    Returns
    -------
    str
        "python_list_repr@Series[dtype]"
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    flattened = " ".join(str(value.tolist()).split("\n"))
    # Only the value portion is truncated; the "@Series[...]" suffix is always kept whole.
    shown = flattened[:max_len] if max_len > 0 else flattened
    dtype_name = str(value.dtype)
    return f"{shown}@Series[{dtype_name}]"
[0, 1, None]@uint32 [0.0, 1.0, nan]@Series[float64] +uint32:empty []@uint32 []@Series[uint32] +uint64:standard [0, 1, 18446744073709551615]@uint64 [0, 1, 18446744073709551615]@Series[uint64] +uint64:nullable [0, 1, None]@uint64 [0.0, 1.0, nan]@Series[float64] +uint64:empty []@uint64 []@Series[uint64] +float32:standard [0.0, 1.5, -1.5]@float32 [0.0, 1.5, -1.5]@Series[float32] +float32:nullable [0.0, 1.5, None]@float32 [0.0, 1.5, nan]@Series[float32] +float32:empty []@float32 []@Series[float32] +float64:standard [0.0, 1.5, -1.5]@float64 [0.0, 1.5, -1.5]@Series[float64] +float64:nullable [0.0, 1.5, None]@float64 [0.0, 1.5, nan]@Series[float64] +float64:special [nan, inf, -inf]@float64 [nan, inf, -inf]@Series[float64] +float64:empty []@float64 []@Series[float64] +bool:standard [True, False, True]@bool [True, False, True]@Series[bool] +bool:nullable [True, False, None]@bool [True, False, None]@Series[object] +bool:empty []@bool []@Series[bool] +string:standard [hello, world, ]@string ['hello', 'world', '']@Series[object] +string:nullable [hello, None, world]@string ['hello', None, 'world']@Series[object] +string:empty []@string []@Series[object] +large_string:standard [hello, world]@large_string ['hello', 'world']@Series[object] +large_string:nullable [hello, None]@large_string ['hello', None]@Series[object] +large_string:empty []@large_string []@Series[object] +binary:standard [b'hello', b'world']@binary [b'hello', b'world']@Series[object] +binary:nullable [b'hello', None]@binary [b'hello', None]@Series[object] +binary:empty []@binary []@Series[object] +large_binary:standard [b'hello', b'world']@large_binary [b'hello', b'world']@Series[object] +large_binary:nullable [b'hello', None]@large_binary [b'hello', None]@Series[object] +large_binary:empty []@large_binary []@Series[object] +decimal128:standard [1.23, 4.56, -7.89]@decimal128(5, 2) [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object] +decimal128:nullable [1.23, None, 4.56]@decimal128(5, 2) 
[Decimal('1.23'), None, Decimal('4.56')]@Series[object] +decimal128:empty []@decimal128(5, 2) []@Series[object] +date32:standard [2024-01-01, 2024-06-15]@date32[day] [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] +date32:nullable [2024-01-01, None]@date32[day] [datetime.date(2024, 1, 1), None]@Series[object] +date32:empty []@date32[day] []@Series[object] +date64:standard [2024-01-01, 2024-06-15]@date64[ms] [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] +date64:nullable [2024-01-01, None]@date64[ms] [datetime.date(2024, 1, 1), None]@Series[object] +date64:empty []@date64[ms] []@Series[object] +timestamp[s]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[s] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] +timestamp[s]:nullable [2024-01-01 12:00:00, None]@timestamp[s] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] +timestamp[s]:empty []@timestamp[s] []@Series[datetime64[s]] +timestamp[ms]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[ms] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] +timestamp[ms]:nullable [2024-01-01 12:00:00, None]@timestamp[ms] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] +timestamp[ms]:empty []@timestamp[ms] []@Series[datetime64[ms]] +timestamp[us]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[us] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] +timestamp[us]:nullable [2024-01-01 12:00:00, None]@timestamp[us] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] +timestamp[us]:empty []@timestamp[us] []@Series[datetime64[us]] +timestamp[ns]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[ns] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] +timestamp[ns]:nullable [2024-01-01 12:00:00, None]@timestamp[ns] [Timestamp('2024-01-01 
12:00:00'), NaT]@Series[datetime64[ns]] +timestamp[ns]:empty []@timestamp[ns] []@Series[datetime64[ns]] +timestamp[us,tz=UTC]:standard [2024-01-01 12:00:00+00:00, 2024-06-15 18:30:00+00:00]@timestamp[us, tz=UTC] [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] +timestamp[us,tz=UTC]:nullable [2024-01-01 12:00:00+00:00, None]@timestamp[us, tz=UTC] [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] +timestamp[us,tz=UTC]:empty []@timestamp[us, tz=UTC] []@Series[datetime64[us, UTC]] +duration[s]:standard [1 day, 0:00:00, 2:30:00]@duration[s] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] +duration[s]:nullable [1 day, 0:00:00, None]@duration[s] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] +duration[s]:empty []@duration[s] []@Series[timedelta64[s]] +duration[ms]:standard [1 day, 0:00:00, 2:30:00]@duration[ms] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] +duration[ms]:nullable [1 day, 0:00:00, None]@duration[ms] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] +duration[ms]:empty []@duration[ms] []@Series[timedelta64[ms]] +duration[us]:standard [1 day, 0:00:00, 2:30:00]@duration[us] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] +duration[us]:nullable [1 day, 0:00:00, None]@duration[us] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] +duration[us]:empty []@duration[us] []@Series[timedelta64[us]] +duration[ns]:standard [1 days 00:00:00, 0 days 02:30:00]@duration[ns] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] +duration[ns]:nullable [1 days 00:00:00, None]@duration[ns] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] +duration[ns]:empty []@duration[ns] []@Series[timedelta64[ns]] +time32[s]:standard [12:30:00, 18:45:30]@time32[s] [datetime.time(12, 30), 
datetime.time(18, 45, 30)]@Series[object] +time32[s]:nullable [12:30:00, None]@time32[s] [datetime.time(12, 30), None]@Series[object] +time32[s]:empty []@time32[s] []@Series[object] +time32[ms]:standard [12:30:00, 18:45:30]@time32[ms] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time32[ms]:nullable [12:30:00, None]@time32[ms] [datetime.time(12, 30), None]@Series[object] +time32[ms]:empty []@time32[ms] []@Series[object] +time64[us]:standard [12:30:00, 18:45:30]@time64[us] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time64[us]:nullable [12:30:00, None]@time64[us] [datetime.time(12, 30), None]@Series[object] +time64[us]:empty []@time64[us] []@Series[object] +time64[ns]:standard [12:30:00, 18:45:30]@time64[ns] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] +time64[ns]:nullable [12:30:00, None]@time64[ns] [datetime.time(12, 30), None]@Series[object] +time64[ns]:empty []@time64[ns] []@Series[object] +null:standard [None, None, None]@null [None, None, None]@Series[object] +null:empty []@null []@Series[object] +list:standard [[1, 2], [3, 4, 5]]@list [array([1, 2]), array([3, 4, 5])]@Series[object] +list:nullable [[1, 2], None, [3]]@list [array([1, 2]), None, array([3])]@Series[object] +list:empty []@list []@Series[object] +list:standard [['a', 'b'], ['c']]@list [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] +large_list:standard [[1, 2], [3, 4]]@large_list [array([1, 2]), array([3, 4])]@Series[object] +large_list:empty []@large_list []@Series[object] +fixed_size_list[3]:standard [[1, 2, 3], [4, 5, 6]]@fixed_size_list[3] [array([1, 2, 3]), array([4, 5, 6])]@Series[object] +fixed_size_list[3]:empty []@fixed_size_list[3] []@Series[object] +struct:standard [[('x', 1), ('y', 'a')], [('x', 2), ('y', 'b')]]@struct [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] +struct:nullable [[('x', 1), ('y', 'a')], None]@struct [{'x': 1, 'y': 'a'}, None]@Series[object] +struct:empty []@struct 
[]@Series[object] +map:standard [[('a', 1), ('b', 2)], [('c', 3)]]@map [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] +map:empty []@map []@Series[object] +list>:standard [[[1, 2], [3]], [[4, 5, 6]]]@list> [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object] +list:standard [[{'x': 1}, {'x': 2}], [{'x': 3}]]@list> [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object] +list>:standard [[[('a', 1)], [('b', 2)]], [[('c', 3)]]]@list> [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object] +struct:standard [[('outer', {'inner': 1})], [('outer', {'inner': 2})]]@struct> [{'outer': {'inner': 1}}, {'outer': {'inner': 2}}]@Series[object] +struct>:standard [[('items', [1, 2, 3])], [('items', [4, 5])]]@struct> [{'items': array([1, 2, 3])}, {'items': array([4, 5])}]@Series[object] +struct>:standard [[('mapping', [('a', 1)])], [('mapping', [('b', 2)])]]@struct> [{'mapping': [('a', 1)]}, {'mapping': [('b', 2)]}]@Series[object] +map>:standard [[('a', [1, 2]), ('b', [3])], [('c', [4, 5, 6])]]@map> [[('a', array([1, 2])), ('b', array([3]))], [('c', array([4, 5, 6]))]]@Series[object] +map:standard [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@map> [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@Series[object] +map>:standard [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@map> [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object] +dictionary:standard [a, b, a, b]@dictionary ['a', 'b', 'a', 'b']@Series[category] +dictionary:nullable [a, b, None, a]@dictionary ['a', 'b', nan, 'a']@Series[category] +dictionary:empty []@dictionary []@Series[category] diff --git a/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md new file mode 100644 index 0000000000000..04debdc77e03b --- 
/dev/null +++ b/python/pyspark/tests/upstream/pyarrow/golden_pyarrow_arrow_to_pandas_default.md @@ -0,0 +1,123 @@ +| test case | pyarrow array | pandas series | +|----------------------------------------|-----------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| int8:standard | [0, 1, -1, 127, -128]@int8 | [0, 1, -1, 127, -128]@Series[int8] | +| int8:nullable | [0, 1, None]@int8 | [0.0, 1.0, nan]@Series[float64] | +| int8:empty | []@int8 | []@Series[int8] | +| int16:standard | [0, 1, -1, 32767, -32768]@int16 | [0, 1, -1, 32767, -32768]@Series[int16] | +| int16:nullable | [0, 1, None]@int16 | [0.0, 1.0, nan]@Series[float64] | +| int16:empty | []@int16 | []@Series[int16] | +| int32:standard | [0, 1, -1, 2147483647, -2147483648]@int32 | [0, 1, -1, 2147483647, -2147483648]@Series[int32] | +| int32:nullable | [0, 1, None]@int32 | [0.0, 1.0, nan]@Series[float64] | +| int32:empty | []@int32 | []@Series[int32] | +| int64:standard | [0, 1, -1, 9223372036854775807, -9223372036854775808]@int64 | [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64] | +| int64:nullable | [0, 1, None]@int64 | [0.0, 1.0, nan]@Series[float64] | +| int64:empty | []@int64 | []@Series[int64] | +| uint8:standard | [0, 1, 255]@uint8 | [0, 1, 255]@Series[uint8] | +| uint8:nullable | [0, 1, None]@uint8 | [0.0, 1.0, nan]@Series[float64] | +| uint8:empty | []@uint8 | []@Series[uint8] | +| uint16:standard | [0, 1, 65535]@uint16 | [0, 1, 65535]@Series[uint16] | +| uint16:nullable | [0, 1, None]@uint16 | [0.0, 1.0, nan]@Series[float64] | +| uint16:empty | []@uint16 | []@Series[uint16] | +| uint32:standard | [0, 1, 4294967295]@uint32 | [0, 1, 4294967295]@Series[uint32] | +| uint32:nullable | [0, 1, None]@uint32 | [0.0, 1.0, nan]@Series[float64] | +| uint32:empty | []@uint32 | []@Series[uint32] | +| uint64:standard | [0, 1, 
18446744073709551615]@uint64 | [0, 1, 18446744073709551615]@Series[uint64] | +| uint64:nullable | [0, 1, None]@uint64 | [0.0, 1.0, nan]@Series[float64] | +| uint64:empty | []@uint64 | []@Series[uint64] | +| float32:standard | [0.0, 1.5, -1.5]@float32 | [0.0, 1.5, -1.5]@Series[float32] | +| float32:nullable | [0.0, 1.5, None]@float32 | [0.0, 1.5, nan]@Series[float32] | +| float32:empty | []@float32 | []@Series[float32] | +| float64:standard | [0.0, 1.5, -1.5]@float64 | [0.0, 1.5, -1.5]@Series[float64] | +| float64:nullable | [0.0, 1.5, None]@float64 | [0.0, 1.5, nan]@Series[float64] | +| float64:special | [nan, inf, -inf]@float64 | [nan, inf, -inf]@Series[float64] | +| float64:empty | []@float64 | []@Series[float64] | +| bool:standard | [True, False, True]@bool | [True, False, True]@Series[bool] | +| bool:nullable | [True, False, None]@bool | [True, False, None]@Series[object] | +| bool:empty | []@bool | []@Series[bool] | +| string:standard | [hello, world, ]@string | ['hello', 'world', '']@Series[object] | +| string:nullable | [hello, None, world]@string | ['hello', None, 'world']@Series[object] | +| string:empty | []@string | []@Series[object] | +| large_string:standard | [hello, world]@large_string | ['hello', 'world']@Series[object] | +| large_string:nullable | [hello, None]@large_string | ['hello', None]@Series[object] | +| large_string:empty | []@large_string | []@Series[object] | +| binary:standard | [b'hello', b'world']@binary | [b'hello', b'world']@Series[object] | +| binary:nullable | [b'hello', None]@binary | [b'hello', None]@Series[object] | +| binary:empty | []@binary | []@Series[object] | +| large_binary:standard | [b'hello', b'world']@large_binary | [b'hello', b'world']@Series[object] | +| large_binary:nullable | [b'hello', None]@large_binary | [b'hello', None]@Series[object] | +| large_binary:empty | []@large_binary | []@Series[object] | +| decimal128:standard | [1.23, 4.56, -7.89]@decimal128(5, 2) | [Decimal('1.23'), Decimal('4.56'), 
Decimal('-7.89')]@Series[object] | +| decimal128:nullable | [1.23, None, 4.56]@decimal128(5, 2) | [Decimal('1.23'), None, Decimal('4.56')]@Series[object] | +| decimal128:empty | []@decimal128(5, 2) | []@Series[object] | +| date32:standard | [2024-01-01, 2024-06-15]@date32[day] | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | +| date32:nullable | [2024-01-01, None]@date32[day] | [datetime.date(2024, 1, 1), None]@Series[object] | +| date32:empty | []@date32[day] | []@Series[object] | +| date64:standard | [2024-01-01, 2024-06-15]@date64[ms] | [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object] | +| date64:nullable | [2024-01-01, None]@date64[ms] | [datetime.date(2024, 1, 1), None]@Series[object] | +| date64:empty | []@date64[ms] | []@Series[object] | +| timestamp[s]:standard | [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[s] | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]] | +| timestamp[s]:nullable | [2024-01-01 12:00:00, None]@timestamp[s] | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]] | +| timestamp[s]:empty | []@timestamp[s] | []@Series[datetime64[s]] | +| timestamp[ms]:standard | [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[ms] | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]] | +| timestamp[ms]:nullable | [2024-01-01 12:00:00, None]@timestamp[ms] | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]] | +| timestamp[ms]:empty | []@timestamp[ms] | []@Series[datetime64[ms]] | +| timestamp[us]:standard | [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[us] | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]] | +| timestamp[us]:nullable | [2024-01-01 12:00:00, None]@timestamp[us] | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]] | +| timestamp[us]:empty | []@timestamp[us] | []@Series[datetime64[us]] | +| timestamp[ns]:standard | 
[2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[ns] | [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]] | +| timestamp[ns]:nullable | [2024-01-01 12:00:00, None]@timestamp[ns] | [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]] | +| timestamp[ns]:empty | []@timestamp[ns] | []@Series[datetime64[ns]] | +| timestamp[us,tz=UTC]:standard | [2024-01-01 12:00:00+00:00, 2024-06-15 18:30:00+00:00]@timestamp[us, tz=UTC] | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]] | +| timestamp[us,tz=UTC]:nullable | [2024-01-01 12:00:00+00:00, None]@timestamp[us, tz=UTC] | [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]] | +| timestamp[us,tz=UTC]:empty | []@timestamp[us, tz=UTC] | []@Series[datetime64[us, UTC]] | +| duration[s]:standard | [1 day, 0:00:00, 2:30:00]@duration[s] | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]] | +| duration[s]:nullable | [1 day, 0:00:00, None]@duration[s] | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]] | +| duration[s]:empty | []@duration[s] | []@Series[timedelta64[s]] | +| duration[ms]:standard | [1 day, 0:00:00, 2:30:00]@duration[ms] | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]] | +| duration[ms]:nullable | [1 day, 0:00:00, None]@duration[ms] | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]] | +| duration[ms]:empty | []@duration[ms] | []@Series[timedelta64[ms]] | +| duration[us]:standard | [1 day, 0:00:00, 2:30:00]@duration[us] | [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]] | +| duration[us]:nullable | [1 day, 0:00:00, None]@duration[us] | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]] | +| duration[us]:empty | []@duration[us] | []@Series[timedelta64[us]] | +| duration[ns]:standard | [1 days 00:00:00, 0 days 02:30:00]@duration[ns] 
| [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]] | +| duration[ns]:nullable | [1 days 00:00:00, None]@duration[ns] | [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]] | +| duration[ns]:empty | []@duration[ns] | []@Series[timedelta64[ns]] | +| time32[s]:standard | [12:30:00, 18:45:30]@time32[s] | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time32[s]:nullable | [12:30:00, None]@time32[s] | [datetime.time(12, 30), None]@Series[object] | +| time32[s]:empty | []@time32[s] | []@Series[object] | +| time32[ms]:standard | [12:30:00, 18:45:30]@time32[ms] | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time32[ms]:nullable | [12:30:00, None]@time32[ms] | [datetime.time(12, 30), None]@Series[object] | +| time32[ms]:empty | []@time32[ms] | []@Series[object] | +| time64[us]:standard | [12:30:00, 18:45:30]@time64[us] | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time64[us]:nullable | [12:30:00, None]@time64[us] | [datetime.time(12, 30), None]@Series[object] | +| time64[us]:empty | []@time64[us] | []@Series[object] | +| time64[ns]:standard | [12:30:00, 18:45:30]@time64[ns] | [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object] | +| time64[ns]:nullable | [12:30:00, None]@time64[ns] | [datetime.time(12, 30), None]@Series[object] | +| time64[ns]:empty | []@time64[ns] | []@Series[object] | +| null:standard | [None, None, None]@null | [None, None, None]@Series[object] | +| null:empty | []@null | []@Series[object] | +| list:standard | [[1, 2], [3, 4, 5]]@list | [array([1, 2]), array([3, 4, 5])]@Series[object] | +| list:nullable | [[1, 2], None, [3]]@list | [array([1, 2]), None, array([3])]@Series[object] | +| list:empty | []@list | []@Series[object] | +| list:standard | [['a', 'b'], ['c']]@list | [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object] | +| large_list:standard | [[1, 2], [3, 4]]@large_list | [array([1, 2]), 
array([3, 4])]@Series[object] | +| large_list:empty | []@large_list | []@Series[object] | +| fixed_size_list[3]:standard | [[1, 2, 3], [4, 5, 6]]@fixed_size_list[3] | [array([1, 2, 3]), array([4, 5, 6])]@Series[object] | +| fixed_size_list[3]:empty | []@fixed_size_list[3] | []@Series[object] | +| struct:standard | [[('x', 1), ('y', 'a')], [('x', 2), ('y', 'b')]]@struct | [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object] | +| struct:nullable | [[('x', 1), ('y', 'a')], None]@struct | [{'x': 1, 'y': 'a'}, None]@Series[object] | +| struct:empty | []@struct | []@Series[object] | +| map:standard | [[('a', 1), ('b', 2)], [('c', 3)]]@map | [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object] | +| map:empty | []@map | []@Series[object] | +| list>:standard | [[[1, 2], [3]], [[4, 5, 6]]]@list> | [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object] | +| list:standard | [[{'x': 1}, {'x': 2}], [{'x': 3}]]@list> | [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object] | +| list>:standard | [[[('a', 1)], [('b', 2)]], [[('c', 3)]]]@list> | [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object] | +| struct:standard | [[('outer', {'inner': 1})], [('outer', {'inner': 2})]]@struct> | [{'outer': {'inner': 1}}, {'outer': {'inner': 2}}]@Series[object] | +| struct>:standard | [[('items', [1, 2, 3])], [('items', [4, 5])]]@struct> | [{'items': array([1, 2, 3])}, {'items': array([4, 5])}]@Series[object] | +| struct>:standard | [[('mapping', [('a', 1)])], [('mapping', [('b', 2)])]]@struct> | [{'mapping': [('a', 1)]}, {'mapping': [('b', 2)]}]@Series[object] | +| map>:standard | [[('a', [1, 2]), ('b', [3])], [('c', [4, 5, 6])]]@map> | [[('a', array([1, 2])), ('b', array([3]))], [('c', array([4, 5, 6]))]]@Series[object] | +| map:standard | [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@map> | [[('a', {'v': 1}), ('b', {'v': 2})], [('c', 
{'v': 3})]]@Series[object] | +| map>:standard | [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@map> | [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object] | +| dictionary:standard | [a, b, a, b]@dictionary | ['a', 'b', 'a', 'b']@Series[category] | +| dictionary:nullable | [a, b, None, a]@dictionary | ['a', 'b', nan, 'a']@Series[category] | +| dictionary:empty | []@dictionary | []@Series[category] | \ No newline at end of file diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py new file mode 100644 index 0000000000000..8caac302f2e6c --- /dev/null +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py @@ -0,0 +1,414 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Tests for PyArrow Array.to_pandas() with default arguments using golden file comparison. + +This test monitors the behavior of PyArrow's to_pandas() conversion to ensure +PySpark's assumptions about PyArrow behavior remain valid across versions. 
+ +The test covers conversion of all major Arrow types to pandas/numpy with default +arguments (no types_mapper, no self_destruct, etc.), tracking: +- Which numpy/pandas dtype each Arrow type maps to +- How null values are handled (NaN, None, NaT, etc.) +- Whether values are preserved correctly after conversion + +## Golden File Cell Format + +Each cell uses the value@type format: +- numpy ndarray: "python_list_repr@ndarray[dtype]" +- pandas Series: "python_list_repr@Series[dtype]" +- pandas Categorical: "python_list_repr@Categorical[dtype]" +- Error: "ERR@ExceptionClassName" + +Values are formatted via tolist() for stable, Python-native representation. + +## Regenerating Golden Files + +Set SPARK_GENERATE_GOLDEN_FILES=1 before running: + + SPARK_GENERATE_GOLDEN_FILES=1 python -m pytest \\ + python/pyspark/tests/upstream/pyarrow/test_pyarrow_arrow_to_pandas_default.py + +## PyArrow Version Compatibility + +The golden files capture behavior for a specific PyArrow version. +Regenerate when upgrading PyArrow, as to_pandas() behavior may change. +""" + +import datetime +import inspect +import os +import unittest +from decimal import Decimal +from typing import Callable, List, Optional + +from pyspark.testing.utils import ( + have_pyarrow, + have_pandas, + have_numpy, + pyarrow_requirement_message, + pandas_requirement_message, + numpy_requirement_message, +) +from pyspark.testing.goldenutils import GoldenFileTestMixin + + +@unittest.skipIf( + not have_pyarrow or not have_pandas or not have_numpy, + pyarrow_requirement_message or pandas_requirement_message or numpy_requirement_message, +) +class PyArrowArrayToPandasDefaultTests(GoldenFileTestMixin, unittest.TestCase): + """ + Tests pa.Array.to_pandas() with default arguments via golden file comparison. + + Covers all major Arrow types: integers, floats, bool, string, binary, + decimal, date, timestamp, duration, time, null, and nested types. + Each type is tested both without and with null values. 
def compare_or_generate_golden_matrix(
    self,
    row_names: List[str],
    col_names: List[str],
    compute_cell: Callable[[str, str], str],
    golden_file_prefix: str,
    index_name: str = "source \\ target",
    overrides: Optional[dict[tuple[str, str], str]] = None,
) -> None:
    """
    Evaluate every (row, column) cell and either verify or record the results.

    ``compute_cell(row_name, col_name)`` is called once per cell, row by row.
    What happens with the results depends on ``self.is_generating_golden()``:

    - Generating: the results are assembled into a pandas DataFrame (one
      column per entry of ``col_names``, indexed by ``row_names`` under
      ``index_name``) and passed to ``self.save_golden`` together with the
      ``.csv`` and ``.md`` paths derived from ``golden_file_prefix``.
    - Comparing: the golden data is loaded through ``self.load_golden_csv``
      and each computed cell is compared with it. A cell whose
      ``(row_name, col_name)`` key is present in ``overrides`` is compared
      with the override value instead. All mismatches are collected and
      reported together through a single ``assertEqual`` failure.

    The golden file paths are resolved relative to the directory of the
    file that defines the concrete test class.
    """
    regenerate = self.is_generating_golden()

    # Golden files sit next to the module that defines type(self).
    base_dir = os.path.dirname(inspect.getfile(type(self)))
    csv_path = os.path.join(base_dir, f"{golden_file_prefix}.csv")
    md_path = os.path.join(base_dir, f"{golden_file_prefix}.md")

    # The reference table is only needed (and only loaded) in comparison mode.
    reference = None if regenerate else self.load_golden_csv(csv_path)

    mismatches = []
    computed = {}

    for row_name in row_names:
        for col_name in col_names:
            key = (row_name, col_name)
            actual = compute_cell(row_name, col_name)
            computed[key] = actual

            if regenerate:
                continue

            # An explicit override takes precedence over the golden file value.
            if overrides and key in overrides:
                expected = overrides[key]
            else:
                expected = reference.loc[row_name, col_name]

            if expected != actual:
                mismatches.append(
                    f"{row_name} -> {col_name}: expected '{expected}', got '{actual}'"
                )

    if not regenerate:
        # Report every mismatch at once rather than stopping at the first one.
        self.assertEqual(
            len(mismatches),
            0,
            f"\n{len(mismatches)} golden file mismatches:\n" + "\n".join(mismatches),
        )
        return

    import pandas as pd

    frame = pd.DataFrame(index=pd.Index(row_names, name=index_name))
    for col_name in col_names:
        frame[col_name] = [computed[(row_name, col_name)] for row_name in row_names]
    self.save_golden(frame, csv_path, md_path)
===================================================================== + # Integer types + # ===================================================================== + for bits, pa_type in [ + (8, pa.int8()), + (16, pa.int16()), + (32, pa.int32()), + (64, pa.int64()), + ]: + max_val = 2 ** (bits - 1) - 1 + min_val = -(2 ** (bits - 1)) + sources[f"int{bits}:standard"] = pa.array([0, 1, -1, max_val, min_val], pa_type) + sources[f"int{bits}:nullable"] = pa.array([0, 1, None], pa_type) + sources[f"int{bits}:empty"] = pa.array([], pa_type) + + for bits, pa_type in [ + (8, pa.uint8()), + (16, pa.uint16()), + (32, pa.uint32()), + (64, pa.uint64()), + ]: + max_val = 2**bits - 1 + sources[f"uint{bits}:standard"] = pa.array([0, 1, max_val], pa_type) + sources[f"uint{bits}:nullable"] = pa.array([0, 1, None], pa_type) + sources[f"uint{bits}:empty"] = pa.array([], pa_type) + + # ===================================================================== + # Float types + # ===================================================================== + sources["float32:standard"] = pa.array([0.0, 1.5, -1.5], pa.float32()) + sources["float32:nullable"] = pa.array([0.0, 1.5, None], pa.float32()) + sources["float32:empty"] = pa.array([], pa.float32()) + sources["float64:standard"] = pa.array([0.0, 1.5, -1.5], pa.float64()) + sources["float64:nullable"] = pa.array([0.0, 1.5, None], pa.float64()) + sources["float64:special"] = pa.array( + [float("nan"), float("inf"), float("-inf")], pa.float64() + ) + sources["float64:empty"] = pa.array([], pa.float64()) + + # ===================================================================== + # Boolean + # ===================================================================== + sources["bool:standard"] = pa.array([True, False, True], pa.bool_()) + sources["bool:nullable"] = pa.array([True, False, None], pa.bool_()) + sources["bool:empty"] = pa.array([], pa.bool_()) + + # ===================================================================== + # String types + # 
===================================================================== + sources["string:standard"] = pa.array(["hello", "world", ""], pa.string()) + sources["string:nullable"] = pa.array(["hello", None, "world"], pa.string()) + sources["string:empty"] = pa.array([], pa.string()) + sources["large_string:standard"] = pa.array(["hello", "world"], pa.large_string()) + sources["large_string:nullable"] = pa.array(["hello", None], pa.large_string()) + sources["large_string:empty"] = pa.array([], pa.large_string()) + + # ===================================================================== + # Binary types + # ===================================================================== + sources["binary:standard"] = pa.array([b"hello", b"world"], pa.binary()) + sources["binary:nullable"] = pa.array([b"hello", None], pa.binary()) + sources["binary:empty"] = pa.array([], pa.binary()) + sources["large_binary:standard"] = pa.array([b"hello", b"world"], pa.large_binary()) + sources["large_binary:nullable"] = pa.array([b"hello", None], pa.large_binary()) + sources["large_binary:empty"] = pa.array([], pa.large_binary()) + + # ===================================================================== + # Decimal + # ===================================================================== + sources["decimal128:standard"] = pa.array( + [Decimal("1.23"), Decimal("4.56"), Decimal("-7.89")], + pa.decimal128(5, 2), + ) + sources["decimal128:nullable"] = pa.array( + [Decimal("1.23"), None, Decimal("4.56")], pa.decimal128(5, 2) + ) + sources["decimal128:empty"] = pa.array([], pa.decimal128(5, 2)) + + # ===================================================================== + # Date types + # ===================================================================== + d1 = datetime.date(2024, 1, 1) + d2 = datetime.date(2024, 6, 15) + sources["date32:standard"] = pa.array([d1, d2], pa.date32()) + sources["date32:nullable"] = pa.array([d1, None], pa.date32()) + sources["date32:empty"] = pa.array([], pa.date32()) 
+ sources["date64:standard"] = pa.array([d1, d2], pa.date64()) + sources["date64:nullable"] = pa.array([d1, None], pa.date64()) + sources["date64:empty"] = pa.array([], pa.date64()) + + # ===================================================================== + # Timestamp types + # ===================================================================== + dt1 = datetime.datetime(2024, 1, 1, 12, 0, 0) + dt2 = datetime.datetime(2024, 6, 15, 18, 30, 0) + for unit in ["s", "ms", "us", "ns"]: + sources[f"timestamp[{unit}]:standard"] = pa.array([dt1, dt2], pa.timestamp(unit)) + sources[f"timestamp[{unit}]:nullable"] = pa.array([dt1, None], pa.timestamp(unit)) + sources[f"timestamp[{unit}]:empty"] = pa.array([], pa.timestamp(unit)) + # Timestamp with timezone + sources["timestamp[us,tz=UTC]:standard"] = pa.array( + [dt1, dt2], pa.timestamp("us", tz="UTC") + ) + sources["timestamp[us,tz=UTC]:nullable"] = pa.array( + [dt1, None], pa.timestamp("us", tz="UTC") + ) + sources["timestamp[us,tz=UTC]:empty"] = pa.array([], pa.timestamp("us", tz="UTC")) + + # ===================================================================== + # Duration types + # ===================================================================== + td1 = datetime.timedelta(days=1) + td2 = datetime.timedelta(hours=2, minutes=30) + for unit in ["s", "ms", "us", "ns"]: + sources[f"duration[{unit}]:standard"] = pa.array([td1, td2], pa.duration(unit)) + sources[f"duration[{unit}]:nullable"] = pa.array([td1, None], pa.duration(unit)) + sources[f"duration[{unit}]:empty"] = pa.array([], pa.duration(unit)) + + # ===================================================================== + # Time types + # ===================================================================== + t1 = datetime.time(12, 30, 0) + t2 = datetime.time(18, 45, 30) + sources["time32[s]:standard"] = pa.array([t1, t2], pa.time32("s")) + sources["time32[s]:nullable"] = pa.array([t1, None], pa.time32("s")) + sources["time32[s]:empty"] = pa.array([], 
pa.time32("s")) + sources["time32[ms]:standard"] = pa.array([t1, t2], pa.time32("ms")) + sources["time32[ms]:nullable"] = pa.array([t1, None], pa.time32("ms")) + sources["time32[ms]:empty"] = pa.array([], pa.time32("ms")) + sources["time64[us]:standard"] = pa.array([t1, t2], pa.time64("us")) + sources["time64[us]:nullable"] = pa.array([t1, None], pa.time64("us")) + sources["time64[us]:empty"] = pa.array([], pa.time64("us")) + sources["time64[ns]:standard"] = pa.array([t1, t2], pa.time64("ns")) + sources["time64[ns]:nullable"] = pa.array([t1, None], pa.time64("ns")) + sources["time64[ns]:empty"] = pa.array([], pa.time64("ns")) + + # ===================================================================== + # Null type + # ===================================================================== + sources["null:standard"] = pa.array([None, None, None], pa.null()) + sources["null:empty"] = pa.array([], pa.null()) + + # ===================================================================== + # Nested types + # ===================================================================== + sources["list:standard"] = pa.array([[1, 2], [3, 4, 5]], pa.list_(pa.int64())) + sources["list:nullable"] = pa.array([[1, 2], None, [3]], pa.list_(pa.int64())) + sources["list:empty"] = pa.array([], pa.list_(pa.int64())) + sources["list:standard"] = pa.array([["a", "b"], ["c"]], pa.list_(pa.string())) + sources["large_list:standard"] = pa.array( + [[1, 2], [3, 4]], pa.large_list(pa.int64()) + ) + sources["large_list:empty"] = pa.array([], pa.large_list(pa.int64())) + sources["fixed_size_list[3]:standard"] = pa.array( + [[1, 2, 3], [4, 5, 6]], pa.list_(pa.int64(), 3) + ) + sources["fixed_size_list[3]:empty"] = pa.array([], pa.list_(pa.int64(), 3)) + sources["struct:standard"] = pa.array( + [{"x": 1, "y": "a"}, {"x": 2, "y": "b"}], + pa.struct([("x", pa.int64()), ("y", pa.string())]), + ) + sources["struct:nullable"] = pa.array( + [{"x": 1, "y": "a"}, None], + pa.struct([("x", pa.int64()), ("y", 
pa.string())]), + ) + sources["struct:empty"] = pa.array([], pa.struct([("x", pa.int64()), ("y", pa.string())])) + sources["map:standard"] = pa.array( + [[("a", 1), ("b", 2)], [("c", 3)]], + pa.map_(pa.string(), pa.int64()), + ) + sources["map:empty"] = pa.array([], pa.map_(pa.string(), pa.int64())) + # list of list (nested list) + sources["list>:standard"] = pa.array( + [[[1, 2], [3]], [[4, 5, 6]]], + pa.list_(pa.list_(pa.int64())), + ) + # list of struct + sources["list:standard"] = pa.array( + [[{"x": 1}, {"x": 2}], [{"x": 3}]], + pa.list_(pa.struct([("x", pa.int64())])), + ) + # list of map + sources["list>:standard"] = pa.array( + [[[("a", 1)], [("b", 2)]], [[("c", 3)]]], + pa.list_(pa.map_(pa.string(), pa.int64())), + ) + # struct of struct + sources["struct:standard"] = pa.array( + [{"outer": {"inner": 1}}, {"outer": {"inner": 2}}], + pa.struct([("outer", pa.struct([("inner", pa.int64())]))]), + ) + # struct of list + sources["struct>:standard"] = pa.array( + [{"items": [1, 2, 3]}, {"items": [4, 5]}], + pa.struct([("items", pa.list_(pa.int64()))]), + ) + # struct of map + sources["struct>:standard"] = pa.array( + [{"mapping": [("a", 1)]}, {"mapping": [("b", 2)]}], + pa.struct([("mapping", pa.map_(pa.string(), pa.int64()))]), + ) + # map with list values + sources["map>:standard"] = pa.array( + [[("a", [1, 2]), ("b", [3])], [("c", [4, 5, 6])]], + pa.map_(pa.string(), pa.list_(pa.int64())), + ) + # map with struct values + sources["map:standard"] = pa.array( + [[("a", {"v": 1}), ("b", {"v": 2})], [("c", {"v": 3})]], + pa.map_(pa.string(), pa.struct([("v", pa.int64())])), + ) + # map of map (map with map values) + sources["map>:standard"] = pa.array( + [[("a", [("x", 1)]), ("b", [("y", 2)])], [("c", [("z", 3)])]], + pa.map_(pa.string(), pa.map_(pa.string(), pa.int64())), + ) + + # ===================================================================== + # Dictionary type + # ===================================================================== + 
sources["dictionary:standard"] = pa.DictionaryArray.from_arrays( + pa.array([0, 1, 0, 1], pa.int32()), + pa.array(["a", "b"], pa.string()), + ) + sources["dictionary:nullable"] = pa.DictionaryArray.from_arrays( + pa.array([0, 1, None, 0], pa.int32()), + pa.array(["a", "b"], pa.string()), + ) + sources["dictionary:empty"] = pa.DictionaryArray.from_arrays( + pa.array([], pa.int32()), + pa.array([], pa.string()), + ) + + return sources + + def test_to_pandas_default(self): + """Test pa.Array.to_pandas() with default arguments against golden file.""" + sources = self._build_source_arrays() + row_names = list(sources.keys()) + col_names = ["pyarrow array", "pandas series"] + + def compute_cell(row_name, col_name): + arr = sources[row_name] + if col_name == "pyarrow array": + return self.repr_value(arr, max_len=0) + else: + try: + result = arr.to_pandas() + return self.repr_value(result, max_len=0) + except Exception as e: + return f"ERR@{type(e).__name__}" + + self.compare_or_generate_golden_matrix( + row_names=row_names, + col_names=col_names, + compute_cell=compute_cell, + golden_file_prefix="golden_pyarrow_arrow_to_pandas_default", + index_name="test case", + ) + + +if __name__ == "__main__": + from pyspark.testing import main + + main()