Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dev/sparktestsupport/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,7 @@ def __hash__(self):
# unittests for upstream projects
"pyspark.tests.upstream.pyarrow.test_pyarrow_array_cast",
"pyspark.tests.upstream.pyarrow.test_pyarrow_array_type_inference",
"pyspark.tests.upstream.pyarrow.test_pyarrow_arrow_to_pandas_default",
"pyspark.tests.upstream.pyarrow.test_pyarrow_ignore_timezone",
"pyspark.tests.upstream.pyarrow.test_pyarrow_scalar_type_coercion",
"pyspark.tests.upstream.pyarrow.test_pyarrow_scalar_type_inference",
Expand Down
27 changes: 27 additions & 0 deletions python/pyspark/testing/goldenutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,11 +345,38 @@ def repr_value(cls, value: Any, max_len: int = 32) -> str:

if have_pandas and isinstance(value, pd.DataFrame):
return cls.repr_pandas_value(value, max_len)
if have_pandas and isinstance(value, pd.Series):
return cls.repr_pandas_series_value(value, max_len)
if have_numpy and isinstance(value, np.ndarray):
return cls.repr_numpy_value(value, max_len)

return cls.repr_python_value(value, max_len)

@classmethod
def repr_pandas_series_value(cls, value: Any, max_len: int = 32) -> str:
    """
    Render a pandas Series as a single-line golden-file token.

    The Series is first converted with ``tolist()`` and the resulting
    Python list is stringified, so the value portion is the ``str`` of a
    plain list rather than the Series' own display form. Elements that are
    themselves containers are rendered by whatever ``repr`` those elements
    define. Any newline in the rendered text is replaced by a single space.

    Parameters
    ----------
    value : pd.Series
        The pandas Series to represent.
    max_len : int, default 32
        Upper bound on the number of characters kept from the value
        portion. A value of 0 (or any non-positive number) disables
        truncation. The ``@Series[dtype]`` suffix is never truncated.

    Returns
    -------
    str
        Text of the form ``"<list repr>@Series[<dtype>]"``.
    """
    # Splitting on "\n" and re-joining with a space keeps the token on one line.
    flattened = " ".join(str(value.tolist()).split("\n"))

    # Only the value portion is subject to the length cap.
    shown = flattened[:max_len] if max_len > 0 else flattened

    dtype_name = str(value.dtype)
    return f"{shown}@Series[{dtype_name}]"

@staticmethod
def clean_result(result: str) -> str:
"""Clean result string by removing newlines and extra whitespace."""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
test case pyarrow array pandas series
int8:standard [0, 1, -1, 127, -128]@int8 [0, 1, -1, 127, -128]@Series[int8]
int8:nullable [0, 1, None]@int8 [0.0, 1.0, nan]@Series[float64]
int8:empty []@int8 []@Series[int8]
int16:standard [0, 1, -1, 32767, -32768]@int16 [0, 1, -1, 32767, -32768]@Series[int16]
int16:nullable [0, 1, None]@int16 [0.0, 1.0, nan]@Series[float64]
int16:empty []@int16 []@Series[int16]
int32:standard [0, 1, -1, 2147483647, -2147483648]@int32 [0, 1, -1, 2147483647, -2147483648]@Series[int32]
int32:nullable [0, 1, None]@int32 [0.0, 1.0, nan]@Series[float64]
int32:empty []@int32 []@Series[int32]
int64:standard [0, 1, -1, 9223372036854775807, -9223372036854775808]@int64 [0, 1, -1, 9223372036854775807, -9223372036854775808]@Series[int64]
int64:nullable [0, 1, None]@int64 [0.0, 1.0, nan]@Series[float64]
int64:empty []@int64 []@Series[int64]
uint8:standard [0, 1, 255]@uint8 [0, 1, 255]@Series[uint8]
uint8:nullable [0, 1, None]@uint8 [0.0, 1.0, nan]@Series[float64]
uint8:empty []@uint8 []@Series[uint8]
uint16:standard [0, 1, 65535]@uint16 [0, 1, 65535]@Series[uint16]
uint16:nullable [0, 1, None]@uint16 [0.0, 1.0, nan]@Series[float64]
uint16:empty []@uint16 []@Series[uint16]
uint32:standard [0, 1, 4294967295]@uint32 [0, 1, 4294967295]@Series[uint32]
uint32:nullable [0, 1, None]@uint32 [0.0, 1.0, nan]@Series[float64]
uint32:empty []@uint32 []@Series[uint32]
uint64:standard [0, 1, 18446744073709551615]@uint64 [0, 1, 18446744073709551615]@Series[uint64]
uint64:nullable [0, 1, None]@uint64 [0.0, 1.0, nan]@Series[float64]
uint64:empty []@uint64 []@Series[uint64]
float32:standard [0.0, 1.5, -1.5]@float32 [0.0, 1.5, -1.5]@Series[float32]
float32:nullable [0.0, 1.5, None]@float32 [0.0, 1.5, nan]@Series[float32]
float32:empty []@float32 []@Series[float32]
float64:standard [0.0, 1.5, -1.5]@float64 [0.0, 1.5, -1.5]@Series[float64]
float64:nullable [0.0, 1.5, None]@float64 [0.0, 1.5, nan]@Series[float64]
float64:special [nan, inf, -inf]@float64 [nan, inf, -inf]@Series[float64]
float64:empty []@float64 []@Series[float64]
bool:standard [True, False, True]@bool [True, False, True]@Series[bool]
bool:nullable [True, False, None]@bool [True, False, None]@Series[object]
bool:empty []@bool []@Series[bool]
string:standard [hello, world, ]@string ['hello', 'world', '']@Series[object]
string:nullable [hello, None, world]@string ['hello', None, 'world']@Series[object]
string:empty []@string []@Series[object]
large_string:standard [hello, world]@large_string ['hello', 'world']@Series[object]
large_string:nullable [hello, None]@large_string ['hello', None]@Series[object]
large_string:empty []@large_string []@Series[object]
binary:standard [b'hello', b'world']@binary [b'hello', b'world']@Series[object]
binary:nullable [b'hello', None]@binary [b'hello', None]@Series[object]
binary:empty []@binary []@Series[object]
large_binary:standard [b'hello', b'world']@large_binary [b'hello', b'world']@Series[object]
large_binary:nullable [b'hello', None]@large_binary [b'hello', None]@Series[object]
large_binary:empty []@large_binary []@Series[object]
decimal128:standard [1.23, 4.56, -7.89]@decimal128(5, 2) [Decimal('1.23'), Decimal('4.56'), Decimal('-7.89')]@Series[object]
decimal128:nullable [1.23, None, 4.56]@decimal128(5, 2) [Decimal('1.23'), None, Decimal('4.56')]@Series[object]
decimal128:empty []@decimal128(5, 2) []@Series[object]
date32:standard [2024-01-01, 2024-06-15]@date32[day] [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object]
date32:nullable [2024-01-01, None]@date32[day] [datetime.date(2024, 1, 1), None]@Series[object]
date32:empty []@date32[day] []@Series[object]
date64:standard [2024-01-01, 2024-06-15]@date64[ms] [datetime.date(2024, 1, 1), datetime.date(2024, 6, 15)]@Series[object]
date64:nullable [2024-01-01, None]@date64[ms] [datetime.date(2024, 1, 1), None]@Series[object]
date64:empty []@date64[ms] []@Series[object]
timestamp[s]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[s] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[s]]
timestamp[s]:nullable [2024-01-01 12:00:00, None]@timestamp[s] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[s]]
timestamp[s]:empty []@timestamp[s] []@Series[datetime64[s]]
timestamp[ms]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[ms] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ms]]
timestamp[ms]:nullable [2024-01-01 12:00:00, None]@timestamp[ms] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ms]]
timestamp[ms]:empty []@timestamp[ms] []@Series[datetime64[ms]]
timestamp[us]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[us] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[us]]
timestamp[us]:nullable [2024-01-01 12:00:00, None]@timestamp[us] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[us]]
timestamp[us]:empty []@timestamp[us] []@Series[datetime64[us]]
timestamp[ns]:standard [2024-01-01 12:00:00, 2024-06-15 18:30:00]@timestamp[ns] [Timestamp('2024-01-01 12:00:00'), Timestamp('2024-06-15 18:30:00')]@Series[datetime64[ns]]
timestamp[ns]:nullable [2024-01-01 12:00:00, None]@timestamp[ns] [Timestamp('2024-01-01 12:00:00'), NaT]@Series[datetime64[ns]]
timestamp[ns]:empty []@timestamp[ns] []@Series[datetime64[ns]]
timestamp[us,tz=UTC]:standard [2024-01-01 12:00:00+00:00, 2024-06-15 18:30:00+00:00]@timestamp[us, tz=UTC] [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), Timestamp('2024-06-15 18:30:00+0000', tz='UTC')]@Series[datetime64[us, UTC]]
timestamp[us,tz=UTC]:nullable [2024-01-01 12:00:00+00:00, None]@timestamp[us, tz=UTC] [Timestamp('2024-01-01 12:00:00+0000', tz='UTC'), NaT]@Series[datetime64[us, UTC]]
timestamp[us,tz=UTC]:empty []@timestamp[us, tz=UTC] []@Series[datetime64[us, UTC]]
duration[s]:standard [1 day, 0:00:00, 2:30:00]@duration[s] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[s]]
duration[s]:nullable [1 day, 0:00:00, None]@duration[s] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[s]]
duration[s]:empty []@duration[s] []@Series[timedelta64[s]]
duration[ms]:standard [1 day, 0:00:00, 2:30:00]@duration[ms] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ms]]
duration[ms]:nullable [1 day, 0:00:00, None]@duration[ms] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ms]]
duration[ms]:empty []@duration[ms] []@Series[timedelta64[ms]]
duration[us]:standard [1 day, 0:00:00, 2:30:00]@duration[us] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[us]]
duration[us]:nullable [1 day, 0:00:00, None]@duration[us] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[us]]
duration[us]:empty []@duration[us] []@Series[timedelta64[us]]
duration[ns]:standard [1 days 00:00:00, 0 days 02:30:00]@duration[ns] [Timedelta('1 days 00:00:00'), Timedelta('0 days 02:30:00')]@Series[timedelta64[ns]]
duration[ns]:nullable [1 days 00:00:00, None]@duration[ns] [Timedelta('1 days 00:00:00'), NaT]@Series[timedelta64[ns]]
duration[ns]:empty []@duration[ns] []@Series[timedelta64[ns]]
time32[s]:standard [12:30:00, 18:45:30]@time32[s] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object]
time32[s]:nullable [12:30:00, None]@time32[s] [datetime.time(12, 30), None]@Series[object]
time32[s]:empty []@time32[s] []@Series[object]
time32[ms]:standard [12:30:00, 18:45:30]@time32[ms] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object]
time32[ms]:nullable [12:30:00, None]@time32[ms] [datetime.time(12, 30), None]@Series[object]
time32[ms]:empty []@time32[ms] []@Series[object]
time64[us]:standard [12:30:00, 18:45:30]@time64[us] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object]
time64[us]:nullable [12:30:00, None]@time64[us] [datetime.time(12, 30), None]@Series[object]
time64[us]:empty []@time64[us] []@Series[object]
time64[ns]:standard [12:30:00, 18:45:30]@time64[ns] [datetime.time(12, 30), datetime.time(18, 45, 30)]@Series[object]
time64[ns]:nullable [12:30:00, None]@time64[ns] [datetime.time(12, 30), None]@Series[object]
time64[ns]:empty []@time64[ns] []@Series[object]
null:standard [None, None, None]@null [None, None, None]@Series[object]
null:empty []@null []@Series[object]
list<int64>:standard [[1, 2], [3, 4, 5]]@list<item: int64> [array([1, 2]), array([3, 4, 5])]@Series[object]
list<int64>:nullable [[1, 2], None, [3]]@list<item: int64> [array([1, 2]), None, array([3])]@Series[object]
list<int64>:empty []@list<item: int64> []@Series[object]
list<string>:standard [['a', 'b'], ['c']]@list<item: string> [array(['a', 'b'], dtype=object), array(['c'], dtype=object)]@Series[object]
large_list<int64>:standard [[1, 2], [3, 4]]@large_list<item: int64> [array([1, 2]), array([3, 4])]@Series[object]
large_list<int64>:empty []@large_list<item: int64> []@Series[object]
fixed_size_list<int64>[3]:standard [[1, 2, 3], [4, 5, 6]]@fixed_size_list<item: int64>[3] [array([1, 2, 3]), array([4, 5, 6])]@Series[object]
fixed_size_list<int64>[3]:empty []@fixed_size_list<item: int64>[3] []@Series[object]
struct:standard [[('x', 1), ('y', 'a')], [('x', 2), ('y', 'b')]]@struct<x: int64, y: string> [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]@Series[object]
struct:nullable [[('x', 1), ('y', 'a')], None]@struct<x: int64, y: string> [{'x': 1, 'y': 'a'}, None]@Series[object]
struct:empty []@struct<x: int64, y: string> []@Series[object]
map<string,int64>:standard [[('a', 1), ('b', 2)], [('c', 3)]]@map<string, int64> [[('a', 1), ('b', 2)], [('c', 3)]]@Series[object]
map<string,int64>:empty []@map<string, int64> []@Series[object]
list<list<int64>>:standard [[[1, 2], [3]], [[4, 5, 6]]]@list<item: list<item: int64>> [array([array([1, 2]), array([3])], dtype=object), array([array([4, 5, 6])], dtype=object)]@Series[object]
list<struct>:standard [[{'x': 1}, {'x': 2}], [{'x': 3}]]@list<item: struct<x: int64>> [array([{'x': 1}, {'x': 2}], dtype=object), array([{'x': 3}], dtype=object)]@Series[object]
list<map<string,int64>>:standard [[[('a', 1)], [('b', 2)]], [[('c', 3)]]]@list<item: map<string, int64>> [array([list([('a', 1)]), list([('b', 2)])], dtype=object), array([list([('c', 3)])], dtype=object)]@Series[object]
struct<struct>:standard [[('outer', {'inner': 1})], [('outer', {'inner': 2})]]@struct<outer: struct<inner: int64>> [{'outer': {'inner': 1}}, {'outer': {'inner': 2}}]@Series[object]
struct<list<int64>>:standard [[('items', [1, 2, 3])], [('items', [4, 5])]]@struct<items: list<item: int64>> [{'items': array([1, 2, 3])}, {'items': array([4, 5])}]@Series[object]
struct<map<string,int64>>:standard [[('mapping', [('a', 1)])], [('mapping', [('b', 2)])]]@struct<mapping: map<string, int64>> [{'mapping': [('a', 1)]}, {'mapping': [('b', 2)]}]@Series[object]
map<string,list<int64>>:standard [[('a', [1, 2]), ('b', [3])], [('c', [4, 5, 6])]]@map<string, list<item: int64>> [[('a', array([1, 2])), ('b', array([3]))], [('c', array([4, 5, 6]))]]@Series[object]
map<string,struct>:standard [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@map<string, struct<v: int64>> [[('a', {'v': 1}), ('b', {'v': 2})], [('c', {'v': 3})]]@Series[object]
map<string,map<string,int64>>:standard [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@map<string, map<string, int64>> [[('a', [('x', 1)]), ('b', [('y', 2)])], [('c', [('z', 3)])]]@Series[object]
dictionary<int32,string>:standard [a, b, a, b]@dictionary<values=string, indices=int32, ordered=0> ['a', 'b', 'a', 'b']@Series[category]
dictionary<int32,string>:nullable [a, b, None, a]@dictionary<values=string, indices=int32, ordered=0> ['a', 'b', nan, 'a']@Series[category]
dictionary<int32,string>:empty []@dictionary<values=string, indices=int32, ordered=0> []@Series[category]
Loading