88import numpy as np
99import pytest
1010
11- from pandas .core .dtypes .common import is_object_dtype
11+ from pandas .core .dtypes .common import (
12+ is_object_dtype ,
13+ is_string_dtype ,
14+ )
1215from pandas .core .dtypes .dtypes import CategoricalDtype
1316
1417import pandas as pd
@@ -316,14 +319,15 @@ def test_merge_copy(self):
316319 merged ["d" ] = "peekaboo"
317320 assert (right ["d" ] == "bar" ).all ()
318321
319- def test_merge_nocopy (self , using_array_manager ):
322+ def test_merge_nocopy (self , using_array_manager , using_infer_string ):
320323 left = DataFrame ({"a" : 0 , "b" : 1 }, index = range (10 ))
321324 right = DataFrame ({"c" : "foo" , "d" : "bar" }, index = range (10 ))
322325
323326 merged = merge (left , right , left_index = True , right_index = True , copy = False )
324327
325328 assert np .shares_memory (merged ["a" ]._values , left ["a" ]._values )
326- assert np .shares_memory (merged ["d" ]._values , right ["d" ]._values )
329+ if not using_infer_string :
330+ assert np .shares_memory (merged ["d" ]._values , right ["d" ]._values )
327331
328332 def test_intelligently_handle_join_key (self ):
329333 # #733, be a bit more 1337 about not returning unconsolidated DataFrame
@@ -667,11 +671,13 @@ def test_merge_nan_right(self):
667671 "i1_" : {0 : 0 , 1 : np .nan },
668672 "i3" : {0 : 0.0 , 1 : np .nan },
669673 None : {0 : 0 , 1 : 0 },
670- }
674+ },
675+ columns = Index (["i1" , "i2" , "i1_" , "i3" , None ], dtype = object ),
671676 )
672677 .set_index (None )
673678 .reset_index ()[["i1" , "i2" , "i1_" , "i3" ]]
674679 )
680+ result .columns = result .columns .astype ("object" )
675681 tm .assert_frame_equal (result , expected , check_dtype = False )
676682
677683 def test_merge_nan_right2 (self ):
@@ -820,7 +826,7 @@ def test_overlapping_columns_error_message(self):
820826
821827 # #2649, #10639
822828 df2 .columns = ["key1" , "foo" , "foo" ]
823- msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)"
829+ msg = r"Data columns not unique: Index\(\['foo'\], dtype='(object|string)'\)"
824830 with pytest .raises (MergeError , match = msg ):
825831 merge (df , df2 )
826832
@@ -1498,7 +1504,7 @@ def test_different(self, right_vals):
14981504 # We allow merging on object and categorical cols and cast
14991505 # categorical cols to object
15001506 result = merge (left , right , on = "A" )
1501- assert is_object_dtype (result .A .dtype )
1507+ assert is_object_dtype (result .A .dtype ) or is_string_dtype ( result . A . dtype )
15021508
15031509 @pytest .mark .parametrize (
15041510 "d1" , [np .int64 , np .int32 , np .intc , np .int16 , np .int8 , np .uint8 ]
@@ -1637,7 +1643,7 @@ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
16371643 result = merge (df1 , df2 , on = ["A" ])
16381644 assert is_object_dtype (result .A .dtype )
16391645 result = merge (df2 , df1 , on = ["A" ])
1640- assert is_object_dtype (result .A .dtype )
1646+ assert is_object_dtype (result .A .dtype ) or is_string_dtype ( result . A . dtype )
16411647
16421648 @pytest .mark .parametrize (
16431649 "df1_vals, df2_vals" ,
@@ -1867,25 +1873,27 @@ def right():
18671873
18681874
18691875class TestMergeCategorical :
1870- def test_identical (self , left ):
1876+ def test_identical (self , left , using_infer_string ):
18711877 # merging on the same, should preserve dtypes
18721878 merged = merge (left , left , on = "X" )
18731879 result = merged .dtypes .sort_index ()
1880+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
18741881 expected = Series (
1875- [CategoricalDtype (categories = ["foo" , "bar" ]), np . dtype ( "O" ), np . dtype ( "O" ) ],
1882+ [CategoricalDtype (categories = ["foo" , "bar" ]), dtype , dtype ],
18761883 index = ["X" , "Y_x" , "Y_y" ],
18771884 )
18781885 tm .assert_series_equal (result , expected )
18791886
1880- def test_basic (self , left , right ):
1887+ def test_basic (self , left , right , using_infer_string ):
18811888 # we have matching Categorical dtypes in X
18821889 # so should preserve the merged column
18831890 merged = merge (left , right , on = "X" )
18841891 result = merged .dtypes .sort_index ()
1892+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
18851893 expected = Series (
18861894 [
18871895 CategoricalDtype (categories = ["foo" , "bar" ]),
1888- np . dtype ( "O" ) ,
1896+ dtype ,
18891897 np .dtype ("int64" ),
18901898 ],
18911899 index = ["X" , "Y" , "Z" ],
@@ -1989,16 +1997,17 @@ def test_multiindex_merge_with_unordered_categoricalindex(self, ordered):
19891997 ).set_index (["id" , "p" ])
19901998 tm .assert_frame_equal (result , expected )
19911999
1992- def test_other_columns (self , left , right ):
2000+ def test_other_columns (self , left , right , using_infer_string ):
19932001 # non-merge columns should preserve if possible
19942002 right = right .assign (Z = right .Z .astype ("category" ))
19952003
19962004 merged = merge (left , right , on = "X" )
19972005 result = merged .dtypes .sort_index ()
2006+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
19982007 expected = Series (
19992008 [
20002009 CategoricalDtype (categories = ["foo" , "bar" ]),
2001- np . dtype ( "O" ) ,
2010+ dtype ,
20022011 CategoricalDtype (categories = [1 , 2 ]),
20032012 ],
20042013 index = ["X" , "Y" , "Z" ],
@@ -2017,7 +2026,9 @@ def test_other_columns(self, left, right):
20172026 lambda x : x .astype (CategoricalDtype (ordered = True )),
20182027 ],
20192028 )
2020- def test_dtype_on_merged_different (self , change , join_type , left , right ):
2029+ def test_dtype_on_merged_different (
2030+ self , change , join_type , left , right , using_infer_string
2031+ ):
20212032 # our merging columns, X now has 2 different dtypes
20222033 # so we must be object as a result
20232034
@@ -2029,9 +2040,8 @@ def test_dtype_on_merged_different(self, change, join_type, left, right):
20292040 merged = merge (left , right , on = "X" , how = join_type )
20302041
20312042 result = merged .dtypes .sort_index ()
2032- expected = Series (
2033- [np .dtype ("O" ), np .dtype ("O" ), np .dtype ("int64" )], index = ["X" , "Y" , "Z" ]
2034- )
2043+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
2044+ expected = Series ([dtype , dtype , np .dtype ("int64" )], index = ["X" , "Y" , "Z" ])
20352045 tm .assert_series_equal (result , expected )
20362046
20372047 def test_self_join_multiple_categories (self ):
@@ -2499,7 +2509,7 @@ def test_merge_multiindex_columns():
24992509 expected_index = MultiIndex .from_tuples (tuples , names = ["outer" , "inner" ])
25002510 expected = DataFrame (columns = expected_index )
25012511
2502- tm .assert_frame_equal (result , expected )
2512+ tm .assert_frame_equal (result , expected , check_dtype = False )
25032513
25042514
25052515def test_merge_datetime_upcast_dtype ():
0 commit comments