-
-
Notifications
You must be signed in to change notification settings - Fork 19.4k
BUG: for ordered categorical data implements correct computation of kendall/spearman correlations #62880
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
BUG: for ordered categorical data implements correct computation of kendall/spearman correlations #62880
Changes from all commits
1f8c628
906f1e4
497dc7e
583aca6
e069810
b90726f
65a506c
ab3b8b9
e93ed83
ec4d97e
ebfc3b0
8cfacef
7ef7fb2
588808a
c484552
216475c
e997747
4184167
8bcd3dc
2673281
ff48847
1c69e29
8b26a7d
a625520
259424e
f141e6a
d2d0f71
858d0c2
a8c88c7
1a472e3
71305aa
89e30cc
9aa9ece
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| """ | ||
| Module for correlation related implementation | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from typing import TYPE_CHECKING | ||
|
|
||
| import numpy as np | ||
|
|
||
| from pandas.core.dtypes.dtypes import CategoricalDtype | ||
|
|
||
| if TYPE_CHECKING: | ||
| from pandas import DataFrame | ||
|
|
||
|
|
||
| def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame: | ||
| """ | ||
| Replace ordered categoricals with their codes, making a shallow copy if necessary. | ||
| """ | ||
|
|
||
| result = df | ||
| made_copy = False | ||
| for idx, dtype in enumerate(df.dtypes): | ||
| if not isinstance(dtype, CategoricalDtype) or not dtype.ordered: | ||
| continue | ||
| col = result._ixs(idx, axis=1) | ||
| if not made_copy: | ||
| made_copy = True | ||
| result = result.copy(deep=False) | ||
| result._iset_item(idx, col.cat.codes.replace(-1, np.nan)) | ||
| return result |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,5 @@ | ||
| from itertools import combinations | ||
|
|
||
| import numpy as np | ||
| import pytest | ||
|
|
||
|
|
@@ -252,6 +254,63 @@ def test_corr_numeric_only(self, meth, numeric_only): | |
| with pytest.raises(ValueError, match="could not convert string to float"): | ||
| df.corr(meth, numeric_only=numeric_only) | ||
|
|
||
| @pytest.mark.parametrize("method", ["kendall", "spearman"]) | ||
| @td.skip_if_no("scipy") | ||
| def test_corr_rank_ordered_categorical( | ||
| self, | ||
| method, | ||
| ): | ||
| # GH #60306 | ||
| df = DataFrame( | ||
| { | ||
| "ord_cat": pd.Categorical( | ||
| ["low", "m", "h", "vh"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ), | ||
| "ord_cat_none": pd.Categorical( | ||
| ["low", "m", "h", None], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ), | ||
| "ord_cat_shuff": pd.Categorical( | ||
| ["m", "h", "vh", "low"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ), | ||
|
Comment on lines
+266
to
+280
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you use 2 |
||
| } | ||
| ) | ||
| corr_calc = df.corr(method=method) | ||
| for col1, col2 in combinations(df.columns, r=2): | ||
| corr_expected = df[col1].corr(df[col2], method=method) | ||
| tm.assert_almost_equal(corr_calc[col1][col2], corr_expected) | ||
|
|
||
| @pytest.mark.parametrize("method", ["kendall", "spearman"]) | ||
| @td.skip_if_no("scipy") | ||
| def test_corr_rank_ordered_categorical_duplicate_columns( | ||
| self, | ||
| method, | ||
| ): | ||
| # GH #60306 | ||
| cat = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True) | ||
| df = DataFrame( | ||
| { | ||
| "a": pd.array([1, 2, 3, 4], dtype=cat), | ||
| "b": pd.array([4, 3, 2, 1], dtype=cat), | ||
| "c": [4, 3, 2, 1], | ||
| "d": [10, 20, 30, 40], | ||
| "e": [100, 200, 300, 400], | ||
|
Comment on lines
+298
to
+302
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here |
||
| } | ||
| ) | ||
| df.columns = ["a", "a", "c", "c", "e"] | ||
|
|
||
| corr_calc = df.corr(method=method) | ||
| for col1_idx, col2_idx in combinations(range(len(df.columns)), r=2): | ||
| corr_expected = df.iloc[:, col1_idx].corr( | ||
| df.iloc[:, col2_idx], method=method | ||
| ) | ||
| tm.assert_almost_equal(corr_calc.iloc[col1_idx, col2_idx], corr_expected) | ||
|
|
||
|
|
||
| class TestDataFrameCorrWith: | ||
| @pytest.mark.parametrize( | ||
|
|
@@ -493,3 +552,51 @@ def test_cov_with_missing_values(self): | |
| result2 = df.dropna().cov() | ||
| tm.assert_frame_equal(result1, expected) | ||
| tm.assert_frame_equal(result2, expected) | ||
|
|
||
| @pytest.mark.parametrize("method", ["kendall", "spearman"]) | ||
| def test_corr_rank_ordered_categorical( | ||
| self, | ||
| method, | ||
| ): | ||
| # GH #60306 | ||
| pytest.importorskip("scipy") | ||
| df1 = DataFrame( | ||
| { | ||
| "a": Series( | ||
| pd.Categorical( | ||
| ["low", "m", "h", "vh"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "b": Series( | ||
| pd.Categorical( | ||
| ["low", "m", "h", None], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "c": Series([0, 1, 2, 3]), | ||
| "d": Series([2.0, 3.0, 4.5, 6.5]), | ||
|
Comment on lines
+565
to
+580
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here. And could you remove the call to |
||
| } | ||
| ) | ||
|
|
||
| df2 = DataFrame( | ||
| { | ||
| "a": Series([2.0, 3.0, 4.5, np.nan]), | ||
| "b": Series( | ||
| pd.Categorical( | ||
| ["m", "h", "vh", "low"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "c": Series([2, 3, 0, 1]), | ||
| "d": Series([2.0, 3.0, 4.5, 6.5]), | ||
| } | ||
| ) | ||
|
|
||
| corr_calc = df1.corrwith(df2, method=method) | ||
| for col in df1.columns: | ||
| corr_expected = df1[col].corr(df2[col], method=method) | ||
| tm.assert_almost_equal(corr_calc.get(col), corr_expected) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,150 @@ | ||
| """ | ||
| Tests for core/methods/corr.py | ||
| """ | ||
|
|
||
| import numpy as np | ||
| import pytest | ||
|
|
||
| from pandas import ( | ||
| Categorical, | ||
| DataFrame, | ||
| Series, | ||
| ) | ||
| import pandas._testing as tm | ||
| from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| ("input_df", "expected_df"), | ||
| [ | ||
| pytest.param( | ||
| # 1) Simple: two ordered categorical columns (with and without None) | ||
| DataFrame( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you move the
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And the |
||
| { | ||
| "ord_cat": Series( | ||
| Categorical( | ||
| ["low", "m", "h", "vh"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "ord_cat_none": Series( | ||
| Categorical( | ||
| ["low", "m", "h", None], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| } | ||
| ), | ||
| DataFrame( | ||
| { | ||
| # codes: low=0, m=1, h=2, vh=3 | ||
| "ord_cat": Series([0, 1, 2, 3], dtype="int8"), | ||
| # codes: low=0, m=1, h=2, None -> NaN | ||
| "ord_cat_none": Series([0, 1.0, 2.0, np.nan]), | ||
| } | ||
| ), | ||
| id="ordered-categoricals-basic", | ||
| ), | ||
| pytest.param( | ||
| # 2) Mixed dtypes: only the ordered categorical should change | ||
| DataFrame( | ||
| { | ||
| "ordered": Series( | ||
| Categorical( | ||
| ["a", "c", "b"], | ||
| categories=["a", "b", "c"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "unordered": Series(Categorical(["x", "y", "x"], ordered=False)), | ||
| "num": Series([10, 20, 30]), | ||
| "text": Series(["u", "v", "w"]), | ||
| } | ||
| ), | ||
| DataFrame( | ||
| { | ||
| # codes: a=0, c=2, b=1 | ||
| "ordered": Series([0, 2, 1], dtype="int8"), | ||
| # unordered categorical should be untouched (still categorical) | ||
| "unordered": Series(Categorical(["x", "y", "x"], ordered=False)), | ||
| "num": Series([10, 20, 30]), | ||
| "text": Series(["u", "v", "w"]), | ||
| } | ||
| ), | ||
| id="mixed-types-only-ordered-changes", | ||
| ), | ||
| pytest.param( | ||
| # 3 Duplicate column names: first 'dup' is ordered categorical, | ||
| # second 'dup' is non-categorical | ||
| DataFrame( | ||
| { | ||
| "dup_1": Series( | ||
| Categorical( | ||
| ["low", "m", "h"], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "dup_2": Series([5, 6, 7]), # duplicate name, later column | ||
| } | ||
| ), | ||
| DataFrame( | ||
| { | ||
| # After transform: position 0 (ordered cat) becomes codes [0,1,2], | ||
| # position 1 remains untouched numbers [5,6,7]. | ||
| "dup_1": Series([0, 1, 2], dtype="int8"), | ||
| "dup_2": Series([5, 6, 7]), | ||
| } | ||
| ), | ||
| id="duplicate-names-ordered-first", | ||
| ), | ||
| pytest.param( | ||
| # 4 Duplicate column names: first 'dup' is non-categorical, | ||
| # second 'dup' is ordered categorical, third 'dup' is ordered categorical | ||
| DataFrame( | ||
| { | ||
| "dup_1": Series(["a", "b", "c"]), # non-categorical (object) | ||
| "dup_2": Series( | ||
| Categorical( | ||
| ["p", "q", None], | ||
| categories=["p", "q"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "dup_3": Series( | ||
| Categorical( | ||
| ["low", "m", "h"], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| } | ||
| ), | ||
| DataFrame( | ||
| { | ||
| # First stays object; second turns into codes [0, 1, NaN] | ||
| # and third changes into codes [0, 1, 2] | ||
| "dup_1": Series(["a", "b", "c"]), | ||
| "dup_2": Series([0.0, 1.0, np.nan]), | ||
| "dup_3": Series([0, 1, 2], dtype="int8"), | ||
| } | ||
| ), | ||
| id="duplicate-names-ordered-and-non-categorical-and-none", | ||
| ), | ||
| ], | ||
| ) | ||
| def test_transform_ord_cat_cols_to_coded_cols( | ||
| input_df: DataFrame, expected_df: DataFrame | ||
| ) -> None: | ||
| # GH #60306 | ||
| # duplicate columns creation for dup columns | ||
| if "dup_1" in input_df.columns: | ||
| input_df.columns = ["dup" for _ in range(len(input_df.columns))] | ||
| expected_df.columns = ["dup" for _ in range(len(expected_df.columns))] | ||
|
Comment on lines
+143
to
+145
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you make this case a separate test so that the input data can be constructed directly instead of with this work around? |
||
|
|
||
| out_df = transform_ord_cat_cols_to_coded_cols(input_df) | ||
| assert list(out_df.columns) == list(expected_df.columns) | ||
| for i, col in enumerate(out_df.columns): | ||
| tm.assert_series_equal(out_df.iloc[:, i], expected_df.iloc[:, i]) | ||
|
Comment on lines
+149
to
+150
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you use |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add
# GH #60306to the start of each test.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done