From 8667459a6afb6a298099ab51a6b43c0cfd6a4188 Mon Sep 17 00:00:00 2001
From: AKHIL-149
Date: Tue, 9 Dec 2025 13:52:36 -0600
Subject: [PATCH 1/2] Fix pivot_table corruption with large datasets in Python 3.14

This commit addresses issue GH#63314, where pivot_table operations on large
datasets produce corrupted output with duplicate index values when running
on Python 3.14. The root cause appears to be changes in Python 3.14's
hashtable implementation or dictionary behavior: the compress_group_index
function relied on Int64HashTable.get_labels_groupby(), which produces
incorrect results for large datasets on Python 3.14.

The fix uses a numpy-based approach for Python 3.14+ that:

- Explicitly sorts the group_index when needed
- Uses numpy operations to identify unique values
- Maps compressed IDs back to the original order
- Preserves the existing hashtable-based path for older Python versions

Added a regression test to ensure pivot_table correctly handles large
datasets without producing duplicate indices.
---
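Note (below the "---" marker, so git am ignores it): a minimal standalone
sketch of the numpy-based compression described above, assuming int64 codes
where -1 marks nulls. The names here are illustrative, not the pandas
internals.

    import numpy as np

    def compress_codes_sketch(group_index: np.ndarray):
        # Stable sort so equal codes become adjacent.
        sorted_idx = np.argsort(group_index, kind="stable")
        sorted_codes = group_index[sorted_idx]
        # Inverse permutation: where each original element landed.
        unsort_idx = np.empty_like(sorted_idx)
        unsort_idx[sorted_idx] = np.arange(len(sorted_idx))
        # True at the first occurrence of each distinct non-null code.
        unique_mask = np.concatenate(
            [sorted_codes[:1] > -1, sorted_codes[1:] != sorted_codes[:-1]]
        )
        # Running count of uniques minus one gives the compressed id;
        # nulls (-1) sort first and end up with comp_id == -1.
        comp_ids_sorted = unique_mask.cumsum() - 1
        obs_group_ids = sorted_codes[unique_mask]
        # Map compressed ids back to the original row order.
        return comp_ids_sorted[unsort_idx], obs_group_ids

    # e.g. [5, 2, 2, -1] -> comp_ids [1, 0, 0, -1], obs_group_ids [2, 5]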
 pandas/core/sorting.py             | 34 ++++++++++++++++++++++----
 pandas/tests/reshape/test_pivot.py | 38 ++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 930704e6f62f4..77a540f33927e 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -680,14 +680,38 @@ def compress_group_index(
     space can be huge, so this function compresses it, by computing offsets
     (comp_ids) into the list of unique labels (obs_group_ids).
     """
-    if len(group_index) and np.all(group_index[1:] >= group_index[:-1]):
+    import sys
+
+    # Use numpy-based approach for Python 3.14+ to avoid hashtable issues
+    if sys.version_info >= (3, 14) or (len(group_index) and np.all(group_index[1:] >= group_index[:-1])):
         # GH 53806: fast path for sorted group_index
+        # GH 63314: also use for Python 3.14+ due to hashtable behavior changes
+        if len(group_index) == 0:
+            return ensure_int64(np.array([], dtype=np.int64)), ensure_int64(np.array([], dtype=np.int64))
+
+        # Sort if needed
+        if not np.all(group_index[1:] >= group_index[:-1]):
+            sorted_idx = np.argsort(group_index, kind='stable')
+            sorted_group_index = group_index[sorted_idx]
+            unsort_idx = np.empty_like(sorted_idx)
+            unsort_idx[sorted_idx] = np.arange(len(sorted_idx))
+        else:
+            sorted_group_index = group_index
+            unsort_idx = None
+
         unique_mask = np.concatenate(
-            [group_index[:1] > -1, group_index[1:] != group_index[:-1]]
+            [sorted_group_index[:1] > -1, sorted_group_index[1:] != sorted_group_index[:-1]]
         )
-        comp_ids = unique_mask.cumsum()
-        comp_ids -= 1
-        obs_group_ids = group_index[unique_mask]
+        comp_ids_sorted = unique_mask.cumsum() - 1
+        obs_group_ids = sorted_group_index[unique_mask]
+
+        if unsort_idx is not None:
+            comp_ids = comp_ids_sorted[unsort_idx]
+        else:
+            comp_ids = comp_ids_sorted
+
+        if sort and not np.all(obs_group_ids[1:] >= obs_group_ids[:-1]):
+            obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
     else:
         size_hint = len(group_index)
         table = hashtable.Int64HashTable(size_hint)
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 5f47fb8f992d0..6b5cfe982871c 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -2959,3 +2959,41 @@ def test_pivot_empty_dataframe_period_dtype(self, freq):
         )
 
         tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_large_dataset_no_duplicates(self):
+        # GH 63314: pivot_table with large datasets should not produce duplicate indices
+        # This test ensures that the fix for Python 3.14 hashtable issues works correctly
+        n_indices = 10000
+        metrics = ["apple", "banana", "coconut"]
+
+        data = []
+        for i in range(n_indices):
+            for metric in metrics:
+                data.append({
+                    "idx": f"id_{i}",
+                    "metric": metric,
+                    "value": i * 10 + len(metric)
+                })
+
+        df = DataFrame(data)
+
+        result = df.pivot_table(
+            index=["idx"],
+            columns="metric",
+            values="value",
+            aggfunc="first",
+        )
+
+        # Verify no duplicate indices in the result
+        assert len(result.index) == len(result.index.unique()), \
+            f"Expected {len(result.index.unique())} unique indices, got {len(result.index)}"
+
+        # Verify we have the expected number of rows
+        assert len(result) == n_indices, \
+            f"Expected {n_indices} rows, got {len(result)}"
+
+        # Verify all expected indices are present
+        expected_indices = {f"id_{i}" for i in range(n_indices)}
+        actual_indices = set(result.index)
+        assert expected_indices == actual_indices, \
+            "Result indices don't match expected indices"

From c5e3a6063eefeb441bd61079ec0ae5234b39b515 Mon Sep 17 00:00:00 2001
From: AKHIL-149
Date: Tue, 9 Dec 2025 14:49:37 -0600
Subject: [PATCH 2/2] Fix linting errors (line length and code style)

- Break long lines to comply with the 88-character limit
- Use a list comprehension instead of appending in a loop
- Improve code readability with multi-line formatting
---
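Note (ignored by git am): the reformatted hunk below keeps the
inverse-permutation idiom introduced in PATCH 1/2. A quick illustration of
why it works, with made-up values:

    import numpy as np

    codes = np.array([30, 10, 20])
    sorted_idx = np.argsort(codes, kind="stable")  # [1, 2, 0]
    unsort_idx = np.empty_like(sorted_idx)
    unsort_idx[sorted_idx] = np.arange(len(sorted_idx))
    # unsort_idx is [2, 0, 1]: codes[i] lands at position unsort_idx[i]
    # after sorting, so indexing any sorted array with unsort_idx
    # restores the original order.
    assert (codes[sorted_idx][unsort_idx] == codes).all()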
 pandas/core/sorting.py             | 15 +++++++++++----
 pandas/tests/reshape/test_pivot.py | 29 +++++++++++++++-------------
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 77a540f33927e..ee958b15ee78d 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -683,15 +683,19 @@ def compress_group_index(
     import sys
 
     # Use numpy-based approach for Python 3.14+ to avoid hashtable issues
-    if sys.version_info >= (3, 14) or (len(group_index) and np.all(group_index[1:] >= group_index[:-1])):
+    is_sorted = len(group_index) and np.all(
+        group_index[1:] >= group_index[:-1]
+    )
+    if sys.version_info >= (3, 14) or is_sorted:
         # GH 53806: fast path for sorted group_index
         # GH 63314: also use for Python 3.14+ due to hashtable behavior changes
         if len(group_index) == 0:
-            return ensure_int64(np.array([], dtype=np.int64)), ensure_int64(np.array([], dtype=np.int64))
+            empty_arr = np.array([], dtype=np.int64)
+            return ensure_int64(empty_arr), ensure_int64(empty_arr)
 
         # Sort if needed
         if not np.all(group_index[1:] >= group_index[:-1]):
-            sorted_idx = np.argsort(group_index, kind='stable')
+            sorted_idx = np.argsort(group_index, kind="stable")
             sorted_group_index = group_index[sorted_idx]
             unsort_idx = np.empty_like(sorted_idx)
             unsort_idx[sorted_idx] = np.arange(len(sorted_idx))
@@ -700,7 +704,10 @@ def compress_group_index(
             unsort_idx = None
 
         unique_mask = np.concatenate(
-            [sorted_group_index[:1] > -1, sorted_group_index[1:] != sorted_group_index[:-1]]
+            [
+                sorted_group_index[:1] > -1,
+                sorted_group_index[1:] != sorted_group_index[:-1],
+            ]
         )
         comp_ids_sorted = unique_mask.cumsum() - 1
         obs_group_ids = sorted_group_index[unique_mask]
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 6b5cfe982871c..ecfcb289777cf 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -2961,19 +2961,16 @@ def test_pivot_empty_dataframe_period_dtype(self, freq):
         tm.assert_frame_equal(result, expected)
 
     def test_pivot_table_large_dataset_no_duplicates(self):
-        # GH 63314: pivot_table with large datasets should not produce duplicate indices
-        # This test ensures that the fix for Python 3.14 hashtable issues works correctly
+        # GH 63314: pivot_table with large datasets should not produce
+        # duplicate indices. This test ensures the Python 3.14 fix works.
         n_indices = 10000
         metrics = ["apple", "banana", "coconut"]
 
-        data = []
-        for i in range(n_indices):
-            for metric in metrics:
-                data.append({
-                    "idx": f"id_{i}",
-                    "metric": metric,
-                    "value": i * 10 + len(metric)
-                })
+        data = [
+            {"idx": f"id_{i}", "metric": metric, "value": i * 10 + len(metric)}
+            for i in range(n_indices)
+            for metric in metrics
+        ]
 
         df = DataFrame(data)
 
@@ -2985,15 +2982,19 @@ def test_pivot_table_large_dataset_no_duplicates(self):
         )
 
         # Verify no duplicate indices in the result
-        assert len(result.index) == len(result.index.unique()), \
-            f"Expected {len(result.index.unique())} unique indices, got {len(result.index)}"
+        n_unique = len(result.index.unique())
+        assert len(result.index) == n_unique, (
+            f"Expected {n_unique} unique indices, got {len(result.index)}"
+        )
 
         # Verify we have the expected number of rows
-        assert len(result) == n_indices, \
+        assert len(result) == n_indices, (
             f"Expected {n_indices} rows, got {len(result)}"
+        )
 
         # Verify all expected indices are present
         expected_indices = {f"id_{i}" for i in range(n_indices)}
         actual_indices = set(result.index)
-        assert expected_indices == actual_indices, \
+        assert expected_indices == actual_indices, (
             "Result indices don't match expected indices"
+        )
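Verification note (illustrative, not part of the patches): the GH#63314
scenario can be exercised directly through the public API, mirroring the
regression test above:

    import pandas as pd

    n = 10000
    df = pd.DataFrame(
        [
            {"idx": f"id_{i}", "metric": m, "value": i}
            for i in range(n)
            for m in ("apple", "banana", "coconut")
        ]
    )
    result = df.pivot_table(
        index="idx", columns="metric", values="value", aggfunc="first"
    )
    # On an affected build, result.index contains duplicates; with the
    # fix it has exactly n unique entries.
    assert not result.index.duplicated().any()
    assert len(result) == n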