Skip to content

Commit c06c3e8

Browse files
authored
Merge pull request #571 from echan5/callable-group-by
Add support for callable group_by
2 parents 316f0c6 + 9a673fe commit c06c3e8

3 files changed

Lines changed: 118 additions & 12 deletions

File tree

deepdiff/diff.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ def __init__(self,
157157
exclude_regex_paths: Union[str, List[str], Pattern[str], List[Pattern[str]], None]=None,
158158
exclude_types: Optional[List[type]]=None,
159159
get_deep_distance: bool=False,
160-
group_by: Union[str, Tuple[str, str], None]=None,
160+
group_by: Union[str, Tuple[str, str], Callable, None]=None,
161161
group_by_sort_key: Union[str, Callable, None]=None,
162162
hasher: Optional[Callable]=None,
163163
hashes: Optional[Dict[Any, Any]]=None,
@@ -943,7 +943,7 @@ def _diff_by_forming_pairs_and_comparing_one_by_one(
943943
t2_from_index=None, t2_to_index=None,
944944
):
945945
for (i, j), (x, y) in self._get_matching_pairs(
946-
level,
946+
level,
947947
t1_from_index=t1_from_index, t1_to_index=t1_to_index,
948948
t2_from_index=t2_from_index, t2_to_index=t2_to_index
949949
):
@@ -1835,7 +1835,32 @@ def _get_view_results(self, view, verbose_level=None):
18351835

18361836
@staticmethod
18371837
def _get_key_for_group_by(row, group_by, item_name):
1838+
"""
1839+
Get the key value to group a row by, using the specified group_by parameter. A key-name group_by is popped from the row; a callable group_by is called with the row.
1840+
1841+
Example
1842+
>>> row = {'first': 'John', 'middle': 'Joe', 'last': 'Smith'}
1843+
>>> DeepDiff._get_key_for_group_by(row, 'first', 't1')
1844+
'John'
1845+
>>> nested_row = {'id': 123, 'demographics': {'names': {'first': 'John', 'middle': 'Joe', 'last': 'Smith'}}}
1846+
>>> group_by = lambda x: x['demographics']['names']['first']
1847+
>>> DeepDiff._get_key_for_group_by(nested_row, group_by, 't1')
1848+
'John'
1849+
1850+
Args:
1851+
row (dict): The dictionary (row) to extract the group by key from.
1852+
group_by (str or callable): The key name, or a function that is called with the row and returns the key value to group by.
1853+
item_name (str): The name of the item, used for error messages.
1854+
1855+
Returns:
1856+
The key value to group by: the value stored under the key, or whatever the callable returns (a str in the examples above).
1857+
1858+
Raises:
1859+
KeyError: If the specified key is not found in the row.
1860+
"""
18381861
try:
1862+
if callable(group_by):
1863+
return group_by(row)
18391864
return row.pop(group_by)
18401865
except KeyError:
18411866
logger.error("Unable to group {} by {}. The key is missing in {}".format(item_name, group_by, row))
@@ -1915,13 +1940,13 @@ def affected_paths(self):
19151940
Whether a value was changed or they were added or removed.
19161941
19171942
Example
1943+
>>> from pprint import pprint
19181944
>>> t1 = {1: 1, 2: 2, 3: [3], 4: 4}
19191945
>>> t2 = {1: 1, 2: 4, 3: [3, 4], 5: 5, 6: 6}
19201946
>>> ddiff = DeepDiff(t1, t2)
1921-
>>> ddiff
19221947
>>> pprint(ddiff, indent=4)
1923-
{ 'dictionary_item_added': [root[5], root[6]],
1924-
'dictionary_item_removed': [root[4]],
1948+
{ 'dictionary_item_added': ['root[5]', 'root[6]'],
1949+
'dictionary_item_removed': ['root[4]'],
19251950
'iterable_item_added': {'root[3][1]': 4},
19261951
'values_changed': {'root[2]': {'new_value': 4, 'old_value': 2}}}
19271952
>>> ddiff.affected_paths
@@ -1947,13 +1972,13 @@ def affected_root_keys(self):
19471972
Whether a value was changed or they were added or removed.
19481973
19491974
Example
1975+
>>> from pprint import pprint
19501976
>>> t1 = {1: 1, 2: 2, 3: [3], 4: 4}
19511977
>>> t2 = {1: 1, 2: 4, 3: [3, 4], 5: 5, 6: 6}
19521978
>>> ddiff = DeepDiff(t1, t2)
1953-
>>> ddiff
19541979
>>> pprint(ddiff, indent=4)
1955-
{ 'dictionary_item_added': [root[5], root[6]],
1956-
'dictionary_item_removed': [root[4]],
1980+
{ 'dictionary_item_added': ['root[5]', 'root[6]'],
1981+
'dictionary_item_removed': ['root[4]'],
19571982
'iterable_item_added': {'root[3][1]': 4},
19581983
'values_changed': {'root[2]': {'new_value': 4, 'old_value': 2}}}
19591984
>>> ddiff.affected_paths

docs/basics.rst

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,8 @@ String difference 2
8989

9090
>>>
9191
>>> print (ddiff['values_changed']["root[4]['b']"]["diff"])
92-
---
93-
+++
92+
---
93+
+++
9494
@@ -1,5 +1,4 @@
9595
-world!
9696
-Goodbye!
@@ -172,7 +172,7 @@ Datetime
172172
Group By
173173
--------
174174

175-
group_by can be used when dealing with the list of dictionaries. It converts them from lists to a single dictionary with the key defined by group_by. The common use case is when reading data from a flat CSV, and the primary key is one of the columns in the CSV. We want to use the primary key instead of the CSV row number to group the rows. The group_by can do 2D group_by by passing a list of 2 keys.
175+
group_by can be used when dealing with a list of dictionaries. It converts them from lists to a single dictionary with the key defined by group_by. The common use case is when reading data from a flat CSV, and the primary key is one of the columns in the CSV. We want to use the primary key instead of the CSV row number to group the rows. The group_by can do 2D group_by by passing a list of 2 keys. A callable group_by is also supported: it is called with each row and returns the value to group by, which is useful when the key lives deeper inside a nested data structure.
176176

177177
For example:
178178
>>> [
@@ -249,14 +249,36 @@ Now we use group_by='id':
249249
'values_changed': {"root['BB']['James']['last_name']": {'new_value': 'Brown',
250250
'old_value': 'Blue'}}}
251251

252+
Callable group_by Example:
253+
>>> from deepdiff import DeepDiff
254+
>>>
255+
>>> t1 = [
256+
... {'id': 'AA', 'demographics': {'names': {'first': 'Joe', 'middle': 'John', 'last': 'Nobody'}}},
257+
... {'id': 'BB', 'demographics': {'names': {'first': 'James', 'middle': 'Joyce', 'last': 'Blue'}}},
258+
... {'id': 'CC', 'demographics': {'names': {'first': 'Mike', 'middle': 'Mark', 'last': 'Apple'}}},
259+
... ]
260+
>>>
261+
>>> t2 = [
262+
... {'id': 'AA', 'demographics': {'names': {'first': 'Joe', 'middle': 'John', 'last': 'Nobody'}}},
263+
... {'id': 'BB', 'demographics': {'names': {'first': 'James', 'middle': 'Joyce', 'last': 'Brown'}}},
264+
... {'id': 'CC', 'demographics': {'names': {'first': 'Mike', 'middle': 'Charles', 'last': 'Apple'}}},
265+
... ]
266+
>>>
267+
>>> diff = DeepDiff(t1, t2, group_by=lambda x: x['demographics']['names']['first'])
268+
>>> pprint(diff)
269+
{'values_changed': {"root['James']['demographics']['names']['last']": {'new_value': 'Brown',
270+
'old_value': 'Blue'},
271+
"root['Mike']['demographics']['names']['middle']": {'new_value': 'Charles',
272+
'old_value': 'Mark'}}}
273+
252274
.. _group_by_sort_key_label:
253275

254276
Group By - Sort Key
255277
-------------------
256278

257279
group_by_sort_key is used to define how dictionaries are sorted if multiple ones fall under one group. When this parameter is used, group_by converts the lists of dictionaries into a dictionary of keys to lists of dictionaries. Then, group_by_sort_key is used to sort the dictionaries within each list.
258280

259-
For example, there are duplicate id values. If we only use group_by='id', one of the dictionaries with id of 'BB' will overwrite the other. However, if we also set group_by_sort_key='name', we keep both dictionaries with the id of 'BB'.
281+
For example, there are duplicate id values. If we only use group_by='id', one of the dictionaries with id of 'BB' will overwrite the other. However, if we also set group_by_sort_key='name', we keep both dictionaries with the id of 'BB'.
260282

261283
Example:
262284
>>> [{'id': 'AA', 'int_id': 2, 'last_name': 'Nobody', 'name': 'Joe'},

tests/test_diff_group_by.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""Tests for the group_by parameter of DeepDiff"""
2+
3+
import pytest
4+
5+
from deepdiff import DeepDiff
6+
7+
8+
class TestGetKeyForGroupBy:
9+
def test_group_by_string(self):
10+
"""Test where group_by is a single key (string)."""
11+
row = {'first': 'John', 'middle': 'Joe', 'last': 'Smith'}
12+
group_by = 'first'
13+
item_name = 't1'
14+
actual = DeepDiff._get_key_for_group_by(row, group_by, item_name)
15+
expected = 'John'
16+
17+
assert actual == expected
18+
19+
def test_group_by_callable(self):
20+
"""Test where group_by is callable."""
21+
row = {'id': 123, 'demographics': {'names': {'first': 'John', 'middle': 'Joe', 'last': 'Smith'}}}
22+
group_by = lambda x: x['demographics']['names']['first']
23+
item_name = 't1'
24+
actual = DeepDiff._get_key_for_group_by(row, group_by, item_name)
25+
expected = 'John'
26+
assert actual == expected
27+
28+
def test_group_by_key_error(self):
29+
"""Test where group_by is a key that is not in the row."""
30+
row = {'id': 123, 'demographics': {'names': {'first': 'John', 'middle': 'Joe', 'last': 'Smith'}}}
31+
group_by = 'someotherkey'
32+
item_name = 't1'
33+
with pytest.raises(KeyError):
34+
DeepDiff._get_key_for_group_by(row, group_by, item_name)
35+
36+
37+
class TestGroupBy:
38+
def test_group_by_callable(self):
39+
"""Test where group_by is a callable."""
40+
t1 = [
41+
{'id': 'AA', 'demographics': {'names': {'first': 'Joe', 'middle': 'John', 'last': 'Nobody'}}},
42+
{'id': 'BB', 'demographics': {'names': {'first': 'James', 'middle': 'Joyce', 'last': 'Blue'}}},
43+
{'id': 'CC', 'demographics': {'names': {'first': 'Mike', 'middle': 'Mark', 'last': 'Apple'}}},
44+
]
45+
46+
t2 = [
47+
{'id': 'AA', 'demographics': {'names': {'first': 'Joe', 'middle': 'John', 'last': 'Nobody'}}},
48+
{'id': 'BB', 'demographics': {'names': {'first': 'James', 'middle': 'Joyce', 'last': 'Brown'}}},
49+
{'id': 'CC', 'demographics': {'names': {'first': 'Mike', 'middle': 'Charles', 'last': 'Apple'}}},
50+
]
51+
52+
actual = DeepDiff(t1, t2, group_by=lambda x: x['demographics']['names']['first'])
53+
expected = {
54+
'values_changed': {
55+
"root['James']['demographics']['names']['last']": {'new_value': 'Brown', 'old_value': 'Blue'},
56+
"root['Mike']['demographics']['names']['middle']": {'new_value': 'Charles', 'old_value': 'Mark'},
57+
},
58+
}
59+
assert actual == expected

0 commit comments

Comments
 (0)