Skip to content

Commit 560fff8

Browse files
committed
feat(dedupe): simplify deduplication logic
1 parent bc53aee commit 560fff8

2 files changed

Lines changed: 51 additions & 68 deletions

File tree

middleware/dedupe.js

Lines changed: 16 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,6 @@ const formatLog = (hit) => {
1212
return [name, zip, hit._id].filter(Boolean).join(' ');
1313
};
1414

15-
/**
16-
* Deduplication workflow:
17-
*
18-
* 1. iterate over results starting at position 0
19-
* 2. on each iteration search for duplicate candidates:
20-
* 2.1 at higher positions in array
21-
* 2.2 not contained in the skip-list
22-
* 3. from the list of candidates, select a preferred master record
23-
* 4. push master record on to return array
24-
* 5. add non-master candidates to a skip-list
25-
* 6. continue down list until end
26-
*/
27-
2815
function dedupeResults(req, res, next) {
2916

3017
// do nothing if request data is invalid
@@ -33,72 +20,33 @@ function dedupeResults(req, res, next) {
3320
// do nothing if result data is invalid
3421
if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); }
3522

36-
// loop through data items and only copy unique items to unique
37-
const unique = [];
38-
39-
// maintain a skip-list
40-
const skip = [];
41-
4223
// use the user agent language to improve deduplication
4324
const lang = _.get(req, 'clean.lang.iso6393');
4425

45-
// 1. iterate over res.data
46-
res.data.forEach((place, ppos) => {
47-
48-
// skip records in the skip-list
49-
if (skip.includes(place)){ return; }
50-
51-
// 2. search for duplicate candidates
52-
const candidates = res.data.filter((candidate, cpos) => {
53-
54-
// 2.1 at higher positions in array
55-
if (cpos <= ppos) { return false; }
56-
57-
// 2.2 not contained in the skip-list
58-
if (skip.includes(candidate)) { return false; }
26+
// maintain a set of inferior records (by their array offsets)
27+
const inferior = new Set();
28+
for (var i = 0; i < (res.data.length-1); i++) {
29+
for (var j = (i+1); j < res.data.length; j++) {
5930

60-
// true if the two records are considered duplicates
61-
return !isDifferent(place, candidate, lang);
62-
});
31+
// ensure these two records are considered duplicates
32+
if (isDifferent(res.data[i], res.data[j], lang)) { continue; }
6333

64-
// 3. select a preferred master record
34+
// decide which of the two records was 'inferior'
35+
// note: $preference equals true when $j is preferred and vice versa
36+
const preference = isPreferred(res.data[i], res.data[j]);
37+
inferior.add(preference ? i : j);
6538

66-
// simple case where no candidates were found
67-
if (candidates.length === 0){
68-
unique.push(place);
69-
return;
70-
}
71-
72-
// by default we consider the candidate with the lowest index as master
73-
let master = place;
74-
75-
// iterate over candidates looking for one which is preferred to
76-
// the currently selected master
77-
candidates.forEach(candidate => {
78-
if (isPreferred(master, candidate)){
79-
master = candidate;
80-
}
81-
});
82-
83-
// logging
84-
if (master !== place) {
39+
// logging
8540
logger.debug('[dupe][replacing]', {
8641
query: req.clean.text,
87-
previous: formatLog(place),
88-
hit: formatLog(master)
42+
superior: formatLog(res.data[preference ? j : i]),
43+
inferior: formatLog(res.data[preference ? i : j]),
8944
});
9045
}
46+
}
9147

92-
// 4. push master record on to return array
93-
unique.push(master);
94-
95-
// 5. add non-master candidates to a skip-list
96-
candidates.forEach(candidate => {
97-
skip.push(candidate);
98-
});
99-
});
100-
101-
// replace the original data with only the unique hits
48+
// remove inferior records, return the remaining results
49+
const unique = res.data.filter((v, o) => !inferior.has(o));
10250
const maxElements = _.get(req, 'clean.size', undefined);
10351
res.data = unique.slice(0, maxElements);
10452

test/unit/middleware/dedupe.js

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,41 @@ module.exports.tests.priority = function(test, common) {
820820
t.end();
821821
});
822822
});
823+
824+
test('A->B B->C dependency graph', function (t) {
825+
var req = {
826+
clean: {
827+
text: 'A B C',
828+
size: 10
829+
}
830+
};
831+
var res = {
832+
data: [
833+
{
834+
'source': 'example',
835+
'source_id': 'A',
836+
'layer': 'test',
837+
'name': { 'default': ['name1'] }
838+
}, {
839+
'source': 'example',
840+
'source_id': 'B',
841+
'layer': 'test',
842+
'name': { 'default': ['name1', 'name2'] }
843+
}, {
844+
'source': 'example',
845+
'source_id': 'C',
846+
'layer': 'test',
847+
'name': { 'default': ['name2'] }
848+
}
849+
]
850+
};
851+
852+
dedupe(req, res, () => {
853+
t.equal(res.data.length, 1, 'results are deduped');
854+
t.equal(res.data[0].source_id, 'A');
855+
t.end();
856+
});
857+
});
823858
};
824859

825860
module.exports.all = function (tape, common) {

0 commit comments

Comments
 (0)