@@ -12,19 +12,6 @@ const formatLog = (hit) => {
1212 return [ name , zip , hit . _id ] . filter ( Boolean ) . join ( ' ' ) ;
1313} ;
1414
15- /**
16- * Deduplication workflow:
17- *
18- * 1. iterate over results starting at position 0
19- * 2. on each iteration search for duplicate candidates:
20- * 2.1 at higher positions in array
21- * 2.2 not contained in the skip-list
22- * 3. from the list of candidates, select a preferred master record
23- * 4. push master record on to return array
24- * 5. add non-master candidates to a skip-list
25- * 6. continue down list until end
26- */
27-
2815function dedupeResults ( req , res , next ) {
2916
3017 // do nothing if request data is invalid
@@ -33,72 +20,33 @@ function dedupeResults(req, res, next) {
3320 // do nothing if result data is invalid
3421 if ( _ . isUndefined ( res ) || ! _ . isArray ( res . data ) || _ . isEmpty ( res . data ) ) { return next ( ) ; }
3522
36- // loop through data items and only copy unique items to unique
37- const unique = [ ] ;
38-
39- // maintain a skip-list
40- const skip = [ ] ;
41-
4223 // use the user agent language to improve deduplication
4324 const lang = _ . get ( req , 'clean.lang.iso6393' ) ;
4425
45- // 1. iterate over res.data
46- res . data . forEach ( ( place , ppos ) => {
47-
48- // skip records in the skip-list
49- if ( skip . includes ( place ) ) { return ; }
50-
51- // 2. search for duplicate candidates
52- const candidates = res . data . filter ( ( candidate , cpos ) => {
53-
54- // 2.1 at higher positions in array
55- if ( cpos <= ppos ) { return false ; }
56-
57- // 2.2 not contained in the skip-list
58- if ( skip . includes ( candidate ) ) { return false ; }
26+ // maintain a set of inferior records (by their array offsets)
27+ const inferior = new Set ( ) ;
28+ for ( var i = 0 ; i < ( res . data . length - 1 ) ; i ++ ) {
29+ for ( var j = ( i + 1 ) ; j < res . data . length ; j ++ ) {
5930
60- // true if the two records are considered duplicates
61- return ! isDifferent ( place , candidate , lang ) ;
62- } ) ;
31+ // ensure these two records are considered duplicates
32+ if ( isDifferent ( res . data [ i ] , res . data [ j ] , lang ) ) { continue ; }
6333
64- // 3. select a preferred master record
34+ // decide which of the two records was 'inferior'
35+ // note: $preference equals true when $j is preferred and vice versa
36+ const preference = isPreferred ( res . data [ i ] , res . data [ j ] ) ;
37+ inferior . add ( preference ? i : j ) ;
6538
66- // simple case where no candidates were found
67- if ( candidates . length === 0 ) {
68- unique . push ( place ) ;
69- return ;
70- }
71-
72- // by default we consider the candidate with the lowest index as master
73- let master = place ;
74-
75- // iterate over candidates looking for one which is preferred to
76- // the currently selected master
77- candidates . forEach ( candidate => {
78- if ( isPreferred ( master , candidate ) ) {
79- master = candidate ;
80- }
81- } ) ;
82-
83- // logging
84- if ( master !== place ) {
39+ // logging
8540 logger . debug ( '[dupe][replacing]' , {
8641 query : req . clean . text ,
87- previous : formatLog ( place ) ,
88- hit : formatLog ( master )
42+ superior : formatLog ( res . data [ preference ? j : i ] ) ,
43+ inferior : formatLog ( res . data [ preference ? i : j ] ) ,
8944 } ) ;
9045 }
46+ }
9147
92- // 4. push master record on to return array
93- unique . push ( master ) ;
94-
95- // 5. add non-master candidates to a skip-list
96- candidates . forEach ( candidate => {
97- skip . push ( candidate ) ;
98- } ) ;
99- } ) ;
100-
101- // replace the original data with only the unique hits
48+ // remove inferior records, return the remaining results
49+ const unique = res . data . filter ( ( v , o ) => ! inferior . has ( o ) ) ;
10250 const maxElements = _ . get ( req , 'clean.size' , undefined ) ;
10351 res . data = unique . slice ( 0 , maxElements ) ;
10452
0 commit comments