Skip to content

Commit

Permalink
feat(dedupe): simplify deduplication logic - preserve order
Browse files Browse the repository at this point in the history
  • Loading branch information
missinglink committed Mar 3, 2022
1 parent 560fff8 commit ee7998e
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 9 deletions.
23 changes: 16 additions & 7 deletions middleware/dedupe.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,15 @@ function dedupeResults(req, res, next) {
// use the user agent language to improve deduplication
const lang = _.get(req, 'clean.lang.iso6393');

// maintain a set of inferior records (by their array offsets)
// maintain a set of inferior records
const inferior = new Set();
for (var i = 0; i < (res.data.length-1); i++) {

// maintain a set of superior records
// note: this set maintains ordering of synonymous records
// while also preventing duplicates.
const superior = new Set();

for (var i = 0; i < res.data.length; i++) {
for (var j = (i+1); j < res.data.length; j++) {

// ensure these two records are considered duplicates
Expand All @@ -34,21 +40,24 @@ function dedupeResults(req, res, next) {
// decide which of the two records was 'inferior'
// note: $preference equals true when $j is preferred and vice versa
const preference = isPreferred(res.data[i], res.data[j]);
inferior.add(preference ? i : j);
superior.add(preference ? res.data[j] : res.data[i]);
inferior.add(preference ? res.data[i] : res.data[j]);

// logging
logger.debug('[dupe][replacing]', {
query: req.clean.text,
superior: formatLog(res.data[preference ? j : i]),
inferior: formatLog(res.data[preference ? i : j]),
superior: formatLog(preference ? res.data[j] : res.data[i]),
inferior: formatLog(preference ? res.data[i] : res.data[j]),
});
}

superior.add(res.data[i]);
}

// remove inferior records, return the remaining results
const unique = res.data.filter((v, o) => !inferior.has(o));
const result = Array.from(superior).filter(v => !inferior.has(v));
const maxElements = _.get(req, 'clean.size', undefined);
res.data = unique.slice(0, maxElements);
res.data = result.slice(0, maxElements);

next();
}
Expand Down
78 changes: 76 additions & 2 deletions test/unit/middleware/dedupe.js
Original file line number Diff line number Diff line change
Expand Up @@ -850,8 +850,82 @@ module.exports.tests.priority = function(test, common) {
};

dedupe(req, res, () => {
t.equal(res.data.length, 1, 'results are deduped');
t.equal(res.data[0].source_id, 'A');
t.deepEqual(res.data.map(v => v.source_id), ['A']);
t.end();
});
});

// Position substitution with three records: A and C share the same default
// name, B has a different one. The deduped output is expected to list C
// first (in the slot A occupied), followed by B.
test('A B C->A position substitution', function (t) {
  const req = {
    clean: { text: 'A B C', size: 10 }
  };

  // Only C carries address_parts.zip — presumably this is what makes it the
  // preferred record over A; confirm against isPreferred().
  const records = [
    { source: 'example', source_id: 'A', layer: 'test', name: { default: ['name2'] } },
    { source: 'example', source_id: 'B', layer: 'test', name: { default: ['name1'] } },
    {
      source: 'example',
      source_id: 'C',
      layer: 'test',
      name: { default: ['name2'] },
      address_parts: { zip: '10000' }
    }
  ];
  const res = { data: records };
  const expected = ['C', 'B'];

  dedupe(req, res, function () {
    // read res.data after the middleware ran: it is replaced, not mutated in place
    const ids = res.data.map(function (record) {
      return record.source_id;
    });
    t.deepEqual(ids, expected);
    t.end();
  });
});

// Position substitution with four records: A, B and D share the same default
// name, C has a different one. The deduped output is expected to contain
// only D (listed first) and C.
test('A->B C D->A position substitution', function (t) {
  const req = {
    clean: { text: 'A B C', size: 10 }
  };

  // Only D carries address_parts.zip — presumably this is what makes it the
  // preferred record over A and B; confirm against isPreferred().
  const records = [
    { source: 'example', source_id: 'A', layer: 'test', name: { default: ['name1'] } },
    { source: 'example', source_id: 'B', layer: 'test', name: { default: ['name1'] } },
    { source: 'example', source_id: 'C', layer: 'test', name: { default: ['name2'] } },
    {
      source: 'example',
      source_id: 'D',
      layer: 'test',
      name: { default: ['name1'] },
      address_parts: { zip: '10000' }
    }
  ];
  const res = { data: records };
  const expected = ['D', 'C'];

  dedupe(req, res, function () {
    // read res.data after the middleware ran: it is replaced, not mutated in place
    const ids = res.data.map(function (record) {
      return record.source_id;
    });
    t.deepEqual(ids, expected);
    t.end();
  });
});
Expand Down

0 comments on commit ee7998e

Please sign in to comment.