From 94c9b4a6afc64b73884636e86c0e86471e64c59f Mon Sep 17 00:00:00 2001 From: Jones Magloire Date: Wed, 5 Jun 2019 14:16:25 +0200 Subject: [PATCH] feat(normalizer): Add normalizer for WOF which can be configured to remove accents/hyphen/spaces and do lowercase (#33) --- classifier/WhosOnFirstClassifier.js | 7 ++- package.json | 3 +- resources/whosonfirst/whosonfirst.js | 3 ++ test/address.fra.test.js | 16 +++++++ tokenization/normalizer.js | 22 +++++++++ tokenization/normalizer.test.js | 68 ++++++++++++++++++++++++++++ 6 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 tokenization/normalizer.js create mode 100644 tokenization/normalizer.test.js diff --git a/classifier/WhosOnFirstClassifier.js b/classifier/WhosOnFirstClassifier.js index dfe1e14c..06086f4d 100644 --- a/classifier/WhosOnFirstClassifier.js +++ b/classifier/WhosOnFirstClassifier.js @@ -5,6 +5,7 @@ const CountryClassification = require('../classification/CountryClassification') const RegionClassification = require('../classification/RegionClassification') const LocalityClassification = require('../classification/LocalityClassification') const whosonfirst = require('../resources/whosonfirst/whosonfirst') +const normalize = require('../tokenization/normalizer')({ lowercase: true, removeHyphen: true, removeAccents: true }) // databases sourced from the WhosOnFirst project // see: https://whosonfirst.org @@ -35,7 +36,8 @@ class WhosOnFirstClassifier extends PhraseClassifier { Object.keys(placetypes).forEach(placetype => { this.tokens[placetype] = new Set() whosonfirst.load(this.tokens[placetype], [placetype], placetypes[placetype].files, { - minlength: 2 + minlength: 2, + normalizer: normalize }) // general blacklist @@ -102,8 +104,9 @@ class WhosOnFirstClassifier extends PhraseClassifier { return } + const normalizedSpan = normalize(span.norm) Object.keys(placetypes).forEach(placetype => { - if (this.tokens[placetype].has(span.norm)) { + if (this.tokens[placetype].has(normalizedSpan)) { 
// do not classify tokens if they already have a 'StopWordClassification' if ( span.classifications.hasOwnProperty('StopWordClassification') || ( diff --git a/package.json b/package.json index 21f34935..cdc8f58f 100644 --- a/package.json +++ b/package.json @@ -33,7 +33,8 @@ }, "dependencies": { "cluster": "^0.7.7", - "express": "^4.16.4" + "express": "^4.16.4", + "remove-accents": "^0.4.2" }, "devDependencies": { "better-sqlite3": "^5.4.0", diff --git a/resources/whosonfirst/whosonfirst.js b/resources/whosonfirst/whosonfirst.js index b0b9983e..56539f44 100644 --- a/resources/whosonfirst/whosonfirst.js +++ b/resources/whosonfirst/whosonfirst.js @@ -44,6 +44,9 @@ function _normalize (cell, options) { if (options && options.lowercase) { value = value.toLowerCase() } + if (options && options.normalizer && typeof options.normalizer === 'function') { + value = options.normalizer(value) + } return value } diff --git a/test/address.fra.test.js b/test/address.fra.test.js index b7648971..52af9166 100644 --- a/test/address.fra.test.js +++ b/test/address.fra.test.js @@ -63,6 +63,22 @@ const testcase = (test, common) => { assert('Rue Jean Baptiste Clément', [ { street: 'Rue Jean Baptiste Clément' } ], true) + + assert('Mery Sur Oise', [ + { locality: 'Mery Sur Oise' } + ], true) + + assert('Méry Sur Oise', [ + { locality: 'Méry Sur Oise' } + ], true) + + assert('Méry-Sur-Oise', [ + { locality: 'Méry-Sur-Oise' } + ], true) + + assert('Mery-Sur-Oise', [ + { locality: 'Mery-Sur-Oise' } + ], true) } module.exports.all = (tape, common) => { diff --git a/tokenization/normalizer.js b/tokenization/normalizer.js new file mode 100644 index 00000000..137bea25 --- /dev/null +++ b/tokenization/normalizer.js @@ -0,0 +1,22 @@ +const removeAccents = require('remove-accents') + +function normalizer (options = {}) { + return (value) => { + value = value.trim() + if (options.lowercase) { + value = value.toLowerCase() + } + if (options.removeAccents) { + value = removeAccents(value) + } + if 
(options.removeHyphen) { + value = value.replace(/-/g, ' ') + } + if (options.removeSpaces) { + value = value.replace(/ /g, '') + } + return value + } +} + +module.exports = normalizer diff --git a/tokenization/normalizer.test.js b/tokenization/normalizer.test.js new file mode 100644 index 00000000..9de2416b --- /dev/null +++ b/tokenization/normalizer.test.js @@ -0,0 +1,68 @@ +const normalizer = require('./normalizer') + +module.exports.tests = {} + +module.exports.tests.normalizer = (test) => { + test('normalizer: hyphen', (t) => { + const value = ' Value-With-Some-Hyphen ' + const expected = 'Value With Some Hyphen' + const normalize = normalizer({ removeHyphen: true }) + + t.deepEquals(normalize(value), expected) + t.end() + }) + + test('normalizer: accents', (t) => { + const value = ' Vâlüé-Wìth-Sômê-Accents ' + const expected = 'Value-With-Some-Accents' + const normalize = normalizer({ removeAccents: true }) + + t.deepEquals(normalize(value), expected) + t.end() + }) + + test('normalizer: lowercase', (t) => { + const value = 'Value-With-Some-UpperCases' + const expected = 'value-with-some-uppercases' + const normalize = normalizer({ lowercase: true }) + + t.deepEquals(normalize(value), expected) + t.end() + }) + + test('normalizer: spaces', (t) => { + const value = 'Value With Some Spaces' + const expected = 'ValueWithSomeSpaces' + const normalize = normalizer({ removeSpaces: true }) + + t.deepEquals(normalize(value), expected) + t.end() + }) + + test('normalizer: option mix', (t) => { + const value = 'Vâlüé-Mìxèd' + const expected = 'value mixed' + const normalize = normalizer({ lowercase: true, removeHyphen: true, removeAccents: true }) + + t.deepEquals(normalize(value), expected) + t.end() + }) + + test('normalizer: no options', (t) => { + const value = 'Value-With-Some-Hyphen' + const normalize = normalizer() + + t.deepEquals(normalize(value), value) + t.end() + }) +} + +module.exports.all = (tape, common) => { + function test (name, testFunction) { + 
return tape(`normalizer: ${name}`, testFunction) + } + + for (var testCase in module.exports.tests) { + module.exports.tests[testCase](test, common) + } +}