From 94c9b4a6afc64b73884636e86c0e86471e64c59f Mon Sep 17 00:00:00 2001 From: Jones Magloire Date: Wed, 5 Jun 2019 14:16:25 +0200 Subject: [PATCH] feat(normalizer): Add normalizer for WOF which can be configured to remove accents/hyphen/spaces and do lowercase (#33) --- classifier/WhosOnFirstClassifier.js | 7 ++- package.json | 3 +- resources/whosonfirst/whosonfirst.js | 3 ++ test/address.fra.test.js | 16 +++++++ tokenization/normalizer.js | 22 +++++++++ tokenization/normalizer.test.js | 68 ++++++++++++++++++++++++++++ 6 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 tokenization/normalizer.js create mode 100644 tokenization/normalizer.test.js diff --git a/classifier/WhosOnFirstClassifier.js b/classifier/WhosOnFirstClassifier.js index dfe1e14c..06086f4d 100644 --- a/classifier/WhosOnFirstClassifier.js +++ b/classifier/WhosOnFirstClassifier.js @@ -5,6 +5,7 @@ const CountryClassification = require('../classification/CountryClassification') const RegionClassification = require('../classification/RegionClassification') const LocalityClassification = require('../classification/LocalityClassification') const whosonfirst = require('../resources/whosonfirst/whosonfirst') +const normalize = require('../tokenization/normalizer')({ lowercase: true, removeHyphen: true, removeAccents: true }) // databases sourced from the WhosOnFirst project // see: https://whosonfirst.org @@ -35,7 +36,8 @@ class WhosOnFirstClassifier extends PhraseClassifier { Object.keys(placetypes).forEach(placetype => { this.tokens[placetype] = new Set() whosonfirst.load(this.tokens[placetype], [placetype], placetypes[placetype].files, { - minlength: 2 + minlength: 2, + normalizer: normalize }) // general blacklist @@ -102,8 +104,9 @@ class WhosOnFirstClassifier extends PhraseClassifier { return } + const normalizedSpan = normalize(span.norm) Object.keys(placetypes).forEach(placetype => { - if (this.tokens[placetype].has(span.norm)) { + if (this.tokens[placetype].has(normalizedSpan)) { 
// do not classify tokens if they already have a 'StopWordClassification' if ( span.classifications.hasOwnProperty('StopWordClassification') || ( diff --git a/package.json b/package.json index 21f34935..cdc8f58f 100644 --- a/package.json +++ b/package.json @@ -33,7 +33,8 @@ }, "dependencies": { "cluster": "^0.7.7", - "express": "^4.16.4" + "express": "^4.16.4", + "remove-accents": "^0.4.2" }, "devDependencies": { "better-sqlite3": "^5.4.0", diff --git a/resources/whosonfirst/whosonfirst.js b/resources/whosonfirst/whosonfirst.js index b0b9983e..56539f44 100644 --- a/resources/whosonfirst/whosonfirst.js +++ b/resources/whosonfirst/whosonfirst.js @@ -44,6 +44,9 @@ function _normalize (cell, options) { if (options && options.lowercase) { value = value.toLowerCase() } + if (options && options.normalizer && typeof options.normalizer === 'function') { + value = options.normalizer(value) + } return value } diff --git a/test/address.fra.test.js b/test/address.fra.test.js index b7648971..52af9166 100644 --- a/test/address.fra.test.js +++ b/test/address.fra.test.js @@ -63,6 +63,22 @@ const testcase = (test, common) => { assert('Rue Jean Baptiste Clément', [ { street: 'Rue Jean Baptiste Clément' } ], true) + + assert('Mery Sur Oise', [ + { locality: 'Mery Sur Oise' } + ], true) + + assert('Méry Sur Oise', [ + { locality: 'Méry Sur Oise' } + ], true) + + assert('Méry-Sur-Oise', [ + { locality: 'Méry-Sur-Oise' } + ], true) + + assert('Mery-Sur-Oise', [ + { locality: 'Mery-Sur-Oise' } + ], true) } module.exports.all = (tape, common) => { diff --git a/tokenization/normalizer.js b/tokenization/normalizer.js new file mode 100644 index 00000000..137bea25 --- /dev/null +++ b/tokenization/normalizer.js @@ -0,0 +1,22 @@ +const removeAccents = require('remove-accents') + +function normalizer (options = {}) { + return (value) => { + value = value.trim() + if (options.lowercase) { + value = value.toLowerCase() + } + if (options.removeAccents) { + value = removeAccents(value) + } + if 
(options.removeHyphen) { + value = value.replace(/-/g, ' ') + } + if (options.removeSpaces) { + value = value.replace(/ /g, '') + } + return value + } +} + +module.exports = normalizer diff --git a/tokenization/normalizer.test.js b/tokenization/normalizer.test.js new file mode 100644 index 00000000..9de2416b --- /dev/null +++ b/tokenization/normalizer.test.js @@ -0,0 +1,68 @@ +const normalizer = require('./normalizer') + +module.exports.tests = {} + +module.exports.tests.normalizer = (test) => { + test('normalizer: hyphen', (t) => { + const value = ' Value-With-Some-Hyphen ' + const expected = 'Value With Some Hyphen' + const normalize = normalizer({ removeHyphen: true }) + + t.deepEquals(normalize(value), expected) + t.end() + }) + + test('normalizer: accents', (t) => { + const value = ' Vâlüé-Wìth-Sômê-Accents ' + const expected = 'Value-With-Some-Accents' + const normalize = normalizer({ removeAccents: true }) + + t.deepEquals(normalize(value), expected) + t.end() + }) + + test('normalizer: lowercase', (t) => { + const value = 'Value-With-Some-UpperCases' + const expected = 'value-with-some-uppercases' + const normalize = normalizer({ lowercase: true }) + + t.deepEquals(normalize(value), expected) + t.end() + }) + + test('normalizer: spaces', (t) => { + const value = 'Value With Some Spaces' + const expected = 'ValueWithSomeSpaces' + const normalize = normalizer({ removeSpaces: true }) + + t.deepEquals(normalize(value), expected) + t.end() + }) + + test('normalizer: option mix', (t) => { + const value = 'Vâlüé-Mìxèd' + const expected = 'value mixed' + const normalize = normalizer({ lowercase: true, removeHyphen: true, removeAccents: true }) + + t.deepEquals(normalize(value), expected) + t.end() + }) + + test('normalizer: no options', (t) => { + const value = 'Value-With-Some-Hyphen' + const normalize = normalizer() + + t.deepEquals(normalize(value), value) + t.end() + }) +} + +module.exports.all = (tape, common) => { + function test (name, testFunction) { + 
return tape(`normalizer: ${name}`, testFunction) + } + + for (var testCase in module.exports.tests) { + module.exports.tests[testCase](test, common) + } +}