Skip to content

Commit

Permalink
feat(normalizer): Add normalizer for WOF which can be configured to r…
Browse files Browse the repository at this point in the history
…emove accents/hyphen/spaces and do lowercase (#33)
  • Loading branch information
Joxit authored and missinglink committed Jun 5, 2019
1 parent 7f05b8f commit 94c9b4a
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 3 deletions.
7 changes: 5 additions & 2 deletions classifier/WhosOnFirstClassifier.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ const CountryClassification = require('../classification/CountryClassification')
const RegionClassification = require('../classification/RegionClassification')
const LocalityClassification = require('../classification/LocalityClassification')
const whosonfirst = require('../resources/whosonfirst/whosonfirst')
const normalize = require('../tokenization/normalizer')({ lowercase: true, removeHyphen: true, removeAccents: true })

// databases sourced from the WhosOnFirst project
// see: https://whosonfirst.org
Expand Down Expand Up @@ -35,7 +36,8 @@ class WhosOnFirstClassifier extends PhraseClassifier {
Object.keys(placetypes).forEach(placetype => {
this.tokens[placetype] = new Set()
whosonfirst.load(this.tokens[placetype], [placetype], placetypes[placetype].files, {
minlength: 2
minlength: 2,
normalizer: normalize
})

// general blacklist
Expand Down Expand Up @@ -102,8 +104,9 @@ class WhosOnFirstClassifier extends PhraseClassifier {
return
}

const normalizedSpan = normalize(span.norm)
Object.keys(placetypes).forEach(placetype => {
if (this.tokens[placetype].has(span.norm)) {
if (this.tokens[placetype].has(normalizedSpan)) {
// do not classify tokens if they already have a 'StopWordClassification'
if (
span.classifications.hasOwnProperty('StopWordClassification') || (
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
},
"dependencies": {
"cluster": "^0.7.7",
"express": "^4.16.4"
"express": "^4.16.4",
"remove-accents": "^0.4.2"
},
"devDependencies": {
"better-sqlite3": "^5.4.0",
Expand Down
3 changes: 3 additions & 0 deletions resources/whosonfirst/whosonfirst.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ function _normalize (cell, options) {
if (options && options.lowercase) {
value = value.toLowerCase()
}
if (options && options.normalizer && typeof options.normalizer === 'function') {
value = options.normalizer(value)
}
return value
}

Expand Down
16 changes: 16 additions & 0 deletions test/address.fra.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,22 @@ const testcase = (test, common) => {
assert('Rue Jean Baptiste Clément', [
{ street: 'Rue Jean Baptiste Clément' }
], true)

assert('Mery Sur Oise', [
{ locality: 'Mery Sur Oise' }
], true)

assert('Méry Sur Oise', [
{ locality: 'Méry Sur Oise' }
], true)

assert('Méry-Sur-Oise', [
{ locality: 'Méry-Sur-Oise' }
], true)

assert('Mery-Sur-Oise', [
{ locality: 'Mery-Sur-Oise' }
], true)
}

module.exports.all = (tape, common) => {
Expand Down
22 changes: 22 additions & 0 deletions tokenization/normalizer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
const removeAccents = require('remove-accents')

/**
 * Build a string normalizer configured by `options`.
 *
 * The returned function always trims its input, then applies each enabled
 * transformation in a fixed order: lowercase, accent removal, hyphen
 * replacement (hyphen -> single space), space removal.
 *
 * @param {Object} [options={}]
 * @param {boolean} [options.lowercase] lowercase the value
 * @param {boolean} [options.removeAccents] strip accents via `remove-accents`
 * @param {boolean} [options.removeHyphen] replace every '-' with a space
 * @param {boolean} [options.removeSpaces] delete every space character
 * @returns {function(string): string} the configured normalizer
 */
function normalizer (options = {}) {
  return (value) => {
    const trimmed = value.trim()
    const cased = options.lowercase ? trimmed.toLowerCase() : trimmed
    const unaccented = options.removeAccents ? removeAccents(cased) : cased
    // hyphens are turned into spaces first, so removeSpaces (below) also drops them
    const unhyphenated = options.removeHyphen ? unaccented.replace(/-/g, ' ') : unaccented
    return options.removeSpaces ? unhyphenated.replace(/ /g, '') : unhyphenated
  }
}

module.exports = normalizer
68 changes: 68 additions & 0 deletions tokenization/normalizer.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
const normalizer = require('./normalizer')

module.exports.tests = {}

// Unit tests for the normalizer factory: each case builds a normalizer with a
// single option (or a mix of options) and checks the resulting string.
module.exports.tests.normalizer = (test) => {
  // hyphens are replaced by spaces; the padded input also exercises trimming
  test('normalizer: hyphen', (t) => {
    const value = ' Value-With-Some-Hyphen '
    const expected = 'Value With Some Hyphen'
    const normalize = normalizer({ removeHyphen: true })

    t.deepEquals(normalize(value), expected)
    t.end()
  })

  // accents are stripped while hyphens and casing are left untouched
  test('normalizer: accents', (t) => {
    const value = ' Vâlüé-Wìth-Sômê-Accents '
    const expected = 'Value-With-Some-Accents'
    const normalize = normalizer({ removeAccents: true })

    t.deepEquals(normalize(value), expected)
    t.end()
  })

  test('normalizer: lowercase', (t) => {
    const value = 'Value-With-Some-UpperCases'
    const expected = 'value-with-some-uppercases'
    const normalize = normalizer({ lowercase: true })

    t.deepEquals(normalize(value), expected)
    t.end()
  })

  test('normalizer: spaces', (t) => {
    const value = 'Value With Some Spaces'
    const expected = 'ValueWithSomeSpaces'
    const normalize = normalizer({ removeSpaces: true })

    t.deepEquals(normalize(value), expected)
    t.end()
  })

  // same option set as the one passed by the WhosOnFirst classifier
  test('normalizer: option mix', (t) => {
    const value = 'Vâlüé-Mìxèd'
    const expected = 'value mixed'
    const normalize = normalizer({ lowercase: true, removeHyphen: true, removeAccents: true })

    t.deepEquals(normalize(value), expected)
    t.end()
  })

  // with no options the value is returned unchanged (apart from trimming)
  test('normalizer: no options', (t) => {
    const value = 'Value-With-Some-Hyphen'
    const normalize = normalizer()

    t.deepEquals(normalize(value), value)
    t.end()
  })
}

// Run every suite registered on `module.exports.tests`, handing each one a
// `test` function that prefixes its label with the module name.
module.exports.all = (tape, common) => {
  const prefixedTest = (name, testFunction) => tape(`normalizer: ${name}`, testFunction)

  for (const suiteName in module.exports.tests) {
    module.exports.tests[suiteName](prefixedTest, common)
  }
}

0 comments on commit 94c9b4a

Please sign in to comment.