diff --git a/classification/ToponymClassification.js b/classification/ToponymClassification.js
new file mode 100644
index 00000000..b37854bb
--- /dev/null
+++ b/classification/ToponymClassification.js
@@ -0,0 +1,11 @@
+const Classification = require('./Classification')
+
+// classification carrying the 'toponym' label (assigned by ToponymClassifier)
+class ToponymClassification extends Classification {
+  constructor (confidence, meta) {
+    super(confidence, meta)
+    this.label = 'toponym'
+  }
+}
+
+module.exports = ToponymClassification
diff --git a/classification/ToponymClassification.test.js b/classification/ToponymClassification.test.js
new file mode 100644
index 00000000..2fd5bd4a
--- /dev/null
+++ b/classification/ToponymClassification.test.js
@@ -0,0 +1,24 @@
+const Classification = require('./ToponymClassification')
+
+module.exports.tests = {}
+
+module.exports.tests.constructor = (test) => {
+  test('constructor', (t) => {
+    let c = new Classification()
+    t.false(c.public)
+    t.equals(c.label, 'toponym')
+    t.equals(c.confidence, 1.0)
+    t.deepEqual(c.meta, {})
+    t.end()
+  })
+}
+
+module.exports.all = (tape, common) => {
+  function test (name, testFunction) {
+    return tape(`ToponymClassification: ${name}`, testFunction)
+  }
+
+  for (var testCase in module.exports.tests) {
+    module.exports.tests[testCase](test, common)
+  }
+}
diff --git a/classifier/ToponymClassifier.js b/classifier/ToponymClassifier.js
new file mode 100644
index 00000000..4f280ab5
--- /dev/null
+++ b/classifier/ToponymClassifier.js
@@ -0,0 +1,28 @@
+const WordClassifier = require('./super/WordClassifier')
+const ToponymClassification = require('../classification/ToponymClassification')
+const libpostal = require('../resources/libpostal/libpostal')
+
+// dictionaries sourced from the libpostal project
+// see: https://github.com/openvenues/libpostal
+
+// marks a span as a toponym when span.norm is an exact key of the index
+// populated from the 'toponyms.txt' dictionary (loaded for 'en' only)
+class ToponymClassifier extends WordClassifier {
+  setup () {
+    // load toponym tokens
+    this.index = {}
+    libpostal.load(this.index, ['en'], 'toponyms.txt')
+  }
+
+  each (span) {
+    // skip spans which contain numbers
+    if (span.contains.numerals) { return }
+
+    // use an inverted index for full token matching as it's O(1)
+    if (this.index.hasOwnProperty(span.norm)) {
+      span.classify(new ToponymClassification(1))
+    }
+  }
+}
+
+module.exports = ToponymClassifier
diff --git a/classifier/ToponymClassifier.test.js b/classifier/ToponymClassifier.test.js
new file mode 100644
index 00000000..1345dd0e
--- /dev/null
+++ b/classifier/ToponymClassifier.test.js
@@ -0,0 +1,57 @@
+const ToponymClassifier = require('./ToponymClassifier')
+const ToponymClassification = require('../classification/ToponymClassification')
+const Span = require('../tokenization/Span')
+const classifier = new ToponymClassifier()
+
+module.exports.tests = {}
+
+function classify (body) {
+  let s = new Span(body)
+  classifier.each(s, null, 1)
+  return s
+}
+
+module.exports.tests.contains_numerals = (test) => {
+  test('contains numerals: honours contains.numerals boolean', (t) => {
+    let s = new Span('example')
+    s.contains.numerals = true
+    classifier.each(s, null, 1)
+    t.deepEqual(s.classifications, {})
+    t.end()
+  })
+}
+
+module.exports.tests.single_character_tokens = (test) => {
+  test('index: does not contain single char tokens', (t) => {
+    t.false(Object.keys(classifier.index).some(token => token.length < 2))
+    t.end()
+  })
+}
+
+module.exports.tests.english_suffix = (test) => {
+  let valid = [
+    'md', 'maryland', 'ca',
+    'california', 'ia', 'nj'
+  ]
+
+  valid.forEach(token => {
+    test(`english toponyms: ${token}`, (t) => {
+      let s = classify(token)
+
+      t.deepEqual(s.classifications, {
+        ToponymClassification: new ToponymClassification(1)
+      })
+      t.end()
+    })
+  })
+}
+
+module.exports.all = (tape, common) => {
+  function test (name, testFunction) {
+    return tape(`ToponymClassifier: ${name}`, testFunction)
+  }
+
+  for (var testCase in module.exports.tests) {
+    module.exports.tests[testCase](test, common)
+  }
+}
diff --git a/classifier/scheme/street.js b/classifier/scheme/street.js
index fdf78356..88e65ba1 100644
--- a/classifier/scheme/street.js
+++ b/classifier/scheme/street.js
@@ -279,7 +279,7 @@ module.exports = [
     Class: StreetClassification,
     scheme: [
       {
-        is: ['RoadTypeClassification'],
+        is: ['RoadTypeClassification', 'ToponymClassification'],
         not: []
       },
       {
diff --git a/parser/AddressParser.js b/parser/AddressParser.js
index d0706317..a73df405 100644
--- a/parser/AddressParser.js
+++ b/parser/AddressParser.js
@@ -6,6 +6,7 @@ const PostcodeClassifier = require('../classifier/PostcodeClassifier')
 const StreetPrefixClassifier = require('../classifier/StreetPrefixClassifier')
 const StreetSuffixClassifier = require('../classifier/StreetSuffixClassifier')
 const RoadTypeClassifier = require('../classifier/RoadTypeClassifier')
+const ToponymClassifier = require('../classifier/ToponymClassifier')
 const CompoundStreetClassifier = require('../classifier/CompoundStreetClassifier')
 const DirectionalClassifier = require('../classifier/DirectionalClassifier')
 const OrdinalClassifier = require('../classifier/OrdinalClassifier')
@@ -46,6 +47,7 @@ class AddressParser extends Parser {
         new StreetPrefixClassifier(),
         new StreetSuffixClassifier(),
         new RoadTypeClassifier(),
+        new ToponymClassifier(),
         new CompoundStreetClassifier(),
         new DirectionalClassifier(),
         new OrdinalClassifier(),
diff --git a/test/address.usa.test.js b/test/address.usa.test.js
index 0c34be02..a26bd051 100644
--- a/test/address.usa.test.js
+++ b/test/address.usa.test.js
@@ -63,6 +63,13 @@ const testcase = (test, common) => {
   assert('1210a Highway 10 W IA', [{ housenumber: '1210a' }, { street: 'Highway 10 W' }, { region: 'IA' }], true)
   assert('1210a State Highway 10', [{ housenumber: '1210a' }, { street: 'State Highway 10' }], true)
   assert('1389a County Road 42 IA', [{ housenumber: '1389a' }, { street: 'County Road 42' }, { region: 'IA' }], true)
+  assert('CA 72', [{ street: 'CA 72' }], true)
+  assert('1210a IA 10 W IA', [{ housenumber: '1210a' }, { street: 'IA 10 W' }, { region: 'IA' }], true)
+  assert('1210a California 10', [{ housenumber: '1210a' }, { street: 'California 10' }], true)
+  assert('1389a IA 42 IA', [{ housenumber: '1389a' }, { street: 'IA 42' }, { region: 'IA' }], true)
+
+  // This does not work because of MD
+  // assert('1111 MD 760, Lusby, MD, USA', [{ housenumber: '1111' }, { street: 'MD 760' }, { locality: 'Lusby' }, { region: 'MD' }, { country: 'USA' }], true)
 }
 
 module.exports.all = (tape, common) => {