From e77bebbc03a9dfd4306293bb50a898cce483724e Mon Sep 17 00:00:00 2001
From: Peter Johnson
Date: Thu, 6 Jun 2019 15:24:12 +0200
Subject: [PATCH] feat: stricter_postcodes (#42)

* feat(stricter_postcodes): enforce stricter postcode parsing

* feat(stricter_postcodes): allow postcode in start position if it is the only token in its section
---
 classifier/PostcodeClassifier.js | 20 +++++++++++++++-----
 test/address.aus.test.js         | 19 +++++++++++++++++++
 test/address.usa.test.js         |  9 +++++++++
 3 files changed, 43 insertions(+), 5 deletions(-)
 create mode 100644 test/address.aus.test.js

diff --git a/classifier/PostcodeClassifier.js b/classifier/PostcodeClassifier.js
index 3a1a12eb..bf6e6192 100644
--- a/classifier/PostcodeClassifier.js
+++ b/classifier/PostcodeClassifier.js
@@ -1,4 +1,4 @@
-const fs = require('fs')
+// const fs = require('fs')
 const path = require('path')
 const WordClassifier = require('./super/WordClassifier')
 const PostcodeClassification = require('../classification/PostcodeClassification')
@@ -7,14 +7,15 @@ const dictPath = path.join(__dirname, `../resources/chromium-i18n/ssl-address`)
 // postcode data sourced from google-i18n project
 // see: https://chromium-i18n.appspot.com/ssl-address
 // note: reducing the list of country codes will have a performance benefit
-const countryCodes = fs.readdirSync(dictPath)
-  .filter(p => p.endsWith('.json'))
-  .map(p => p.split('.')[0])
+// const countryCodes = fs.readdirSync(dictPath)
+//   .filter(p => p.endsWith('.json'))
+//   .map(p => p.split('.')[0])
+const countryCodes = ['us', 'gb', 'fr', 'de', 'es', 'pt', 'au', 'nz', 'kr', 'jp', 'in', 'ru']
 
 class PostcodeClassifier extends WordClassifier {
   setup () {
     this.data = countryCodes.map(cc => {
-      let row = require(path.join(dictPath, `${cc}.json`))
+      let row = require(path.join(dictPath, `${cc.toUpperCase()}.json`))
       row.regex = new RegExp('^(' + row.zip + ')$', 'i')
       return row
     }).filter(row => !row.regex.test('100')) // remove countries with 3-digit postcodes
@@ -25,6 +26,15 @@ class PostcodeClassifier extends WordClassifier {
     // @todo: is this correct globally?
     if (!span.contains.numerals) { return }
 
+    // do not allow postcode in the start position unless it is the
+    // only token present in its section
+    if (
+      span.classifications.hasOwnProperty('StartTokenClassification') &&
+      (span.graph.length('prev') > 0 || span.graph.length('next') > 0)
+    ) {
+      return
+    }
+
     for (let i = 0; i < this.data.length; i++) {
       if (this.data[i].regex.test(span.norm)) {
         span.classify(new PostcodeClassification(1))
diff --git a/test/address.aus.test.js b/test/address.aus.test.js
new file mode 100644
index 00000000..5b0d2a29
--- /dev/null
+++ b/test/address.aus.test.js
@@ -0,0 +1,19 @@
+const AddressParser = require('../parser/AddressParser')
+
+const testcase = (test, common) => {
+  let parser = new AddressParser()
+  let assert = common.assert.bind(null, test, parser)
+
+  assert('6000, NSW, Australia', [
+    { postcode: '6000' },
+    { region: 'NSW' }, { country: 'Australia' }
+  ], true)
+}
+
+module.exports.all = (tape, common) => {
+  function test (name, testFunction) {
+    return tape(`address AUS: ${name}`, testFunction)
+  }
+
+  testcase(test, common)
+}
diff --git a/test/address.usa.test.js b/test/address.usa.test.js
index 9b55be0b..aa41ed1d 100644
--- a/test/address.usa.test.js
+++ b/test/address.usa.test.js
@@ -21,6 +21,15 @@ const testcase = (test, common) => {
     { housenumber: '1900' }, { street: 'SE F ST' },
     { locality: 'SAN FRANCISCO' }
   ], true)
+
+  // postcode allowed in first position when only 1 token
+  assert('90210', [{ postcode: '90210' }], true)
+
+  // postcode allowed in first position when only 1 token in section
+  assert('90210, CA', [{ postcode: '90210' }, { region: 'CA' }], true)
+
+  // postcode not allowed in first position otherwise
+  assert('90210 Foo', [])
 }
 
 module.exports.all = (tape, common) => {