feat: stricter_postcodes (#42)

* feat(stricter_postcodes): enforce stricter postcode parsing
* feat(stricter_postcodes): allow postcode in start position if it is the only token in its section
pelias · Jun 6, 2019 · e77bebb · e77bebb
1 parent 34ddc92
commit e77bebb
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 5 deletions.
diff --git a/classifier/PostcodeClassifier.js b/classifier/PostcodeClassifier.js
@@ -1,4 +1,4 @@
-const fs = require('fs')
+// const fs = require('fs')
 const path = require('path')
 const WordClassifier = require('./super/WordClassifier')
 const PostcodeClassification = require('../classification/PostcodeClassification')
@@ -7,14 +7,15 @@ const dictPath = path.join(__dirname, `../resources/chromium-i18n/ssl-address`)
 // postcode data sourced from google-i18n project
 // see: https://chromium-i18n.appspot.com/ssl-address
 // note: reducing the list of country codes will have a performance benefit
-const countryCodes = fs.readdirSync(dictPath)
-  .filter(p => p.endsWith('.json'))
-  .map(p => p.split('.')[0])
+// const countryCodes = fs.readdirSync(dictPath)
+//   .filter(p => p.endsWith('.json'))
+//   .map(p => p.split('.')[0])
+const countryCodes = ['us', 'gb', 'fr', 'de', 'es', 'pt', 'au', 'nz', 'kr', 'jp', 'in', 'ru']
 
 class PostcodeClassifier extends WordClassifier {
   setup () {
     this.data = countryCodes.map(cc => {
-      let row = require(path.join(dictPath, `${cc}.json`))
+      let row = require(path.join(dictPath, `${cc.toUpperCase()}.json`))
       row.regex = new RegExp('^(' + row.zip + ')$', 'i')
       return row
     }).filter(row => !row.regex.test('100')) // remove countries with 3-digit postcodes
@@ -25,6 +26,15 @@ class PostcodeClassifier extends WordClassifier {
     // @todo: is this correct globally?
     if (!span.contains.numerals) { return }
 
+    // do not allow postcode in the start position unless it is the
+    // only token present in its section
+    if (
+      span.classifications.hasOwnProperty('StartTokenClassification') &&
+      (span.graph.length('prev') > 0 || span.graph.length('next') > 0)
+    ) {
+      return
+    }
+
     for (let i = 0; i < this.data.length; i++) {
       if (this.data[i].regex.test(span.norm)) {
         span.classify(new PostcodeClassification(1))

diff --git a/test/address.aus.test.js b/test/address.aus.test.js
@@ -0,0 +1,19 @@
+const AddressParser = require('../parser/AddressParser')
+
+const testcase = (test, common) => {
+  let parser = new AddressParser()
+  let assert = common.assert.bind(null, test, parser)
+
+  assert('6000, NSW, Australia', [
+    { postcode: '6000' },
+    { region: 'NSW' }, { country: 'Australia' }
+  ], true)
+}
+
+module.exports.all = (tape, common) => {
+  function test (name, testFunction) {
+    return tape(`address AUS: ${name}`, testFunction)
+  }
+
+  testcase(test, common)
+}
diff --git a/test/address.usa.test.js b/test/address.usa.test.js
@@ -21,6 +21,15 @@ const testcase = (test, common) => {
     { housenumber: '1900' }, { street: 'SE F ST' },
     { locality: 'SAN FRANCISCO' }
   ], true)
+
+  // postcode allowed in first position when only 1 token
+  assert('90210', [{ postcode: '90210' }], true)
+
+  // postcode allowed in first position when only 1 token in section
+  assert('90210, CA', [{ postcode: '90210' }, { region: 'CA' }], true)
+
+  // postcode not allowed in first position otherwise
+  assert('90210 Foo', [])
 }
 
 module.exports.all = (tape, common) => {