Skip to content

Commit

Permalink
feat: Supports for hyphen as alternative spans (#56)
Browse files Browse the repository at this point in the history
## Background

Sometimes parsing fails when words are not split well. A hyphen's main purpose is to glue words together. That means that when a hyphen is used, we can treat it like a simple space in order to get two separate words.

Unfortunately, simply treating every hyphen as a space cannot be the final solution, because the hyphen is also meaningful in some other cases.

That's why I suggest taking advantage of our graphs and adding alternative paths that complete a phrase without hyphens.

## How does it work?

When we split each section, we first split on whitespace only (as before) and then run a second pass that splits on hyphens as well.

For example, splitting the section `10 Boulevard Saint-Germain Paris` on whitespace gives `10`, `Boulevard`, `Saint-Germain`, `Paris`. Here is the graph:

![step1](https://user-images.githubusercontent.com/5153882/63770799-3472b500-c8d6-11e9-8ffd-953af4b0f59e.png)

With the hyphen step, we will have `10`, `Boulevard`, `Saint-Germain`, `Paris`, `Saint`, `Germain`

![step2](https://user-images.githubusercontent.com/5153882/63770925-83204f00-c8d6-11e9-94c6-357f8aa48b06.png)

Thanks to this, we will be able to parse phrases such as:
- `10 Boulevard Saint-Germain Paris`: which is `housenumber` + `street` (first solution without this PR  👎)
- `10 Boulevard Saint-Germains Paris`: which is `housenumber` + `street` + `locality` (first solution with this PR 👍)
  • Loading branch information
Joxit authored Sep 15, 2019
1 parent d5126ca commit b643e0b
Show file tree
Hide file tree
Showing 12 changed files with 242 additions and 62 deletions.
4 changes: 2 additions & 2 deletions classifier/CompositeClassifier.js
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ class CompositeClassifier extends SectionClassifier {
let prev = c[i - 1]

// enforce adjacency
if (next && curr.graph.findOne('child:last').graph.findOne('next') !== next.graph.findOne('child:first')) {
if (next && !curr.graph.findOne('child:last').graph.some('next', s => s === next.graph.findOne('child:first'))) {
return false
} else if (prev && curr.graph.findOne('child:first').graph.findOne('prev') !== prev.graph.findOne('child:last')) {
} else if (prev && !curr.graph.findOne('child:first').graph.some('prev', s => s === prev.graph.findOne('child:last'))) {
return false
}

Expand Down
16 changes: 9 additions & 7 deletions classifier/TokenPositionClassifier.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,21 @@ class TokenPositionClassifier extends BaseClassifier {
let firstSection = tokenizer.section[0]
let firstSectionChildren = firstSection.graph.findAll('child')
if (firstSectionChildren.length > 0) {
let firstChild = firstSectionChildren[0]
firstChild.classify(new StartTokenClassification(1.0))
firstSectionChildren.filter(s => !s.graph.findOne('prev')).forEach(firstChild => {
firstChild.classify(new StartTokenClassification(1.0))
})
}

// end token
let lastSection = tokenizer.section[tokenizer.section.length - 1]
let lastSectionChildren = lastSection.graph.findAll('child')
if (lastSectionChildren.length > 0) {
let lastChild = lastSectionChildren[lastSectionChildren.length - 1]
lastChild.classify(new EndTokenClassification(1.0))
if (lastChild.norm.length === 1) {
lastChild.classify(new EndTokenSingleCharacterClassification(1.0))
}
lastSectionChildren.filter(s => !s.graph.findOne('next')).forEach(lastChild => {
lastChild.classify(new EndTokenClassification(1.0))
if (lastChild.norm.length === 1) {
lastChild.classify(new EndTokenSingleCharacterClassification(1.0))
}
})
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion classifier/scheme/street.js
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ module.exports = [
},
{
// Rue Saint Anne
confidence: 0.81,
confidence: 0.91,
Class: StreetClassification,
scheme: [
{
Expand Down
4 changes: 4 additions & 0 deletions test/address.fra.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ const testcase = (test, common) => {
assert(`Rue de l'Adjudant Réau Paris`, [
{ street: `Rue de l'Adjudant Réau` }, { locality: 'Paris' }
])

assert(`10 Boulevard Saint-Germains Paris`, [
{ housenumber: '10' }, { street: `Boulevard Saint-Germains` }, { locality: 'Paris' }
])
}

module.exports.all = (tape, common) => {
Expand Down
9 changes: 9 additions & 0 deletions tokenization/Span.js
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,12 @@ class Span {
}

module.exports = Span
/**
 * Links each of the given spans to its immediate neighbours through its graph:
 * the preceding span is added under 'prev' and the following span under 'next'.
 *
 * Accepts either var-args (`connectSiblings(a, b, c)`) or one Array
 * (`connectSiblings([a, b, c])`).
 *
 * @param {...*} spans - the spans to connect, in sibling order, or a single Array of them
 * @returns {Array} the list that was connected (the very same Array when one was passed in)
 */
module.exports.connectSiblings = (...spans) => {
  // Unwrap the single-Array calling convention
  const siblings = (spans[0] instanceof Array) ? spans[0] : spans

  siblings.forEach((current, index) => {
    const before = siblings[index - 1]
    const after = siblings[index + 1]
    // The first span gets no 'prev' edge and the last span gets no 'next' edge
    if (before) { current.graph.add('prev', before) }
    if (after) { current.graph.add('next', after) }
  })

  return siblings
}
27 changes: 27 additions & 0 deletions tokenization/Span.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,33 @@ module.exports.tests.setPhrases = (test) => {
})
}

module.exports.tests.connectSiblings = (test) => {
  // Checks the 'next'/'prev' edges of three spans expected to be connected
  // in the order first -> middle -> last, with nothing before first or after last.
  const assertConnected = (t, first, middle, last) => {
    t.deepEquals(first.graph.findOne('next'), middle)
    t.notOk(first.graph.findOne('prev'))
    t.deepEquals(middle.graph.findOne('next'), last)
    t.deepEquals(middle.graph.findOne('prev'), first)
    t.notOk(last.graph.findOne('next'))
    t.deepEquals(last.graph.findOne('prev'), middle)
  }

  // Calling convention 1: a single Array of spans
  test('connectSiblings - array list', (t) => {
    const list = [new Span('A'), new Span('B'), new Span('C')]
    Span.connectSiblings(list)
    assertConnected(t, list[0], list[1], list[2])
    t.end()
  })

  // Calling convention 2: spans passed as separate arguments
  test('connectSiblings - list of items', (t) => {
    const first = new Span('A')
    const middle = new Span('B')
    const last = new Span('C')
    Span.connectSiblings(first, middle, last)
    assertConnected(t, first, middle, last)
    t.end()
  })
}

module.exports.all = (tape, common) => {
function test (name, testFunction) {
return tape(`Span: ${name}`, testFunction)
Expand Down
10 changes: 7 additions & 3 deletions tokenization/Tokenizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class Tokenizer {
split () {
for (let i = 0; i < this.section.length; i++) {
this.section[i].setChildren(split(this.section[i], funcs.fieldsFuncWhiteSpace))
this.section[i].setChildren(split(this.section[i], funcs.fieldsFuncHyphenOrWhiteSpace))
}
}

Expand All @@ -31,12 +32,15 @@ class Tokenizer {
}
}

computeCoverageRec (sum, curr) {
if (!curr) { return sum }
return this.computeCoverageRec(sum + curr.end - curr.start, curr.graph.findOne('next'))
}

computeCoverage () {
this.coverage = 0
this.section.forEach(s => {
this.coverage += s.graph.findAll('child').reduce(
(sum, cur) => sum + cur.end - cur.start, 0
)
this.coverage += this.computeCoverageRec(0, s.graph.findOne('child'))
}, this)
}
}
Expand Down
16 changes: 16 additions & 0 deletions tokenization/Tokenizer.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,22 @@ module.exports.tests.split = (test) => {
t.equals(tok.section[3].graph.findAll('child')[0].body, 'USA')
t.end()
})
test('split: hyphen', (t) => {
  const tokenizer = new Tokenizer('20 Boulevard Saint-Germain, Paris, France')

  // children of the section at `index`, looked up afresh on every call
  const childrenOf = (index) => tokenizer.section[index].graph.findAll('child')
  const isSpan = (child) => child.constructor.name === 'Span'

  t.true(tokenizer.section.every(section => section.graph.findAll('child').every(isSpan)))
  t.equals(tokenizer.section.length, 3)

  // first section: the whitespace-separated words come first, followed by
  // the two halves of the hyphenated word
  const firstSectionBodies = ['20', 'Boulevard', 'Saint-Germain', 'Saint', 'Germain']
  t.equals(childrenOf(0).length, 5)
  firstSectionBodies.forEach((body, position) => {
    t.equals(childrenOf(0)[position].body, body)
  })

  // the remaining sections hold a single word each
  t.equals(childrenOf(1).length, 1)
  t.equals(childrenOf(1)[0].body, 'Paris')
  t.equals(childrenOf(2).length, 1)
  t.equals(childrenOf(2)[0].body, 'France')
  t.end()
})
}

module.exports.tests.permute = (test) => {
Expand Down
86 changes: 51 additions & 35 deletions tokenization/permutate.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,47 +11,63 @@ const JOIN_CHAR = ' '
ported: https://github.com/pelias/placeholder/blob/master/lib/permutations.js
**/

function permutate (spans, windowMin, windowMax) {
let permutations = []
function permutateRec (prevSpan, s, windowCur, windowMin, windowMax, permutations) {
// Stops when the window is reached
if (windowCur > windowMax) {
return
}
// Create new span base on the previous and the next one
let span = new Span(prevSpan.body + (prevSpan.body.length > 0 ? JOIN_CHAR : '') + s.body, prevSpan.start)
// Add all children from the previous span to the new one, they will have the same ones + the next one
// Add to all children from the previous span the new span as parent + the next one
prevSpan.graph.findAll('child').forEach(child => {
span.graph.add('child', child)
child.graph.add('parent', span)
})
span.graph.add('child', s)
s.graph.add('parent', span)

// favour larger tokens over shorter ones
for (let i = 0; i < spans.length; i++) {
for (let j = i + windowMax; j >= i + windowMin; j--) {
if (j <= spans.length) {
if (j > i) {
let span = new Span()
for (let k = i; k < j; k++) {
let s = spans[k]
span.setBody(span.body += s.body)
span.graph.add('child', s)
if (k === i) { span.graph.add('child:first', s) }
if (k === j - 1) { span.graph.add('child:last', s) }
s.graph.add('parent', span)
let isFirst = span.body === s.body
let isLast = !s.graph.findOne('next')

// join with delim
if (k < j - 1) {
span.body += JOIN_CHAR
}
// If span is the first one, s is the first child, otherwise we take the first child of the previous span
if (isFirst) {
span.graph.add('child:first', s)
} else {
span.graph.add('child:first', prevSpan.graph.findOne('child:first'))
}

// update spans
if (i === k) {
span.start = s.start
span.end = s.end
} else {
if (s.start < span.start) {
span.start = s.start
}
if (s.end > span.end) {
span.end = s.end
}
}
}
permutations.push(span)
}
}
span.graph.add('child:last', s)

if (isFirst) {
span.start = s.start
span.end = s.end
} else {
if (s.start < span.start) {
span.start = s.start
}
if (s.end > span.end) {
span.end = s.end
}
}

// go through the graph recursively, check all next spans
if (!isLast) {
s.graph.findAll('next').forEach(next => {
permutateRec(span, next, windowCur + 1, windowMin, windowMax, permutations)
})
}

if (windowMin <= windowCur) {
permutations.push(span)
}
}

/**
 * Builds the permutations that can be formed starting from each given span.
 *
 * Every span seeds one call to `permutateRec`, starting from a fresh empty
 * Span with a window size of 1; all calls append to the same result list.
 *
 * @param {Array} spans - the spans to start from
 * @param {number} windowMin - smallest window size for which a permutation is recorded
 * @param {number} windowMax - largest window size that is explored
 * @returns {Array} a new Array with every permutation that was produced
 */
function permutate (spans, windowMin, windowMax) {
  const collected = []
  for (const start of spans) {
    permutateRec(new Span(), start, 1, windowMin, windowMax, collected)
  }
  return collected
}

Expand Down
Loading

0 comments on commit b643e0b

Please sign in to comment.