feat(token_position): add StartTokenClassification and rename FinalTokenClassification to EndTokenClassification (#41)
1 parent d7b8242 · commit 34ddc92
Showing 14 changed files with 197 additions and 135 deletions.
classification/EndTokenClassification.js
@@ -0,0 +1,10 @@
const Classification = require('./Classification')

class EndTokenClassification extends Classification {
  constructor (confidence, meta) {
    super(confidence, meta)
    this.label = 'end_token'
  }
}

module.exports = EndTokenClassification
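For orientation (not part of the diff): the new class follows the same pattern as the other Classification subclasses, so constructing it and reading back its label might look like the sketch below. The require path is an assumption based on the classification/ layout seen elsewhere in this commit.

// hypothetical usage sketch; the require path is assumed, not taken from this diff
const EndTokenClassification = require('./classification/EndTokenClassification')

let c = new EndTokenClassification(1.0)
console.log(c.label)      // 'end_token'
console.log(c.confidence) // 1.0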
classification/EndTokenSingleCharacterClassification.js
@@ -0,0 +1,10 @@
const Classification = require('./Classification')

class EndTokenSingleCharacterClassification extends Classification {
  constructor (confidence, meta) {
    super(confidence, meta)
    this.label = 'end_token_single_character'
  }
}

module.exports = EndTokenSingleCharacterClassification
24 changes: 24 additions & 0 deletions
classification/EndTokenSingleCharacterClassification.test.js
@@ -0,0 +1,24 @@
const Classification = require('./EndTokenSingleCharacterClassification')

module.exports.tests = {}

module.exports.tests.constructor = (test) => {
  test('constructor', (t) => {
    let c = new Classification()
    t.false(c.public)
    t.equals(c.label, 'end_token_single_character')
    t.equals(c.confidence, 1.0)
    t.deepEqual(c.meta, {})
    t.end()
  })
}

module.exports.all = (tape, common) => {
  function test (name, testFunction) {
    return tape(`EndTokenSingleCharacterClassification: ${name}`, testFunction)
  }

  for (var testCase in module.exports.tests) {
    module.exports.tests[testCase](test, common)
  }
}
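The test file exposes its suite through module.exports.all, following the tape-based convention used by the other test files in this commit. A minimal standalone runner sketch, assuming tape is installed and that common can be an empty object here:

// hedged runner sketch; the real repository wires suites up through its own test entry point
const tape = require('tape')
const common = {} // shared fixtures; assumed unused by this suite
require('./classification/EndTokenSingleCharacterClassification.test').all(tape, common)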
This file was deleted.
This file was deleted.
classification/StartTokenClassification.js
@@ -0,0 +1,10 @@
const Classification = require('./Classification')

class StartTokenClassification extends Classification {
  constructor (confidence, meta) {
    super(confidence, meta)
    this.label = 'start_token'
  }
}

module.exports = StartTokenClassification
This file was deleted.
This file was deleted.
classifier/TokenPositionClassifier.js
@@ -0,0 +1,39 @@
const BaseClassifier = require('./super/BaseClassifier')
const EndTokenClassification = require('../classification/EndTokenClassification')
const EndTokenSingleCharacterClassification = require('../classification/EndTokenSingleCharacterClassification')
const StartTokenClassification = require('../classification/StartTokenClassification')

// classify the final token with 'EndTokenClassification'
// and the first token with 'StartTokenClassification',
// and also an 'EndTokenSingleCharacterClassification' if it is only
// a single character in length.
// note: this can be useful for improving autocomplete.
// note: in the case of a single token, the span will be
// classified with more than one classification (it can be both start & end).

class TokenPositionClassifier extends BaseClassifier {
  classify (tokenizer) {
    if (tokenizer.section.length < 1) { return }

    // start token
    let firstSection = tokenizer.section[0]
    let firstSectionChildren = firstSection.graph.findAll('child')
    if (firstSectionChildren.length > 0) {
      let firstChild = firstSectionChildren[0]
      firstChild.classify(new StartTokenClassification(1.0))
    }

    // end token
    let lastSection = tokenizer.section[tokenizer.section.length - 1]
    let lastSectionChildren = lastSection.graph.findAll('child')
    if (lastSectionChildren.length > 0) {
      let lastChild = lastSectionChildren[lastSectionChildren.length - 1]
      lastChild.classify(new EndTokenClassification(1.0))
      if (lastChild.norm.length === 1) {
        lastChild.classify(new EndTokenSingleCharacterClassification(1.0))
      }
    }
  }
}

module.exports = TokenPositionClassifier
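To make the behaviour described in the comments concrete, here is a hedged sketch of driving the classifier directly, mirroring the classify() helper in the test file below; the require paths are assumptions based on that file's imports.

// sketch only: run the classifier over a tokenized input, as the tests below do
const TokenPositionClassifier = require('./classifier/TokenPositionClassifier')
const Tokenizer = require('./tokenization/Tokenizer')

let tokenizer = new Tokenizer('A BC, D')
new TokenPositionClassifier().classify(tokenizer)
// afterwards the first child span ('A') carries StartTokenClassification, and the
// last child span ('D') carries EndTokenClassification plus
// EndTokenSingleCharacterClassification, since it is a single character long.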
classifier/TokenPositionClassifier.test.js
@@ -0,0 +1,95 @@
const TokenPositionClassifier = require('./TokenPositionClassifier')
const Tokenizer = require('../tokenization/Tokenizer')

module.exports.tests = {}

function classify (body) {
  let c = new TokenPositionClassifier()
  let t = new Tokenizer(body)
  c.classify(t)

  // collect all the spans which received
  // a token position classification
  let classifications = {
    EndTokenClassification: [],
    EndTokenSingleCharacterClassification: [],
    StartTokenClassification: []
  }
  t.section.forEach(s => {
    s.graph.findAll('child').forEach(c => {
      if (c.classifications.hasOwnProperty('StartTokenClassification')) {
        classifications.StartTokenClassification.push(c)
      }
      if (c.classifications.hasOwnProperty('EndTokenClassification')) {
        classifications.EndTokenClassification.push(c)
      }
      if (c.classifications.hasOwnProperty('EndTokenSingleCharacterClassification')) {
        classifications.EndTokenSingleCharacterClassification.push(c)
      }
    })
  })
  return classifications
}

module.exports.tests.classify = (test) => {
  test('classify: empty string', (t) => {
    let c = classify('')
    t.equals(c.StartTokenClassification.length, 0)
    t.equals(c.EndTokenClassification.length, 0)
    t.equals(c.EndTokenSingleCharacterClassification.length, 0)
    t.end()
  })

  test('classify: A', (t) => {
    let c = classify('A')
    t.equals(c.StartTokenClassification.length, 1)
    t.equals(c.StartTokenClassification[0].body, 'A')
    t.equals(c.EndTokenClassification.length, 1)
    t.equals(c.EndTokenClassification[0].body, 'A')
    t.equals(c.EndTokenSingleCharacterClassification.length, 1)
    t.equals(c.EndTokenSingleCharacterClassification[0].body, 'A')
    t.end()
  })

  test('classify: A B', (t) => {
    let c = classify('A B')
    t.equals(c.StartTokenClassification.length, 1)
    t.equals(c.StartTokenClassification[0].body, 'A')
    t.equals(c.EndTokenClassification.length, 1)
    t.equals(c.EndTokenClassification[0].body, 'B')
    t.equals(c.EndTokenSingleCharacterClassification.length, 1)
    t.equals(c.EndTokenSingleCharacterClassification[0].body, 'B')
    t.end()
  })

  test('classify: A BC', (t) => {
    let c = classify('A BC')
    t.equals(c.StartTokenClassification.length, 1)
    t.equals(c.StartTokenClassification[0].body, 'A')
    t.equals(c.EndTokenClassification.length, 1)
    t.equals(c.EndTokenClassification[0].body, 'BC')
    t.equals(c.EndTokenSingleCharacterClassification.length, 0)
    t.end()
  })

  test('classify: A BC, D', (t) => {
    let c = classify('A BC, D')
    t.equals(c.StartTokenClassification.length, 1)
    t.equals(c.StartTokenClassification[0].body, 'A')
    t.equals(c.EndTokenClassification.length, 1)
    t.equals(c.EndTokenClassification[0].body, 'D')
    t.equals(c.EndTokenSingleCharacterClassification.length, 1)
    t.equals(c.EndTokenSingleCharacterClassification[0].body, 'D')
    t.end()
  })
}

module.exports.all = (tape, common) => {
  function test (name, testFunction) {
    return tape(`TokenPositionClassifier: ${name}`, testFunction)
  }

  for (var testCase in module.exports.tests) {
    module.exports.tests[testCase](test, common)
  }
}
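As the classifier's comments note, these classifications are aimed at autocomplete-style use cases. A purely illustrative downstream check (the helper name is hypothetical, not from this diff) could inspect a span's classifications the same way the tests above do:

// illustrative helper: true when a span is the final token and only one character long,
// i.e. likely still being typed; relies on the classifications map used in the tests above
function isPartialLastToken (span) {
  return span.classifications.hasOwnProperty('EndTokenClassification') &&
    span.classifications.hasOwnProperty('EndTokenSingleCharacterClassification')
}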