Skip to content

Commit

Permalink
parseBoundaries
Browse files Browse the repository at this point in the history
  • Loading branch information
kojiishi committed Aug 12, 2023
1 parent e89b3e5 commit e05af81
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 27 deletions.
33 changes: 10 additions & 23 deletions javascript/src/html_processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -319,31 +319,18 @@ export class HTMLProcessor {
// No changes if whitespace-only.
if (/^\s*$/.test(text)) return;

// Split the text into a list of phrases.
const phrases = this.parser_.parse(text);
assert(phrases.length > 0);
assert(
phrases.reduce((sum, phrase) => sum + phrase.length, 0) === text.length
);
// Compute the phrase boundaries.
const boundaries = this.parser_.parseBoundaries(text);
// No changes if single phrase.
if (phrases.length <= 1) return;

// Compute the boundary indices from the list of phrase strings.
const boundaries = [];
let char_index = 0;
for (const phrase of phrases) {
assert(phrase.length > 0);
char_index += phrase.length;
boundaries.push(char_index);
}

// The break opportunity at the end of a block is not needed. Instead of
// removing it, turn it to a sentinel for `splitTextNodesAtBoundaries` by
// making it larger than the text length.
if (boundaries.length <= 0) return;
// The boundaries should be between 1 and `text.length - 1`, in strictly
// ascending order.
assert(boundaries[0] > 0);
assert(boundaries[boundaries.length - 1] === text.length);
++boundaries[boundaries.length - 1];
assert(boundaries.length > 1);
assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
assert(boundaries[boundaries.length - 1] < text.length);

// Add a sentinel larger than the text length to simplify iteration.
boundaries.push(text.length + 1);

this.splitTextNodes(textNodes, boundaries);
this.applyBlockStyle(paragraph.element);
Expand Down
25 changes: 21 additions & 4 deletions javascript/src/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,27 @@ export class Parser {
* @param sentence An input sentence.
* @returns The retrieved chunks.
*/
parse(sentence: string) {
parse(sentence: string): string[] {
if (sentence === '') return [];
const result = [sentence[0]];
const boundaries = this.parseBoundaries(sentence);
const result = [];
let start = 0;
for (const boundary of boundaries) {
result.push(sentence.slice(start, boundary));
start = boundary;
}
result.push(sentence.slice(start));
return result;
}

/**
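* Parses the input sentence and returns the indices at which it should be
* split into chunks.
*
* @param sentence An input sentence.
* @returns The list of boundaries, as indices into `sentence`; each one marks
*     the position where a new chunk starts.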
*/
parseBoundaries(sentence: string): number[] {
const result = [];
const baseScore =
-0.5 *
sum([...this.model.values()].map(group => [...group.values()]).flat());
Expand All @@ -81,8 +99,7 @@ export class Parser {
score += this.model.get('TW2')?.get(sentence.slice(i - 2, i + 1)) || 0;
score += this.model.get('TW3')?.get(sentence.slice(i - 1, i + 2)) || 0;
score += this.model.get('TW4')?.get(sentence.slice(i, i + 3)) || 0;
if (score > 0) result.push('');
result[result.length - 1] += sentence[i];
if (score > 0) result.push(i);
}
return result;
}
Expand Down

0 comments on commit e05af81

Please sign in to comment.