diff --git a/javascript/src/html_processor.ts b/javascript/src/html_processor.ts
index de8865a8..f2ced89a 100644
--- a/javascript/src/html_processor.ts
+++ b/javascript/src/html_processor.ts
@@ -319,31 +319,18 @@ export class HTMLProcessor {
// No changes if whitespace-only.
if (/^\s*$/.test(text)) return;
- // Split the text into a list of phrases.
- const phrases = this.parser_.parse(text);
- assert(phrases.length > 0);
- assert(
- phrases.reduce((sum, phrase) => sum + phrase.length, 0) === text.length
- );
+ // Compute the phrase boundaries.
+ const boundaries = this.parser_.parseBoundaries(text);
// No changes if single phrase.
- if (phrases.length <= 1) return;
-
- // Compute the boundary indices from the list of phrase strings.
- const boundaries = [];
- let char_index = 0;
- for (const phrase of phrases) {
- assert(phrase.length > 0);
- char_index += phrase.length;
- boundaries.push(char_index);
- }
-
- // The break opportunity at the end of a block is not needed. Instead of
- // removing it, turn it to a sentinel for `splitTextNodesAtBoundaries` by
- // making it larger than the text length.
+ if (boundaries.length <= 0) return;
+ // The boundaries should be strictly ascending and lie between 1 and
+ // `text.length - 1`, inclusive.
assert(boundaries[0] > 0);
- assert(boundaries[boundaries.length - 1] === text.length);
- ++boundaries[boundaries.length - 1];
- assert(boundaries.length > 1);
+ assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
+ assert(boundaries[boundaries.length - 1] < text.length);
+
+ // Add a sentinel beyond the text length to simplify iteration.
+ boundaries.push(text.length + 1);
this.splitTextNodes(textNodes, boundaries);
this.applyBlockStyle(paragraph.element);
diff --git a/javascript/src/parser.ts b/javascript/src/parser.ts
index 4825e5f6..e8e4e537 100644
--- a/javascript/src/parser.ts
+++ b/javascript/src/parser.ts
@@ -59,9 +59,27 @@ export class Parser {
* @param sentence An input sentence.
* @returns The retrieved chunks.
*/
- parse(sentence: string) {
+ parse(sentence: string): string[] {
if (sentence === '') return [];
- const result = [sentence[0]];
+ const boundaries = this.parseBoundaries(sentence);
+ const result = [];
+ let start = 0;
+ for (const boundary of boundaries) {
+ result.push(sentence.slice(start, boundary));
+ start = boundary;
+ }
+ result.push(sentence.slice(start));
+ return result;
+ }
+
+ /**
+ * Parses the input sentence and returns the boundaries between chunks.
+ *
+ * @param sentence An input sentence.
+ * @returns The indices in `sentence` at which a new chunk begins.
+ */
+ parseBoundaries(sentence: string): number[] {
+ const result = [];
const baseScore =
-0.5 *
sum([...this.model.values()].map(group => [...group.values()]).flat());
@@ -81,8 +99,7 @@ export class Parser {
score += this.model.get('TW2')?.get(sentence.slice(i - 2, i + 1)) || 0;
score += this.model.get('TW3')?.get(sentence.slice(i - 1, i + 2)) || 0;
score += this.model.get('TW4')?.get(sentence.slice(i, i + 3)) || 0;
- if (score > 0) result.push('');
- result[result.length - 1] += sentence[i];
+ if (score > 0) result.push(i);
}
return result;
}