From e05af818c6cb270ac321a39bf7b8f2fb4c42a08e Mon Sep 17 00:00:00 2001 From: Koji Ishii Date: Sat, 12 Aug 2023 18:16:46 +0900 Subject: [PATCH] parseBoundaries --- javascript/src/html_processor.ts | 33 ++++++++++---------------------- javascript/src/parser.ts | 25 ++++++++++++++++++++---- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/javascript/src/html_processor.ts b/javascript/src/html_processor.ts index de8865a8..f2ced89a 100644 --- a/javascript/src/html_processor.ts +++ b/javascript/src/html_processor.ts @@ -319,31 +319,18 @@ export class HTMLProcessor { // No changes if whitespace-only. if (/^\s*$/.test(text)) return; - // Split the text into a list of phrases. - const phrases = this.parser_.parse(text); - assert(phrases.length > 0); - assert( - phrases.reduce((sum, phrase) => sum + phrase.length, 0) === text.length - ); + // Compute the phrase boundaries. + const boundaries = this.parser_.parseBoundaries(text); // No changes if single phrase. - if (phrases.length <= 1) return; - - // Compute the boundary indices from the list of phrase strings. - const boundaries = []; - let char_index = 0; - for (const phrase of phrases) { - assert(phrase.length > 0); - char_index += phrase.length; - boundaries.push(char_index); - } - - // The break opportunity at the end of a block is not needed. Instead of - // removing it, turn it to a sentinel for `splitTextNodesAtBoundaries` by - // making it larger than the text length. + if (boundaries.length <= 0) return; + // The boundaries should be between 1 and `text.length - 1` in the + // ascending order. assert(boundaries[0] > 0); - assert(boundaries[boundaries.length - 1] === text.length); - ++boundaries[boundaries.length - 1]; - assert(boundaries.length > 1); + assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1])); + assert(boundaries[boundaries.length - 1] < text.length); + + // Add a sentinel to help iterating. + boundaries.push(text.length + 1); this.splitTextNodes(textNodes, boundaries); this.applyBlockStyle(paragraph.element); diff --git a/javascript/src/parser.ts b/javascript/src/parser.ts index 4825e5f6..e8e4e537 100644 --- a/javascript/src/parser.ts +++ b/javascript/src/parser.ts @@ -59,9 +59,27 @@ export class Parser { * @param sentence An input sentence. * @returns The retrieved chunks. */ - parse(sentence: string) { + parse(sentence: string): string[] { if (sentence === '') return []; - const result = [sentence[0]]; + const boundaries = this.parseBoundaries(sentence); + const result = []; + let start = 0; + for (const boundary of boundaries) { + result.push(sentence.slice(start, boundary)); + start = boundary; + } + result.push(sentence.slice(start)); + return result; + } + + /** + * Parses the input sentence and returns a list of boundaries. + * + * @param sentence An input sentence. + * @returns The list of boundaries. + */ + parseBoundaries(sentence: string): number[] { + const result = []; const baseScore = -0.5 * sum([...this.model.values()].map(group => [...group.values()]).flat()); @@ -81,8 +99,7 @@ export class Parser { score += this.model.get('TW2')?.get(sentence.slice(i - 2, i + 1)) || 0; score += this.model.get('TW3')?.get(sentence.slice(i - 1, i + 2)) || 0; score += this.model.get('TW4')?.get(sentence.slice(i, i + 3)) || 0; - if (score > 0) result.push(''); - result[result.length - 1] += sentence[i]; + if (score > 0) result.push(i); } return result; }