Skip to content

Commit

Permalink
Avoid inserting separators to where the source has one (#342)
Browse files Browse the repository at this point in the history
* Avoid inserting separators to where the source has one

This patch changes the `HtmlProcessor` not to insert separators
(`<wbr>`, `&ZeroWidthSpace;`, or whatever the caller specified) to where
the source has `<wbr>` or `&ZeroWidthSpace;`.

This is to avoid inserting separators multiple times to the same place
when `HtmlProcessor` is applied multiple times.

* tushuhei review
  • Loading branch information
kojiishi authored Oct 25, 2023
1 parent b0a4748 commit 65dbb4a
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 37 deletions.
68 changes: 63 additions & 5 deletions javascript/src/html_processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ import {win} from './win.js';

const assert = console.assert;

const ZWSP = '\u200B'; // U+200B ZERO WIDTH SPACE
const ZWSP_CODEPOINT = 0x200b; // U+200B ZERO WIDTH SPACE
const ZWSP = String.fromCharCode(ZWSP_CODEPOINT);

// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
// but we define the same here for Node.js environments.
Expand All @@ -35,6 +36,7 @@ const DomAction = {
Skip: 2, // Skip the content. The content before and after are connected.
Break: 3, // A forced break. The content before and after become paragraphs.
NoBreak: 4, // The content provides context, but it's not breakable.
BreakOpportunity: 5, // Force a break opportunity.
} as const;
type DomAction = (typeof DomAction)[keyof typeof DomAction];

Expand Down Expand Up @@ -76,6 +78,7 @@ const domActions: {[name: string]: DomAction} = {
// https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3
BR: DomAction.Break,
RT: DomAction.Skip,
WBR: DomAction.BreakOpportunity,

// Form controls
// https://html.spec.whatwg.org/multipage/rendering.html#form-controls
Expand Down Expand Up @@ -198,9 +201,10 @@ function actionForElement(element: Element): DomAction {
*
* A {@link string} provides the context for the parser, but it can't be split.
*/
export class NodeOrText {
class NodeOrText {
nodeOrText: Text | string;
chunks: string[] = [];
hasBreakOpportunityAfter = false;

constructor(nodeOrText: Text | string) {
this.nodeOrText = nodeOrText;
Expand Down Expand Up @@ -254,6 +258,7 @@ export class NodeOrText {
node.replaceWith(...nodes);
}
}
export class NodeOrTextForTesting extends NodeOrText {}

/**
* Represents a "paragraph", broken by block boundaries or forced breaks.
Expand All @@ -277,7 +282,54 @@ class Paragraph {
get text(): string {
return this.nodes.map(node => node.text).join('');
}

get lastNode(): NodeOrText | undefined {
return this.nodes.length ? this.nodes[this.nodes.length - 1] : undefined;
}
setHasBreakOpportunityAfter() {
const lastNode = this.lastNode;
if (lastNode) lastNode.hasBreakOpportunityAfter = true;
}

/**
* @returns Indices of forced break opportunities in the source.
* They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
*/
getForcedOpportunities(): number[] {
const opportunities: number[] = [];
let len = 0;
for (const node of this.nodes) {
if (node.canSplit) {
const text = node.text;
if (text) {
for (let i = 0; i < text.length; ++i) {
if (text.charCodeAt(i) === ZWSP_CODEPOINT) {
opportunities.push(len + i + 1);
}
}
}
}
len += node.length;
if (node.hasBreakOpportunityAfter) {
opportunities.push(len);
}
}
return opportunities;
}

/**
* @returns Filtered {@param boundaries} by excluding
* {@link getForcedOpportunities} if it's not empty.
* Otherwise {@param boundaries}.
*/
excludeForcedOpportunities(boundaries: number[]): number[] {
const forcedOpportunities = this.getForcedOpportunities();
if (!forcedOpportunities.length) return boundaries;
const set = new Set<number>(forcedOpportunities);
return boundaries.filter(i => !set.has(i));
}
}
export class ParagraphForTesting extends Paragraph {}

/**
* Options for {@link HTMLProcessor}.
Expand Down Expand Up @@ -366,15 +418,19 @@ export class HTMLProcessor {

const action = actionForElement(element);
if (action === DomAction.Skip) return;

if (action === DomAction.Break) {
if (parent && !parent.isEmpty()) {
parent.setHasBreakOpportunityAfter();
yield parent;
parent.nodes = [];
}
assert(!element.firstChild);
return;
}
if (action === DomAction.BreakOpportunity) {
if (parent) parent.setHasBreakOpportunityAfter();
return;
}

// Determine if this element creates a new inline formatting context, or if
// this element belongs to the parent inline formatting context.
Expand Down Expand Up @@ -432,10 +488,12 @@ export class HTMLProcessor {
assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
assert(boundaries[boundaries.length - 1] < text.length);

const adjustedBoundaries = paragraph.excludeForcedOpportunities(boundaries);

// Add a sentinel to help iterating.
boundaries.push(text.length + 1);
adjustedBoundaries.push(text.length + 1);

this.splitNodes(paragraph.nodes, boundaries);
this.splitNodes(paragraph.nodes, adjustedBoundaries);
this.applyBlockStyle(paragraph.element);
}

Expand Down
111 changes: 79 additions & 32 deletions javascript/src/tests/test_html_processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ import {
HTMLProcessingParser,
HTMLProcessor,
HTMLProcessorOptions,
NodeOrText,
NodeOrTextForTesting,
ParagraphForTesting,
} from '../html_processor.js';
import {win} from '../win.js';
import {parseFromString, setInnerHtml} from '../dom.js';
Expand All @@ -32,6 +33,13 @@ class MockHTMLProcessorBase extends HTMLProcessor {
}
}

function getBlocks(html: string): IterableIterator<ParagraphForTesting> {
const document = win.document;
setInnerHtml(document.body, html);
const processor = new MockHTMLProcessorBase();
return processor.getBlocks(document.body);
}

describe('HTMLProcessor.applyToElement', () => {
const document = win.document;
const wbr = document.createElement('wbr');
Expand Down Expand Up @@ -117,84 +125,108 @@ describe('HTMLProcessor.applyToElement.separator.node', () => {
});

describe('HTMLProcessor.getBlocks', () => {
const getBlocks = (html: string) => {
const document = win.document;
setInnerHtml(document.body, html);
const processor = new MockHTMLProcessorBase();
const blocks = processor.getBlocks(document.body);
const texts = Array.from(
function getText(html: string): string[] {
const blocks = getBlocks(html);
return Array.from(
(function* (blocks) {
for (const block of blocks) yield block.text;
})(blocks)
);
return texts;
};
}

it('should collect all text of a simple block', () => {
expect(getBlocks('<div>123</div>')).toEqual(['123']);
expect(getText('<div>123</div>')).toEqual(['123']);
});

it('should collect two blocks separately', () => {
expect(getBlocks('<div>123</div><div>456</div>')).toEqual(['123', '456']);
expect(getText('<div>123</div><div>456</div>')).toEqual(['123', '456']);
});

it('should break at <br> elements', () => {
expect(getBlocks('<div>123<br>456</div>')).toEqual(['123', '456']);
expect(getText('<div>123<br>456</div>')).toEqual(['123', '456']);
});

it('should break at <br> elements inside a span', () => {
expect(getBlocks('<div>1<span>23<br>45</span>6</div>')).toEqual([
expect(getText('<div>1<span>23<br>45</span>6</div>')).toEqual([
'123',
'456',
]);
});

it('should collect inline boxes as part of the block', () => {
expect(getBlocks('<div>123<span>456</span>789</div>')).toEqual([
'123456789',
]);
expect(getText('<div>123<span>456</span>789</div>')).toEqual(['123456789']);
});

it('should collect nested blocks separately from the parent block', () => {
expect(getBlocks('<div>123<div>456</div>789</div>')).toEqual([
expect(getText('<div>123<div>456</div>789</div>')).toEqual([
'456',
'123789',
]);
});

it('should collect inline-blocks separately from the parent block', () => {
expect(
getBlocks('<div>123<div style="display: inline-block">456</div>789</div>')
getText('<div>123<div style="display: inline-block">456</div>789</div>')
).toEqual(['456', '123789']);
expect(
getBlocks(
'<div>123<span style="display: inline-block">456</span>789</div>'
)
getText('<div>123<span style="display: inline-block">456</span>789</div>')
).toEqual(['456', '123789']);
});

it('should skip textarea elements', () => {
expect(getBlocks('<textarea>123</textarea>')).toEqual([]);
expect(getText('<textarea>123</textarea>')).toEqual([]);
});

it('should skip <rt> and <rp> elements for <ruby>', () => {
expect(
getBlocks('before<ruby>b1<rp>(</rp><rt>r1</rt>b2<rt>r2</rt></ruby>after')
getText('before<ruby>b1<rp>(</rp><rt>r1</rt>b2<rt>r2</rt></ruby>after')
).toEqual(['beforeb1b2after']);
});

it('should use the built-in rules if the `display` property is empty', () => {
expect(getBlocks('<div>123<span>456</span></div>')).toEqual(['123456']);
expect(getBlocks('<div>123<div>456</div></div>')).toEqual(['456', '123']);
expect(getBlocks('<div><h1>123</h1><li>456</li></div>')).toEqual([
expect(getText('<div>123<span>456</span></div>')).toEqual(['123456']);
expect(getText('<div>123<div>456</div></div>')).toEqual(['456', '123']);
expect(getText('<div><h1>123</h1><li>456</li></div>')).toEqual([
'123',
'456',
]);
});
});

describe('HTMLProcessor.forcedOpportunities', () => {
function forcedOpportunities(html: string) {
const blocks = getBlocks(html);
return Array.from(
(function* (blocks) {
for (const block of blocks) {
yield {
indices: block.getForcedOpportunities(),
after: block.nodes.map(block => block.hasBreakOpportunityAfter),
};
}
})(blocks)
);
}

it('<wbr> should set has_break_opportunity_after', () => {
expect(forcedOpportunities('123<wbr>456')).toEqual([
{indices: [3], after: [true, false]},
]);
});
it('Nested <wbr> should set has_break_opportunity_after', () => {
expect(forcedOpportunities('123<span><wbr></span>456')).toEqual([
{indices: [3], after: [true, false]},
]);
});
it('ZWSP should be in forcedOpportunities', () => {
expect(forcedOpportunities('123<span>\u200B456</span>')).toEqual([
{indices: [4], after: [false, false]},
]);
});
});

describe('HTMLProcessor.splitNodes', () => {
class MockNode extends NodeOrText {
class MockNode extends NodeOrTextForTesting {
constructor(text: string) {
super(text);
}
Expand Down Expand Up @@ -298,12 +330,11 @@ describe('HTMLProcessingParser.applyElement', () => {
const expectedElement = expectedDocument.querySelector('p') as HTMLElement;
expect(inputDocument.isEqualNode(expectedElement)).toBeTrue();
};
const style = 'word-break: keep-all; overflow-wrap: anywhere;';

it('should insert WBR tags where the sentence should break.', () => {
const inputHTML = '<p>xyzabcabc</p>';
const expectedHTML = `
<p style="word-break: keep-all; overflow-wrap: anywhere;"
>xyz<wbr>abc<wbr>abc</p>`;
const expectedHTML = `<p style="${style}">xyz<wbr>abc<wbr>abc</p>`;
const model = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
Expand All @@ -312,8 +343,24 @@ describe('HTMLProcessingParser.applyElement', () => {

it('should insert WBR tags even it overlaps with other HTML tags.', () => {
const inputHTML = '<p>xy<a href="#">zabca</a>bc</p>';
const expectedHTML = `<p style="word-break: keep-all; overflow-wrap: anywhere;"
>xy<a href="#">z<wbr>abc<wbr>a</a>bc</p>`;
const expectedHTML = `<p style="${style}">xy<a href="#">z<wbr>abc<wbr>a</a>bc</p>`;
const model = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
checkEqual(model, inputHTML, expectedHTML);
});

it('should not insert WBR tags to where input has WBR tags.', () => {
const inputHTML = '<p>xyz<wbr>abcabc</p>';
const expectedHTML = `<p style="${style}">xyz<wbr>abc<wbr>abc</p>`;
const model = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
checkEqual(model, inputHTML, expectedHTML);
});
it('should not insert WBR tags to where input has ZWSP.', () => {
const inputHTML = '<p>xyz\u200Babcabc</p>';
const expectedHTML = `<p style="${style}">xyz\u200babc<wbr>abc</p>`;
const model = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
Expand Down

0 comments on commit 65dbb4a

Please sign in to comment.