From e66738bc9fe3a21d7aea613a0ba536ccd09f7df3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s?=
Date: Fri, 8 Mar 2019 08:31:34 +0100
Subject: [PATCH] block-serialization-default-parser: set up auto-generated API docs (#14280)

---
 bin/update-readmes.js |  2 +-
 .../README.md         | 45 ++++++++---
 .../src/index.js      | 77 +++++++++++++++++++
 3 files changed, 112 insertions(+), 12 deletions(-)

diff --git a/bin/update-readmes.js b/bin/update-readmes.js
index a9acb6c9a1de6..b43b4fbd515e0 100755
--- a/bin/update-readmes.js
+++ b/bin/update-readmes.js
@@ -10,7 +10,7 @@ const packages = [
 	'blob',
 	'block-editor',
 	'block-library',
-	//'block-serialization-default-parser',
+	'block-serialization-default-parser',
 	'blocks',
 	'compose',
 	//'data',
diff --git a/packages/block-serialization-default-parser/README.md b/packages/block-serialization-default-parser/README.md
index 254d8f7bdcd19..d64533b36b341 100644
--- a/packages/block-serialization-default-parser/README.md
+++ b/packages/block-serialization-default-parser/README.md
@@ -12,9 +12,20 @@ npm install @wordpress/block-serialization-default-parser --save
 
 _This package assumes that your code will run in an **ES2015+** environment. If you're using an environment that has limited or no support for ES2015+ such as lower versions of IE then using [core-js](https://github.com/zloirock/core-js) or [@babel/polyfill](https://babeljs.io/docs/en/next/babel-polyfill) will add support for these methods. Learn more about it in [Babel docs](https://babeljs.io/docs/en/next/caveats)._
 
-## Usage
+## API
+
+<!-- START TOKEN(Autogenerated API docs) -->
+
+### parse
+
+[src/index.js#L150-L162](src/index.js#L150-L162)
+
+Parser function, that converts input HTML into a block based structure.
+
+**Usage**
 
 Input post:
+
 ```html
 <!-- wp:columns {"columns":3} -->
 <div class="wp-block-columns has-3-columns">
@@ -36,6 +47,7 @@ Input post:
 ```
 
 Parsing code:
+
 ```js
 import { parse } from '@wordpress/block-serialization-default-parser';
 
@@ -84,6 +96,17 @@ parse( post ) === [
 ];
 ```
 
+**Parameters**
+
+- **doc** `string`: The HTML document to parse.
+
+**Returns**
+
+`Array`: A block-based representation of the input HTML.
+
+
+<!-- END TOKEN(Autogenerated API docs) -->
+
 ## Theory
 
 ### What is different about this one from the spec-parser?
@@ -98,26 +121,26 @@ Every serialized Gutenberg document is nominally an HTML document which, in addi
 
 This parser attempts to create a state-machine around the transitions triggered from those delimiters -- the "tokens" of the grammar. Every time we find one we should only be doing either of:
 
- - enter a new block;
- - exit out of a block.
+- enter a new block;
+- exit out of a block.
 
 Those actions have different effects depending on the context; for instance, when we exit a block we either need to add it to the output block list _or_ we need to append it as the next `innerBlock` on the parent block below it in the block stack (the place where we track open blocks). The details are documented below.
 
 The biggest challenge in this parser is making the right accounting of indices required to construct the `innerHTML` values for each block at every level of nesting depth. We take a simple approach:
 
- - Start each newly opened block with an empty `innerHTML`.
- - Whenever we push a first block into the `innerBlocks` list, add the content from where the content of the parent block started to where this inner block starts.
- - Whenever we push another block into the `innerBlocks` list, add the content from where the previous inner block ended to where this inner block starts.
- - When we close out an open block, add the content from where the last inner block ended to where the closing block delimiter starts.
- - If there are no inner blocks then we take the entire content between the opening and closing block comment delimiters as the `innerHTML`.
+- Start each newly opened block with an empty `innerHTML`.
+- Whenever we push a first block into the `innerBlocks` list, add the content from where the content of the parent block started to where this inner block starts.
+- Whenever we push another block into the `innerBlocks` list, add the content from where the previous inner block ended to where this inner block starts.
+- When we close out an open block, add the content from where the last inner block ended to where the closing block delimiter starts.
+- If there are no inner blocks then we take the entire content between the opening and closing block comment delimiters as the `innerHTML`.
 
 ### I meant, how does it perform?
 
 This parser operates much faster than the generated parser from the specification. Because we know more about the parsing than the PEG does we can take advantage of several tricks to improve our speed and memory usage:
 
- - We only have one or two distinct tokens, depending on how you look at it, and they are all readily matched via a regular expression. Instead of parsing on a character-per-character basis we can allow the PCRE RegExp engine to skip over large swaths of the document for us in order to find those tokens.
- - Since `preg_match()` takes an `offset` parameter we can crawl through the input without passing copies of the input text on every step. We can track our position in the string and only pass a number instead.
- - Not copying all those strings means that we'll also skip many memory allocations.
+- We only have one or two distinct tokens, depending on how you look at it, and they are all readily matched via a regular expression. Instead of parsing on a character-per-character basis we can allow the PCRE RegExp engine to skip over large swaths of the document for us in order to find those tokens.
+- Since `preg_match()` takes an `offset` parameter we can crawl through the input without passing copies of the input text on every step. We can track our position in the string and only pass a number instead.
+- Not copying all those strings means that we'll also skip many memory allocations.
 
 Further, tokenizing with a RegExp brings an additional advantage. The parser generated by the PEG provides predictable performance characteristics in exchange for control over tokenization rules -- it doesn't allow us to define RegExp patterns in the rules so as to guard against _e.g._ cataclysmic backtracking that would break the PEG guarantees.
 
diff --git a/packages/block-serialization-default-parser/src/index.js b/packages/block-serialization-default-parser/src/index.js
index 489f1f76bb1bb..e885e57af6269 100644
--- a/packages/block-serialization-default-parser/src/index.js
+++ b/packages/block-serialization-default-parser/src/index.js
@@ -70,6 +70,83 @@ function Frame( block, tokenStart, tokenLength, prevOffset, leadingHtmlStart ) {
 	};
 }
 
+/**
+ * Parser function, that converts input HTML into a block based structure.
+ *
+ * @param {string} doc The HTML document to parse.
+ *
+ * @example
+ * Input post:
+ * ```html
+ * <!-- wp:columns {"columns":3} -->
+ * <div class="wp-block-columns has-3-columns">
+ * <!-- wp:column -->
+ * <div class="wp-block-column"><!-- wp:paragraph -->
+ * <p>Left</p>
+ * <!-- /wp:paragraph --></div>
+ * <!-- /wp:column -->
+ * <!-- wp:column -->
+ * <div class="wp-block-column"><!-- wp:paragraph -->
+ * <p>Middle</p>
+ * <!-- /wp:paragraph --></div>
+ * <!-- /wp:column -->
+ * <!-- wp:column -->
+ * <div class="wp-block-column"></div>
+ * <!-- /wp:column -->
+ * </div>
+ * <!-- /wp:columns -->
+ * ```
+ *
+ * Parsing code:
+ * ```js
+ * import { parse } from '@wordpress/block-serialization-default-parser';
+ *
+ * parse( post ) === [
+ *     {
+ *         blockName: "core/columns",
+ *         attrs: {
+ *             columns: 3
+ *         },
+ *         innerBlocks: [
+ *             {
+ *                 blockName: "core/column",
+ *                 attrs: null,
+ *                 innerBlocks: [
+ *                     {
+ *                         blockName: "core/paragraph",
+ *                         attrs: null,
+ *                         innerBlocks: [],
+ *                         innerHTML: "\n<p>Left</p>\n"
+ *                     }
+ *                 ],
+ *                 innerHTML: '\n<div class="wp-block-column"></div>\n'
+ *             },
+ *             {
+ *                 blockName: "core/column",
+ *                 attrs: null,
+ *                 innerBlocks: [
+ *                     {
+ *                         blockName: "core/paragraph",
+ *                         attrs: null,
+ *                         innerBlocks: [],
+ *                         innerHTML: "\n<p>Middle</p>\n"
+ *                     }
+ *                 ],
+ *                 innerHTML: '\n<div class="wp-block-column"></div>\n'
+ *             },
+ *             {
+ *                 blockName: "core/column",
+ *                 attrs: null,
+ *                 innerBlocks: [],
+ *                 innerHTML: '\n<div class="wp-block-column"></div>\n'
+ *             }
+ *         ],
+ *         innerHTML: '\n<div class="wp-block-columns has-3-columns">\n\n\n\n</div>\n'
+ *     }
+ * ];
+ * ```
+ * @return {Array} A block-based representation of the input HTML.
+ */
 export const parse = ( doc ) => {
 	document = doc;
 	offset = 0;
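
The performance notes in the README hunk above lean on two ideas: the grammar has essentially one token, the block comment delimiter, and a regular expression can find the next token from a numeric offset instead of slicing the input. The JavaScript sketch below illustrates only that tokenization idea; it is not the package's implementation. The pattern is simplified (the real parser's expression also copes with leading freeform HTML, void self-closing blocks such as `<!-- wp:foo /-->`, and attribute edge cases), and `simpleTokenize` is a hypothetical name used only here.

```js
// Illustrative sketch only -- a reduced tokenizer in the spirit of the
// approach described above, not the parser's actual regular expression.
function simpleTokenize( doc ) {
	// The `g` flag makes `exec()` resume from `lastIndex`, the JS counterpart
	// of passing an `offset` to `preg_match()`: we hand the engine a number,
	// never a copy of the remaining document.
	const delimiter = /<!--\s+(\/)?wp:([a-z][a-z0-9_-]*\/)?([a-z][a-z0-9_-]*)\s+(\{.*?\})?\s*(\/)?-->/g;
	const tokens = [];
	let match;

	while ( ( match = delimiter.exec( doc ) ) !== null ) {
		const [ matched, closer, namespace, name, attrs, voidMarker ] = match;
		tokens.push( {
			type: voidMarker ? 'void-block' : closer ? 'block-closer' : 'block-opener',
			blockName: ( namespace || 'core/' ) + name,
			attrs: attrs ? JSON.parse( attrs ) : null,
			tokenStart: match.index,
			tokenLength: matched.length,
		} );
	}

	return tokens;
}

// Two delimiters, two tokens: an opener and a closer for core/paragraph.
simpleTokenize( '<!-- wp:paragraph -->\n<p>Left</p>\n<!-- /wp:paragraph -->' );
```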
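The "Implementation" bullet list in the README hunk is essentially a recipe: keep a stack of frames for open blocks, and assemble each `innerHTML` from index ranges rather than incremental string copies. As a companion to the tokenizer sketch above, here is a minimal, hypothetical `miniParse` that follows that recipe for well-formed, fully balanced input; it ignores freeform HTML, void blocks, and error recovery, all of which the real parser handles. Traced against the columns example in the patch, it reproduces the documented `innerHTML` values.

```js
// Illustrative sketch only: the innerHTML bookkeeping described above,
// assuming balanced opener/closer tokens from simpleTokenize().
function miniParse( doc ) {
	const output = [];
	const stack = []; // one frame per currently open block

	for ( const token of simpleTokenize( doc ) ) {
		if ( token.type === 'block-opener' ) {
			// Start each newly opened block with an empty innerHTML.
			stack.push( {
				block: { blockName: token.blockName, attrs: token.attrs, innerBlocks: [], innerHTML: '' },
				openerStart: token.tokenStart,
				prevOffset: token.tokenStart + token.tokenLength,
			} );
			continue;
		}

		// block-closer: the frame on top of the stack is finished.
		const frame = stack.pop();
		// Content from where the last inner block ended (or from the opening
		// delimiter) up to this closing delimiter belongs to this block.
		frame.block.innerHTML += doc.substring( frame.prevOffset, token.tokenStart );

		if ( stack.length === 0 ) {
			// No parent on the stack: add it to the output block list.
			output.push( frame.block );
			continue;
		}

		// Otherwise append it as the next innerBlock of its parent: credit the
		// parent with the HTML between its previous position and the child's
		// opening delimiter, then skip past the child entirely.
		const parent = stack[ stack.length - 1 ];
		parent.block.innerHTML += doc.substring( parent.prevOffset, frame.openerStart );
		parent.prevOffset = token.tokenStart + token.tokenLength;
		parent.block.innerBlocks.push( frame.block );
	}

	return output;
}
```

Only numeric offsets move between steps; substrings are cut once, when a block closes, which is the copy- and allocation-avoidance the performance section describes.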
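And a small usage example of the real `parse` export, following the output shape the patch documents (verify against your installed version):

```js
import { parse } from '@wordpress/block-serialization-default-parser';

const blocks = parse( '<!-- wp:paragraph --><p>Hello</p><!-- /wp:paragraph -->' );

// Per the documented output shape:
// blocks[ 0 ].blockName   === 'core/paragraph'
// blocks[ 0 ].attrs       === null            (no JSON in the delimiter)
// blocks[ 0 ].innerBlocks -> []
// blocks[ 0 ].innerHTML   === '<p>Hello</p>'
```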