From dfd1de09afbb5b3c4c7a959b20a1f5fa7011aa46 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 30 Jul 2024 18:44:45 +0000 Subject: [PATCH] HTML API: Add TEMPLATE and related support in HTML Processor. As part of work to add more spec support to the HTML API, this patch adds support for the IN TEMPLATE and IN HEAD insertion modes. These changes are primarily about adding support for TEMPLATE elements in the HTML Processor, but include support for other tags commonly found in the document head, such as LINK, META, SCRIPT, STYLE, and TITLE. Developed in https://github.com/wordpress/wordpress-develop/pull/7046 Discussed in https://core.trac.wordpress.org/ticket/61576 Props: dmsnell, jonsurrell, westonruter. See #61576. git-svn-id: https://develop.svn.wordpress.org/trunk@58833 602fd350-edb4-49c9-b593-d223f7449a82 --- .../html-api/class-wp-html-open-elements.php | 45 ++- .../html-api/class-wp-html-processor.php | 320 +++++++++++++++++- .../html-api/wpHtmlProcessorBreadcrumbs.php | 9 +- .../html-api/wpHtmlProcessorHtml5lib.php | 27 +- 4 files changed, 384 insertions(+), 17 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-open-elements.php b/src/wp-includes/html-api/class-wp-html-open-elements.php index 1162a267f9c9b..c760009ce0c28 100644 --- a/src/wp-includes/html-api/class-wp-html-open-elements.php +++ b/src/wp-includes/html-api/class-wp-html-open-elements.php @@ -308,7 +308,20 @@ public function has_element_in_scope( string $tag_name ): bool { 'MARQUEE', 'OBJECT', 'TEMPLATE', - // @todo: Support SVG and MathML nodes when support for foreign content is added. + + /* + * @todo Support SVG and MathML nodes when support for foreign content is added. + * + * - MathML mi + * - MathML mo + * - MathML mn + * - MathML ms + * - MathML mtext + * - MathML annotation-xml + * - SVG foreignObject + * - SVG desc + * - SVG title + */ ) ); } @@ -349,7 +362,20 @@ public function has_element_in_list_item_scope( string $tag_name ): bool { 'OL', 'TEMPLATE', 'UL', - // @todo: Support SVG and MathML nodes when support for foreign content is added. + + /* + * @todo Support SVG and MathML nodes when support for foreign content is added. + * + * - MathML mi + * - MathML mo + * - MathML mn + * - MathML ms + * - MathML mtext + * - MathML annotation-xml + * - SVG foreignObject + * - SVG desc + * - SVG title + */ ) ); } @@ -386,7 +412,20 @@ public function has_element_in_button_scope( string $tag_name ): bool { 'MARQUEE', 'OBJECT', 'TEMPLATE', - // @todo: Support SVG and MathML nodes when support for foreign content is added. + + /* + * @todo Support SVG and MathML nodes when support for foreign content is added. + * + * - MathML mi + * - MathML mo + * - MathML mn + * - MathML ms + * - MathML mtext + * - MathML annotation-xml + * - SVG foreignObject + * - SVG desc + * - SVG title + */ ) ); } diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index f8653022454b6..9f2662c9e4c48 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1040,7 +1040,7 @@ private function step_before_head(): bool { * This internal function performs the 'in head' insertion mode * logic for the generalized WP_HTML_Processor::step() function. * - * @since 6.7.0 Stub implementation. + * @since 6.7.0 * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * @@ -1050,7 +1050,211 @@ private function step_before_head(): bool { * @return bool Whether an element was found. */ private function step_in_head(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD . ' state.' ); + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = parent::is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + */ + if ( '#text' === $op ) { + $text = $this->get_modifiable_text(); + if ( '' === $text ) { + /* + * If the text is empty after processing HTML entities and stripping + * U+0000 NULL bytes then ignore the token. + */ + return $this->step(); + } + + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + // Insert the character. + $this->insert_html_element( $this->state->current_token ); + return true; + } + } + + switch ( $op ) { + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > A start tag whose tag name is one of: "base", "basefont", "bgsound", "link" + */ + case '+BASE': + case '+BASEFONT': + case '+BGSOUND': + case '+LINK': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "meta" + */ + case '+META': + $this->insert_html_element( $this->state->current_token ); + + /* + * > If the active speculative HTML parser is null, then: + * > - If the element has a charset attribute, and getting an encoding from + * > its value results in an encoding, and the confidence is currently + * > tentative, then change the encoding to the resulting encoding. + */ + $charset = $this->get_attribute( 'charset' ); + if ( is_string( $charset ) ) { + $this->bail( 'Cannot yet process META tags with charset to determine encoding.' ); + } + + /* + * > - Otherwise, if the element has an http-equiv attribute whose value is + * > an ASCII case-insensitive match for the string "Content-Type", and + * > the element has a content attribute, and applying the algorithm for + * > extracting a character encoding from a meta element to that attribute's + * > value returns an encoding, and the confidence is currently tentative, + * > then change the encoding to the extracted encoding. + */ + $http_equiv = $this->get_attribute( 'http-equiv' ); + $content = $this->get_attribute( 'content' ); + if ( + is_string( $http_equiv ) && + is_string( $content ) && + 0 === strcasecmp( $http_equiv, 'Content-Type' ) + ) { + $this->bail( 'Cannot yet process META tags with http-equiv Content-Type to determine encoding.' ); + } + + return true; + + /* + * > A start tag whose tag name is "title" + */ + case '+TITLE': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "noscript", if the scripting flag is enabled + * > A start tag whose tag name is one of: "noframes", "style" + * + * The scripting flag is never enabled in this parser. + */ + case '+NOFRAMES': + case '+STYLE': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "noscript", if the scripting flag is disabled + */ + case '+NOSCRIPT': + $this->insert_html_element( $this->state->current_token ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT; + return true; + + /* + * > A start tag whose tag name is "script" + * + * @todo Could the adjusted insertion location be anything other than the current location? + */ + case '+SCRIPT': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > An end tag whose tag name is "head" + */ + case '-HEAD': + $this->state->stack_of_open_elements->pop(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD; + return true; + + /* + * > An end tag whose tag name is one of: "body", "html", "br" + */ + case '-BODY': + case '-HTML': + case '-BR': + /* + * > Act as described in the "anything else" entry below. + */ + goto in_head_anything_else; + break; + + /* + * > A start tag whose tag name is "template" + * + * @todo Could the adjusted insertion location be anything other than the current location? + */ + case '+TEMPLATE': + $this->state->active_formatting_elements->insert_marker(); + $this->state->frameset_ok = false; + + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; + $this->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > An end tag whose tag name is "template" + */ + case '-TEMPLATE': + if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) { + // @todo Indicate a parse error once it's possible. + return $this->step(); + } + + $this->generate_implied_end_tags_thoroughly(); + if ( ! $this->state->stack_of_open_elements->current_node_is( 'TEMPLATE' ) ) { + // @todo Indicate a parse error once it's possible. + } + + $this->state->stack_of_open_elements->pop_until( 'TEMPLATE' ); + $this->state->active_formatting_elements->clear_up_to_last_marker(); + array_pop( $this->state->stack_of_template_insertion_modes ); + $this->reset_insertion_mode(); + return true; + } + + /* + * > A start tag whose tag name is "head" + * > Any other end tag + */ + if ( '+HEAD' === $op || $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Anything else + */ + in_head_anything_else: + $this->state->stack_of_open_elements->pop(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -2991,7 +3195,117 @@ private function step_in_select_in_table(): bool { * @return bool Whether an element was found. */ private function step_in_template(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE . ' state.' ); + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = $this->is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A character token + * > A comment token + * > A DOCTYPE token + */ + case '#text': + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + case 'html': + return $this->step_in_body(); + + /* + * > A start tag whose tag name is one of: "base", "basefont", "bgsound", "link", + * > "meta", "noframes", "script", "style", "template", "title" + * > An end tag whose tag name is "template" + */ + case '+BASE': + case '+BASEFONT': + case '+BGSOUND': + case '+LINK': + case '+META': + case '+NOFRAMES': + case '+SCRIPT': + case '+STYLE': + case '+TEMPLATE': + case '+TITLE': + case '-TEMPLATE': + return $this->step_in_head(); + + /* + * > A start tag whose tag name is one of: "caption", "colgroup", "tbody", "tfoot", "thead" + */ + case '+CAPTION': + case '+COLGROUP': + case '+TBODY': + case '+TFOOT': + case '+THEAD': + array_pop( $this->state->stack_of_template_insertion_modes ); + $this->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE; + return $this->step( self::REPROCESS_CURRENT_NODE ); + + /* + * > A start tag whose tag name is "col" + */ + case '+COL': + array_pop( $this->state->stack_of_template_insertion_modes ); + $this->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP; + return $this->step( self::REPROCESS_CURRENT_NODE ); + + /* + * > A start tag whose tag name is "tr" + */ + case '+TR': + array_pop( $this->state->stack_of_template_insertion_modes ); + $this->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); + + /* + * > A start tag whose tag name is one of: "td", "th" + */ + case '+TD': + case '+TH': + array_pop( $this->state->stack_of_template_insertion_modes ); + $this->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW; + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /* + * > Any other start tag + */ + if ( ! $is_closer ) { + array_pop( $this->state->stack_of_template_insertion_modes ); + $this->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /* + * > Any other end tag + */ + if ( $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > An end-of-file token + */ + if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) { + // Stop parsing. + return false; + } + + // @todo Indicate a parse error once it's possible. + $this->state->stack_of_open_elements->pop_until( 'TEMPLATE' ); + $this->state->active_formatting_elements->clear_up_to_last_marker(); + array_pop( $this->state->stack_of_template_insertion_modes ); + $this->reset_insertion_mode(); + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php index 7dd94747fd8e8..0dbd45cfa0ead 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php @@ -46,8 +46,10 @@ public static function data_single_tag_of_supported_elements() { 'ASIDE', 'AUDIO', 'B', + 'BASE', 'BDI', 'BDO', + 'BGSOUND', // Deprectated. 'BIG', 'BLINK', // Deprecated. 'BR', @@ -93,12 +95,14 @@ public static function data_single_tag_of_supported_elements() { 'KEYGEN', // Deprecated. 'LABEL', 'LEGEND', + 'LINK', 'LISTING', // Deprecated. 'MAIN', 'MAP', 'MARK', 'MARQUEE', // Deprecated. 'MENU', + 'META', 'METER', 'MULTICOL', // Deprecated. 'NAV', @@ -178,24 +182,19 @@ public function test_fails_when_encountering_unsupported_tag( $html ) { */ public static function data_unsupported_elements() { $unsupported_elements = array( - 'BASE', - 'BGSOUND', // Deprecated; self-closing if self-closing flag provided, otherwise normal. 'BODY', 'FRAME', 'FRAMESET', 'HEAD', 'HTML', 'IFRAME', - 'LINK', 'MATH', - 'META', 'NOEMBED', // Neutralized. 'NOFRAMES', // Neutralized. 'PLAINTEXT', // Neutralized. 'SCRIPT', 'STYLE', 'SVG', - 'TEMPLATE', 'TEXTAREA', 'TITLE', 'XMP', // Deprecated, use PRE instead. diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 8487df26c99dc..69329f51321ba 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -34,6 +34,7 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase { 'adoption01/line0046' => 'Unimplemented: Reconstruction of active formatting elements.', 'adoption01/line0159' => 'Unimplemented: Reconstruction of active formatting elements.', 'adoption01/line0318' => 'Unimplemented: Reconstruction of active formatting elements.', + 'template/line0885' => 'Unimplemented: no parsing of attributes on context node.', 'tests1/line0720' => 'Unimplemented: Reconstruction of active formatting elements.', 'tests15/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', 'tests15/line0022' => 'Unimplemented: Reconstruction of active formatting elements.', @@ -163,25 +164,34 @@ private static function build_tree_representation( ?string $fragment_context, st return null; } - if ( $was_text && '#text' !== $processor->get_token_name() ) { + $token_name = $processor->get_token_name(); + $token_type = $processor->get_token_type(); + $is_closer = $processor->is_tag_closer(); + + if ( $was_text && '#text' !== $token_name ) { $output .= "{$text_node}\"\n"; $was_text = false; $text_node = ''; } - switch ( $processor->get_token_type() ) { + switch ( $token_type ) { case '#tag': - $tag_name = strtolower( $processor->get_tag() ); + $tag_name = strtolower( $token_name ); - if ( $processor->is_tag_closer() ) { + if ( $is_closer ) { --$indent_level; + + if ( 'TEMPLATE' === $token_name ) { + --$indent_level; + } + break; } - $tag_indent = count( $processor->get_breadcrumbs() ) - 1; + $tag_indent = $indent_level; if ( ! WP_HTML_Processor::is_void( $tag_name ) ) { - $indent_level = $tag_indent + 1; + ++$indent_level; } $output .= str_repeat( $indent, $tag_indent ) . "<{$tag_name}>\n"; @@ -209,6 +219,11 @@ private static function build_tree_representation( ?string $fragment_context, st $output .= str_repeat( $indent, $indent_level ) . "\"{$modifiable_text}\"\n"; } + if ( 'TEMPLATE' === $token_name ) { + $output .= str_repeat( $indent, $indent_level ) . "content\n"; + ++$indent_level; + } + if ( ! $processor->is_void( $tag_name ) && ! $processor->expects_closer() ) { --$indent_level; }