From 91d80494094cfe9c018621dd6a463fd130fbed97 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Oct 2023 15:59:10 -0700 Subject: [PATCH 1/5] WIP: Rely on Unicode processing within a RegExp pattern for title sanitization. From time to time new issues arise with the sanitization of a post title for slug creation. The existing algorithm takes a hit-and-miss approach of handling specific cases of known string elements that cause problems and replacing them with normalized characters. In this patch a given title is first converted into a normalized form and then processed with a Unicode-aware PCRE pattern which formalizes the _kinds_ of replacements which are supposed to occur. For example, instead of removing "%c2%ab" the code now removes "invisible characters" as defined by Unicode itself. This update, if it works without breaking existing dependencies, offers a more comprehensive solution to the problem of slug generation, one that updates with advancements to the Unicode specification provided by system libraries and PHP itself instead of through custom WordPress code. --- src/wp-includes/formatting.php | 113 ++++++++------------------------- 1 file changed, 26 insertions(+), 87 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index cd1ee2689489e..748992e589afa 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -2260,111 +2260,50 @@ function sanitize_title_for_query( $title ) { */ function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'display' ) { $title = strip_tags( $title ); - // Preserve escaped octets. - $title = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $title ); - // Remove percent signs that are not part of an octet. - $title = str_replace( '%', '', $title ); - // Restore octets. 
- $title = preg_replace( '|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title ); if ( seems_utf8( $title ) ) { if ( function_exists( 'mb_strtolower' ) ) { $title = mb_strtolower( $title, 'UTF-8' ); } + $title = utf8_uri_encode( $title, 200 ); } $title = strtolower( $title ); if ( 'save' === $context ) { - // Convert  , &ndash, and &mdash to hyphens. - $title = str_replace( array( '%c2%a0', '%e2%80%93', '%e2%80%94' ), '-', $title ); - // Convert  , &ndash, and &mdash HTML entities to hyphens. - $title = str_replace( array( ' ', ' ', '–', '–', '—', '—' ), '-', $title ); - // Convert forward slash to hyphen. - $title = str_replace( '/', '-', $title ); - - // Strip these characters entirely. - $title = str_replace( - array( - // Soft hyphens. - '%c2%ad', - // ¡ and ¿. - '%c2%a1', - '%c2%bf', - // Angle quotes. - '%c2%ab', - '%c2%bb', - '%e2%80%b9', - '%e2%80%ba', - // Curly quotes. - '%e2%80%98', - '%e2%80%99', - '%e2%80%9c', - '%e2%80%9d', - '%e2%80%9a', - '%e2%80%9b', - '%e2%80%9e', - '%e2%80%9f', - // Bullet. - '%e2%80%a2', - // ©, ®, °, &hellip, and &trade. - '%c2%a9', - '%c2%ae', - '%c2%b0', - '%e2%80%a6', - '%e2%84%a2', - // Acute accents. - '%c2%b4', - '%cb%8a', - '%cc%81', - '%cd%81', - // Grave accent, macron, caron. - '%cc%80', - '%cc%84', - '%cc%8c', - // Non-visible characters that display without a width. - '%e2%80%8b', // Zero width space. - '%e2%80%8c', // Zero width non-joiner. - '%e2%80%8d', // Zero width joiner. - '%e2%80%8e', // Left-to-right mark. - '%e2%80%8f', // Right-to-left mark. - '%e2%80%aa', // Left-to-right embedding. - '%e2%80%ab', // Right-to-left embedding. - '%e2%80%ac', // Pop directional formatting. - '%e2%80%ad', // Left-to-right override. - '%e2%80%ae', // Right-to-left override. - '%ef%bb%bf', // Byte order mark. - '%ef%bf%bc', // Object replacement character. 
- ), - '', + $title = html_entity_decode( $title, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ); + $title = rawurldecode( $title ); + // Dash/hyphen symbols plus whitespace turn into a dash. + $title = preg_replace( '~[\p{Pd}\s/]+~u', '-', $title ); + // Control characters, combining marks, symbols, punctuation, and invisible characters are removed. + $title = preg_replace_callback( + '~-[_-]+|[\p{C}\p{M}\p{S}\p{P}\p{Z}]+~u', + static function ( $chunk ) { + switch ( $chunk[0] ) { + case '-': + return '-'; + + case '_': + return '_'; + + default: + return ''; + } + }, $title ); - - // Convert non-visible characters that display with a width to hyphen. $title = str_replace( array( - '%e2%80%80', // En quad. - '%e2%80%81', // Em quad. - '%e2%80%82', // En space. - '%e2%80%83', // Em space. - '%e2%80%84', // Three-per-em space. - '%e2%80%85', // Four-per-em space. - '%e2%80%86', // Six-per-em space. - '%e2%80%87', // Figure space. - '%e2%80%88', // Punctuation space. - '%e2%80%89', // Thin space. - '%e2%80%8a', // Hair space. - '%e2%80%a8', // Line separator. - '%e2%80%a9', // Paragraph separator. - '%e2%80%af', // Narrow no-break space. + "\xD7", // Replace multiplication sign "×" with "x". + '�', // Remove invalid decoded characters. + ), + array( + 'x', + '', ), - '-', $title ); - - // Convert × to 'x'. - $title = str_replace( '%c3%97', 'x', $title ); } // Remove HTML entities. From 54d8afec36f9ed845c883e128ac3cf85b4a22e91 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sun, 23 Jun 2024 13:49:00 +0100 Subject: [PATCH 2/5] Build fully Unicode-aware slugifier. 
--- src/wp-includes/formatting.php | 452 +++++++++++++++++++++++++++++++++ 1 file changed, 452 insertions(+) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 748992e589afa..5616d388a5a20 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -2243,6 +2243,458 @@ function sanitize_title_for_query( $title ) { return sanitize_title( $title, '', 'query' ); } +/** + * Translates a code point into a sequence of bytes as if encoded into UTF-8, + * but ignoring the restriction that surrogate halves must not be encoded. + * + * Avoid using this function, as it's meant to be used internally in sensitive + * and controlled environments. It can be used to generate invalid UTF-8. + * + * @access private + * + * @since {WP_VERSION} + * + * @param int $code_point Any Unicode code point, including the unassigned surrogate half values. + * @return string|null Bytes encoded using the UTF-8 algorithm, which might be invalid UTF-8, + * if possible to encode, otherwise `null`. + */ +function utf8_naive_codepoint_to_bytes( int $code_point ): ?string { + if ( 0 > $code_point || 0x10FFFF < $code_point ) { + return null; + } + + if ( $code_point <= 0x7F ) { + return chr( $code_point ); + } + + if ( $code_point <= 0x7FF ) { + $byte1 = ( $code_point >> 6 ) | 0xC0; + $byte2 = $code_point & 0x3F | 0x80; + + return pack( 'CC', $byte1, $byte2 ); + } + + if ( $code_point <= 0xFFFF ) { + $byte1 = ( $code_point >> 12 ) | 0xE0; + $byte2 = ( $code_point >> 6 ) & 0x3F | 0x80; + $byte3 = $code_point & 0x3F | 0x80; + + return pack( 'CCC', $byte1, $byte2, $byte3 ); + } + + // Any values above U+10FFFF are eliminated above in the pre-check. 
+ $byte1 = ( $code_point >> 18 ) | 0xF0; + $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80; + $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80; + $byte4 = $code_point & 0x3F | 0x80; + + return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 ); +} + +/** + * Attempts to read a UTF-8 percent-escaped code point in the given + * text at the given starting point, measured in bytes. + * + * Example: + * + * null === read_utf8_percent_escaped_code_point( 'Unicode', 0 ); + * "…" === "\xC2\xA0" === read_utf8_percent_escaped_code_point( 'White%C2%A0Space', 5 ); + * null === read_utf8_percent_escaped_code_point( '%A0Is   in ISO-8859-1', 0, $matched_byte_length ); + * 3 === $matched_byte_length; + * "🅰" === read_utf8_percent_escaped_code_point( '%F0%9F%85%B0', 0, $matched_byte_length, $code_point ); + * 12 === $matched_byte_length; + * 0x1F170 === $code_point; + * + * @since {WP_VERSION} + * + * @param string $text Text potentially containing percent-escapes. + * @param int $starting_at_byte Where to start looking for the escaped code point. + * @param int|null $matched_byte_length Optional. When provided, is set to the number of bytes + * scanned in the given text to find a code point. It may + * be non-zero when no code point is found, if an invalid + * UTF-8 byte sequence had been decoded otherwise. + * @param int|null $code_point Optional. When provided and a code point is decoded, will + * be set to the value of the code point, otherwise not set. + * @return string|null Decoded code point in UTF-8 bytes, or `null` if none found. + */ +function read_utf8_percent_escaped_code_point( string $text, int $starting_at_byte, int &$matched_byte_length = null, int &$code_point = null ): ?string { + $at = $starting_at_byte; + $end = strlen( $text ); + $buffer = ''; + $need = null; + + /** + * Indicates how many bytes are expected for a given leading byte. 
+ */ + $length_table = "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x02\x02\x02\x02\x03\x03\x04\x00"; + + $matched_byte_length = 0; + while ( $at < $end ) { + $byte = $text[ $at ]; + + if ( '%' !== $byte || $at + 2 >= $end ) { + break; + } + + $leading_byte = wp_hex_to_int( $text, $at + 1, 2 ); + if ( null === $leading_byte ) { + break; + } + + if ( ! isset( $need ) ) { + $need = ord( $length_table[ $leading_byte >> 3 ] ); + } + + if ( 1 === $need && '' === $buffer ) { + $matched_byte_length = 3; + $code_point = $leading_byte; + return chr( $leading_byte ); + } + + if ( $leading_byte < 0x80 ) { + break; + } + + $buffer .= chr( $leading_byte ); + $at += 3; + } + + $matched_byte_length = $at - $starting_at_byte; + + /* + * At this point the buffer should be full and equal in length to the expected + * byte need. If it isn't, or if those bytes aren't valid UTF-8, this should fail. + */ + $buffer_length = strlen( $buffer ); + if ( $buffer_length !== $need ) { + return null; + } + + $state = 0; + $code_point = utf8_read_next_code_point( $buffer, 0, $state ); + + return 0 === $state ? $buffer : null; +} + +/** + * Decodes UTF-8 encoded bytes in a string at a given starting byte offset. + * + * @since {WP_VERSION} + * + * @param string $text UTF-8 text to decode. + * @param int $starting_byte Byte offset into text where next code point starts. + * @param int $state Error tracker passed through multiple invocations of this function. + * A non-zero value indicates that there is an error. + * @param int $matched_bytes Optional. Set to how many bytes were consumed while parsing the code point. + * @return int|null Decoded code point if found, else `null`. + */ +function utf8_read_next_code_point( string $text, int $starting_byte, int &$state, int &$matched_bytes = null ): ?int { + /** + * State classification and transition table for UTF-8 validation. 
+ * + * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + */ + static $state_table = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x0a\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03\x0b\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x00\x0c\x18\x24\x3c\x60\x54\x0c\x0c\x0c\x30\x48\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x00\x0c\x0c\x0c\x0c\x0c\x00\x0c\x00\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x18\x0c\x18\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x24\x0c\x24\x0c\x0c\x0c\x24\x0c\x0c\x0c\x0c\x0c\x24\x0c\x24\x0c\x0c\x0c\x24\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c"; + + /** + * This branchless UTF-8 decoding algorithm computes the + * code point and validates in one efficient scan. 
+ * + * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + */ + $end = strlen( $text ); + $code_point = 0; + + for ( $at = $starting_byte; $at < $end && ( $at === $starting_byte || 0 !== $state ); $at++ ) { + $byte = ord( $text[ $at ] ); + $classification = ord( $state_table[ $byte ] ); + + // Append continuation bits to code point or collect the first byte's bits. + $code_point = ( 0 === $state ) + ? ( ( 0xFF >> $classification ) & $byte ) + : ( ( $byte & 0x3F ) | ( $code_point << 6 ) ); + + $state = ord( $state_table[ 256 + $state + $classification ] ); + } + + $matched_bytes = $at - $starting_byte; + + return $code_point; +} + +/** + * Converts hexadecimal text into an integer value, or `null` if not able to decode. + * + * If unable to parse as many digits as provided into a valid int, this function will + * return `null`. This may be because of a failure to find proper digits or because + * of an integer overflow while performing the conversion. + * + * Example: + * + * 143 === wp_hex_to_int( "8F" ); + * null === wp_hex_to_int( "Train" ); + * + * // It's possible to decode inside an existing string without performing allocations. + * 194 === wp_hex_to_int( "Cats%c2%a0and%c2%a0Dogs", 5, 2 ); + * + * // There are only two hexademical digits in the given span, but three were requested. + * null === wp_hex_to_int( "%2e%a6", 1, 3 ); + * + * // Integers are limited by PHP_INT_MAX, so it's not possible to decode a bigger number. + * null === wp_hex_to_int( "FFFFFFFFFFFFFFFFFFF" ); + * + * @since {WP_VERSION} + * + * @param string $text Text containing span of hexadecimal digits to decode. + * @param int $starting_byte Optional. Starting byte offset into text where digits begin. + * Default is to start at the beginning of the given text. + * @param int $byte_length Optional. Byte-length of span of text containing hexadecimal + * digits. Default is to decode until the end of the given text. + * @return int|null Decoded integer if properly decoded, otherwise `null`. 
+ */ +function wp_hex_to_int( string $text, int $starting_byte = 0, int $byte_length = null ) { + /** + * This table lookup provides for non-branching decoding of hex digits to integer values. + * + * The 0xFF value indicates that the character is not a hexadecimal digit, while any + * other value indicates the integer value that digit represents. For instance, the + * character in the 66th position, string-index 65, corresponding to U+41 "A", is "\x0A". + * + * Thus, a lookup for `ord("A")` will retrieve `\x0A` and `ord( "\x0A" )` is `10`. + */ + static $table = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + + $value = 0; + $end = isset( $byte_length ) ? $starting_byte + $byte_length : strlen( $text ); + $pre_max = PHP_INT_MAX >> 4; + + for ( $at = $starting_byte; $at < $end; $at++ ) { + $c = $text[ $at ]; + $nibble = ord( $table[ ord( $c ) ] ); + + /* + * Whether encountering something that isn't a hex digit or an integer overflow, + * this only returns properly-decoded integers. 
Return `null` for these cases. + */ + if ( 0xFF === $nibble || $value > $pre_max ) { + return null; + } + + $value <<= 4; + + /* + * This overflow must be caught before adding to the value, otherwise + * it would overflow and appear small by the time of this check. + */ + if ( $value > PHP_INT_MAX - $nibble ) { + return null; + } + + $value |= $nibble; + } + + return $value; +} + +/** + * Converts text content into a slug for display. + * + * Slugs are used as identifiers in contexts which are largely + * US ASCII letters and which should be easy to recognize and + * type, for example, in a blog post's permalink. + * + * Example: + * + * 'the-forest-for-the-trees' === slugify( 'The forest for the trees.' ); + * + * $slug = slugify( "This%c2%a0cannot–work¿ Even - for 5% »correctness«?" ); + * $slug === 'This-cannot-work-Even-for-5%-correctness?'; + * + * @since {WP_VERSION} + * + * @param string $sluggee Text content to convert into a slug. + * @return string Slugified version of given text content. + */ +function slugify( string $sluggee ): string { + $sluggee = WP_HTML_Decoder::decode_text_node( $sluggee ); + if ( function_exists( 'normalizer_normalize' ) ) { + $sluggee = normalizer_normalize( $sluggee, Normalizer::FORM_C ); + } + + $slug = ''; + $slug_code_points = 0; + $at = 0; + $end = strlen( $sluggee ); + $last = null; + + while ( $at < $end && $slug_code_points <= 200 ) { + $c = $sluggee[ $at ]; + + if ( "\x00" === $c ) { + ++$at; + goto combining_dash; // phpcs:disable + } + + // ASCII alphanumerics pass directly. 
+ if ( + ( 'A' <= $c && 'Z' >= $c ) || + ( 'a' <= $c && 'z' >= $c ) || + ( '0' <= $c && '9' >= $c ) + ) { + $slug .= $c; + $last = $c; + ++$at; + ++$slug_code_points; + continue; + } + + $matched_bytes = 0; + $state = 0; + $code_point = utf8_read_next_code_point( $sluggee, $at, $state, $matched_bytes ); + $at += $matched_bytes; + $code_point_hex = dechex( $code_point ); + $char = substr( $sluggee, $at - $matched_bytes, $matched_bytes ); + + /* + * Replace invalid UTF-8 with a dash. + * + * Normally this would be replaced with U+FFFD (�) but this wouldn't + * work well as a slug. So in this case, to avoid joining strings that + * were separated, the dash is used as a safe fallback. + */ + if ( null === $code_point ) { + goto combining_dash; // phpcs:disable + } + + // Decode percent-escaped characters. + if ( '%' === $c && $at + 1 < $end ) { + $matched_bytes = 0; + $code_point = null; + $next_char = read_utf8_percent_escaped_code_point( $sluggee, $at - 1, $matched_bytes, $code_point ); + if ( isset( $next_char ) ) { + $at += $matched_bytes - 1; + } else { + $code_point = 0x25; + } + } + + // Allow dashy things. + $is_dashy = in_array( + $code_point, + array( + 0x20, // Space. + 0x2D, // Hyphen-minus. + 0x2E, // Full stop. + 0x2F, // Solidus. + 0xA0, // No-break space. + 0x2000, // En quad. + 0x2001, // Em quad. + 0x2002, // En space. + 0x2003, // Em space. + 0x2004, // Three-per-em space. + 0x2005, // Four-per-em space. + 0x2006, // Six-per-em space. + 0x2007, // Figure space. + 0x2008, // Punctuation space. + 0x2009, // Thin space. + 0x200A, // Hair space. + 0x2010, // Hyphen. + 0x2011, // Non-breaking hyphen. + 0x2012, // Figure dash. + 0x2013, // En dash. + 0x2014, // Em dash. + 0x2015, // Horizontal bar. + 0x2028, // Line separator. + 0x2029, // Paragraph separator. + 0x202F, // Narrow no-break space. + 0x2E3A, // Two-em dash. + 0x2E3B, // Three-em dash. + 0xFE58, // Small em dash. + 0xFE63, // Small hyphen-minus. + 0xFF0D, // Fullwidth hyphen-minus. 
+ ), + true + ); + + if ( $is_dashy ) { + goto combining_dash; // phpcs:disable + } + + // Convert `×` (U+D7, "×") to 'x'. + if ( 0xD7 === $code_point ) { + $slug .= 'x'; + $last = 'x'; + ++$slug_code_points; + continue; + } + + $should_remove = in_array( + $code_point, + array( + 0x25, // Percent sign. + 0xA1, // Inverted exclamation mark. + 0xA9, // Copyright sign. + 0xAB, // Left-pointing double angle quotation mark. + 0xAD, // Soft hyphen. + 0xAE, // Registered sign. + 0xB0, // Degree sign. + 0xB4, // Acute accent. + 0xBB, // Right-pointing double angle quotation mark. + 0xBF, // Inverted question mark. + 0x02CA, // Modifier letter acute accent. + 0x0300, // Combining grave accent. + 0x0301, // Combining acute accent. + 0x0304, // Combining macron. + 0x030C, // Combining caron. + 0x0341, // Combining acute tone mark. + 0x200B, // Zero-width space. + 0x200C, // Zero-width non-joiner. + 0x200D, // Zero-width joiner. + 0x200E, // Left-to-right mark. + 0x200F, // Right-to-left mark. + 0x2018, // Left single quotation mark. + 0x2019, // Right single quotation mark. + 0x201A, // Single low-9 quotation mark. + 0x201B, // Single high-reversed-9 quotation mark. + 0x201C, // Left double quotation mark. + 0x201D, // Right double quotation mark. + 0x201E, // Double low-9 quotation mark. + 0x201F, // Double high-reversed-9 quotation mark. + 0x2022, // Bullet. + 0x2026, // Horizontal ellipsis. + 0x202A, // Left-to-right embedding. + 0x202B, // Right-to-left embedding. + 0x202C, // Pop directional formatting. + 0x202D, // Left-to-right override. + 0x202E, // Right-to-left override. + 0x2039, // Single left-pointing angle quotation mark. + 0x203A, // Single right-pointing angle quotation mark. + 0x2122, // Trade mark sign (sic). + 0xFEFF, // Byte order mark (Zero-width no-break space). + 0xFFFC, // Object-replacement character. + ), + true + ); + + if ( ! 
$should_remove ) { + $slug .= substr( $sluggee, $at - $matched_bytes, $matched_bytes ); + $last = ''; + ++$slug_code_points; + } + + continue; + + combining_dash: + if ( '-' !== $last ) { + $slug .= '-'; + $last = '-'; + ++$slug_code_points; + } + } + + return '-' === $last ? substr( $slug, 0, -1 ) : $slug; +} + /** * Sanitizes a title, replacing whitespace and a few other characters with dashes. * From 3b332b33e31bdad1355d2e9ca59e097ed89d1935 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 1 Jul 2024 13:50:07 -0700 Subject: [PATCH 3/5] fixup! WIP: Rely on Unicode processing within a RegExp pattern for title sanitization. --- src/wp-includes/formatting.php | 113 +++++++++++++++++++++++++-------- 1 file changed, 87 insertions(+), 26 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 5616d388a5a20..8bd82f4e79c47 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -2712,50 +2712,111 @@ function slugify( string $sluggee ): string { */ function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'display' ) { $title = strip_tags( $title ); + // Preserve escaped octets. + $title = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $title ); + // Remove percent signs that are not part of an octet. + $title = str_replace( '%', '', $title ); + // Restore octets. + $title = preg_replace( '|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title ); if ( seems_utf8( $title ) ) { if ( function_exists( 'mb_strtolower' ) ) { $title = mb_strtolower( $title, 'UTF-8' ); } - $title = utf8_uri_encode( $title, 200 ); } $title = strtolower( $title ); if ( 'save' === $context ) { - $title = html_entity_decode( $title, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ); - $title = rawurldecode( $title ); - // Dash/hyphen symbols plus whitespace turn into a dash. 
- $title = preg_replace( '~[\p{Pd}\s/]+~u', '-', $title ); - // Control characters, combining marks, symbols, punctuation, and invisible characters are removed. - $title = preg_replace_callback( - '~-[_-]+|[\p{C}\p{M}\p{S}\p{P}\p{Z}]+~u', - static function ( $chunk ) { - switch ( $chunk[0] ) { - case '-': - return '-'; - - case '_': - return '_'; - - default: - return ''; - } - }, - $title - ); + // Convert  , &ndash, and &mdash to hyphens. + $title = str_replace( array( '%c2%a0', '%e2%80%93', '%e2%80%94' ), '-', $title ); + // Convert  , &ndash, and &mdash HTML entities to hyphens. + $title = str_replace( array( ' ', ' ', '–', '–', '—', '—' ), '-', $title ); + // Convert forward slash to hyphen. + $title = str_replace( '/', '-', $title ); + + // Strip these characters entirely. $title = str_replace( array( - "\xD7", // Replace multiplication sign "×" with "x". - '�', // Remove invalid decoded characters. + // Soft hyphens. + '%c2%ad', + // ¡ and ¿. + '%c2%a1', + '%c2%bf', + // Angle quotes. + '%c2%ab', + '%c2%bb', + '%e2%80%b9', + '%e2%80%ba', + // Curly quotes. + '%e2%80%98', + '%e2%80%99', + '%e2%80%9c', + '%e2%80%9d', + '%e2%80%9a', + '%e2%80%9b', + '%e2%80%9e', + '%e2%80%9f', + // Bullet. + '%e2%80%a2', + // ©, ®, °, &hellip, and &trade. + '%c2%a9', + '%c2%ae', + '%c2%b0', + '%e2%80%a6', + '%e2%84%a2', + // Acute accents. + '%c2%b4', + '%cb%8a', + '%cc%81', + '%cd%81', + // Grave accent, macron, caron. + '%cc%80', + '%cc%84', + '%cc%8c', + // Non-visible characters that display without a width. + '%e2%80%8b', // Zero width space. + '%e2%80%8c', // Zero width non-joiner. + '%e2%80%8d', // Zero width joiner. + '%e2%80%8e', // Left-to-right mark. + '%e2%80%8f', // Right-to-left mark. + '%e2%80%aa', // Left-to-right embedding. + '%e2%80%ab', // Right-to-left embedding. + '%e2%80%ac', // Pop directional formatting. + '%e2%80%ad', // Left-to-right override. + '%e2%80%ae', // Right-to-left override. + '%ef%bb%bf', // Byte order mark. 
+ '%ef%bf%bc', // Object replacement character. ), + '', + $title + ); + + // Convert non-visible characters that display with a width to hyphen. + $title = str_replace( array( - 'x', - '', + '%e2%80%80', // En quad. + '%e2%80%81', // Em quad. + '%e2%80%82', // En space. + '%e2%80%83', // Em space. + '%e2%80%84', // Three-per-em space. + '%e2%80%85', // Four-per-em space. + '%e2%80%86', // Six-per-em space. + '%e2%80%87', // Figure space. + '%e2%80%88', // Punctuation space. + '%e2%80%89', // Thin space. + '%e2%80%8a', // Hair space. + '%e2%80%a8', // Line separator. + '%e2%80%a9', // Paragraph separator. + '%e2%80%af', // Narrow no-break space. ), + '-', $title ); + + // Convert × to 'x'. + $title = str_replace( '%c3%97', 'x', $title ); } // Remove HTML entities. From 19cf620f186f571014ba9440a741ee9458e96512 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 1 Jul 2024 13:51:50 -0700 Subject: [PATCH 4/5] fixup! Build fully Unicode-aware slugifier. --- src/wp-includes/formatting.php | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 8bd82f4e79c47..f95404bffd901 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -2552,8 +2552,6 @@ function slugify( string $sluggee ): string { $state = 0; $code_point = utf8_read_next_code_point( $sluggee, $at, $state, $matched_bytes ); $at += $matched_bytes; - $code_point_hex = dechex( $code_point ); - $char = substr( $sluggee, $at - $matched_bytes, $matched_bytes ); /* * Replace invalid UTF-8 with a dash. 
From e13d4411df4763ae8c98ff22693aa8994092d199 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 1 Jul 2024 13:52:54 -0700 Subject: [PATCH 5/5] Call slugify --- src/wp-includes/formatting.php | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index f95404bffd901..f2d3aa4de3b8c 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -2709,6 +2709,10 @@ function slugify( string $sluggee ): string { * @return string The sanitized title. */ function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'display' ) { + if ( 'display' === $context ) { + return slugify( $title ); + } + $title = strip_tags( $title ); // Preserve escaped octets. $title = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $title );