From d154bea3ff48e9966ffc3b560a4946883b3a327e Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Sun, 23 Jun 2024 23:22:26 +0100
Subject: [PATCH 1/3] Introduce custom UTF-8 decoding pipeline.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

WordPress relies on various extensions, regular expressions, and basic
string operations when working with text potentially encoded as UTF-8.

In this patch an efficient UTF-8 decoding pipeline is introduced which
can remove these dependencies, normalize all decoding behaviors, and
open up new kinds of processing opportunities.

The decoder was taken from [Björn Höhrmann]. While it may be possible
that other methods are more efficient, such as in the multi-byte
extension, this decoder provides a streamable interface useful for
more flexible kinds of processing: for example, whether or not to
replace invalid byte sequences, zero-memory-overhead code point
counting, and partially decoding strings.

[Björn Höhrmann]: http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
---
 src/wp-includes/formatting.php                | 451 +++++++++++++-----
 src/wp-includes/functions.php                 | 148 ++++++
 .../tests/formatting/utf8UriEncode.php        |  18 +-
 .../phpunit/tests/formatting/wpTrimWords.php  |   2 +-
 4 files changed, 492 insertions(+), 127 deletions(-)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 2068e5f3a70fb..2940bec0dadce 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -869,49 +869,17 @@ function shortcode_unautop( $text ) {
 }
 
 /**
- * Checks to see if a string is utf8 encoded.
- *
- * NOTE: This function checks for 5-Byte sequences, UTF8
- *       has Bytes Sequences with a maximum length of 4.
+ * Indicates if a given string represents valid UTF-8 bytes.
  *
  * @author bmorel at ssi dot fr (modified)
  * @since 1.2.1
+ * @since {WP_VERSION} Relies on custom decoder and no longer accepts invalid 5-byte UTF-8 sequences.
  *
- * @param string $str The string to be checked
- * @return bool True if $str fits a UTF-8 model, false otherwise.
+ * @param string $text Might represent valid UTF-8 bytes.
+ * @return bool Whether the text represents a valid UTF-8 byte stream.
  */
-function seems_utf8( $str ) {
-	mbstring_binary_safe_encoding();
-	$length = strlen( $str );
-	reset_mbstring_encoding();
-
-	for ( $i = 0; $i < $length; $i++ ) {
-		$c = ord( $str[ $i ] );
-
-		if ( $c < 0x80 ) {
-			$n = 0; // 0bbbbbbb
-		} elseif ( ( $c & 0xE0 ) === 0xC0 ) {
-			$n = 1; // 110bbbbb
-		} elseif ( ( $c & 0xF0 ) === 0xE0 ) {
-			$n = 2; // 1110bbbb
-		} elseif ( ( $c & 0xF8 ) === 0xF0 ) {
-			$n = 3; // 11110bbb
-		} elseif ( ( $c & 0xFC ) === 0xF8 ) {
-			$n = 4; // 111110bb
-		} elseif ( ( $c & 0xFE ) === 0xFC ) {
-			$n = 5; // 1111110b
-		} else {
-			return false; // Does not match any model.
-		}
-
-		for ( $j = 0; $j < $n; $j++ ) { // n bytes matching 10bbbbbb follow ?
-			if ( ( ++$i === $length ) || ( ( ord( $str[ $i ] ) & 0xC0 ) !== 0x80 ) ) {
-				return false;
-			}
-		}
-	}
-
-	return true;
+function seems_utf8( $text ) {
+	return utf8_is_valid_byte_stream( $text );
 }
 
 /**
@@ -1084,12 +1052,39 @@ function wp_specialchars_decode( $text, $quote_style = ENT_NOQUOTES ) {
 }
 
 /**
- * Checks for invalid UTF8 in a string.
+ * Returns a UTF-8 validated string, taking into account the blog charset.
+ *
+ * This function only processes its input text if the blog charset is set to UTF-8.
+ * When the blog charset is anything else, it will always short-circuit and return
+ * the unmodified input text.
+ *
+ * For UTF-8 blogs, this will always return the input text if it validates. But if
+ * it doesn't validate, the behavior depends on the value of `$strip`.
+ *
+ *  - If instructed not to strip invalid bytes, then an empty string will be returned.
+ *  - If instructed to strip invalid bytes, the portions of the string which are valid
+ *    will be returned and the invalid portions will be removed.
+ *
+ * Example:
+ *
+ *     'Hello, World! 🌎' === wp_check_invalid_utf8( 'Hello, World! 🌎' );
+ *
+ *     ''                          === wp_check_invalid_utf8( "Latin1 is n\xF6t valid UTF-8." );
+ *     'Latin1 is nt valid UTF-8.' === wp_check_invalid_utf8( "Latin1 is n\xF6t valid UTF-8.", true );
+ *
+ *     '' === wp_check_invalid_utf8( "Surrogate halves like '\xED\xA0\x80' are not permitted." );
+ *     $stripped = wp_check_invalid_utf8( "Surrogate halves like '\xED\xA0\x80' are not permitted.", true );
+ *     $stripped === "Surrogate halves like '' are not permitted.";
+ *
+ *     '' === wp_check_invalid_utf8( "Broken stream: \xC2\xC2" );
+ *     'Broken stream: ' === wp_check_invalid_utf8( "Broken stream: \xC2\xC2", true );
  *
  * @since 2.8.0
+ * @since {WP_VERSION} Relies on custom UTF-8 decoder to normalize behavior across environments.
  *
  * @param string $text   The text which is to be checked.
  * @param bool   $strip  Optional. Whether to attempt to strip out invalid UTF8. Default false.
+ *
  * @return string The checked text.
  */
 function wp_check_invalid_utf8( $text, $strip = false ) {
@@ -1108,94 +1103,178 @@ function wp_check_invalid_utf8( $text, $strip = false ) {
 		return $text;
 	}
 
-	// Check for support for utf8 in the installed PCRE library once and store the result in a static.
-	static $utf8_pcre = null;
-	if ( ! isset( $utf8_pcre ) ) {
-		// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
-		$utf8_pcre = @preg_match( '/^./u', 'a' );
-	}
-	// We can't demand utf8 in the PCRE installation, so just return the string in those cases.
-	if ( ! $utf8_pcre ) {
-		return $text;
+	if ( false === $strip ) {
+		return utf8_is_valid_byte_stream( $text ) ? $text : '';
 	}
 
-	// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- preg_match fails when it encounters invalid UTF8 in $text.
-	if ( 1 === @preg_match( '/^./us', $text ) ) {
+	// If the entire string is valid don't bother setting up the error-removal loop below.
+	if ( utf8_is_valid_byte_stream( $text, 0, $error_byte_at ) ) {
 		return $text;
 	}
 
-	// Attempt to strip the bad chars if requested (not recommended).
-	if ( $strip && function_exists( 'iconv' ) ) {
-		return iconv( 'utf-8', 'utf-8', $text );
+	$buffer = '';
+	$at     = 0;
+	$end    = strlen( $text );
+
+	while ( $at < $end ) {
+		/*
+		 * If there are errors in the byte stream, they need to be skipped.
+		 * Append the next chunk from the text into the buffer, then jump to
+		 * the next character that could potentially start a new code point.
+		 */
+		$buffer .= substr( $text, $at, $error_byte_at - $at );
+		$at      = $error_byte_at + 1;
+		while ( $at < $end ) {
+			if ( $text[ $at ] < "\x80" ) {
+				break;
+			}
+			++$at;
+		}
+
+		if ( utf8_is_valid_byte_stream( $text, $at, $error_byte_at ) ) {
+			$buffer .= substr( $text, $at );
+			break;
+		}
 	}
 
-	return '';
+	return $buffer;
 }
 
 /**
  * Encodes the Unicode values to be used in the URI.
  *
+ * Note that invalid UTF-8 data will be transparently passed to the encoded URL!
+ *
  * @since 1.5.0
  * @since 5.8.3 Added the `encode_ascii_characters` parameter.
+ * @since {WP_VERSION} Optimized to minimize string allocations and concatenations.
  *
- * @param string $utf8_string             String to encode.
- * @param int    $length                  Max length of the string
- * @param bool   $encode_ascii_characters Whether to encode ascii characters such as < " '
- * @return string String with Unicode encoded for URI.
+ * @param string $utf8_string             Valid UTF-8 byte string to encode.
+ * @param int    $max_byte_length         Max byte length of the returned string.
+ * @param bool   $encode_ascii_characters Whether to apply RFC 3986 encoding to ASCII bytes.
+ * @return string Encoded URI string.
  */
-function utf8_uri_encode( $utf8_string, $length = 0, $encode_ascii_characters = false ) {
-	$unicode        = '';
-	$values         = array();
-	$num_octets     = 1;
-	$unicode_length = 0;
+function utf8_uri_encode( $utf8_string, $max_byte_length = 0, $encode_ascii_characters = false ) {
+	if ( '' === $utf8_string ) {
+		return $utf8_string;
+	}
 
 	mbstring_binary_safe_encoding();
-	$string_length = strlen( $utf8_string );
-	reset_mbstring_encoding();
 
-	for ( $i = 0; $i < $string_length; $i++ ) {
+	$end        = strlen( $utf8_string );
+	$buffer     = '';
+	$max_length = 0 === $max_byte_length ? PHP_INT_MAX : $max_byte_length;
 
-		$value = ord( $utf8_string[ $i ] );
-
-		if ( $value < 128 ) {
-			$char                = chr( $value );
-			$encoded_char        = $encode_ascii_characters ? rawurlencode( $char ) : $char;
-			$encoded_char_length = strlen( $encoded_char );
-			if ( $length && ( $unicode_length + $encoded_char_length ) > $length ) {
+	if ( true ) {
+		/*
+		 * Alternate between flushing runs of plain ASCII (non-percent-sign)
+		 * bytes, passing through existing `%XX` escapes, and escaping runs of
+		 * non-ASCII bytes and stray percent signs.
+		 *
+		 * This will eliminate as many string allocations and concatenations as
+		 * is possible while stopping as soon as the limit has been reached.
+		 */
+		$at = 0;
+		while ( $at < $end ) {
+			if ( strlen( $buffer ) >= $max_length ) {
 				break;
 			}
-			$unicode        .= $encoded_char;
-			$unicode_length += $encoded_char_length;
-		} else {
-			if ( count( $values ) === 0 ) {
-				if ( $value < 224 ) {
-					$num_octets = 2;
-				} elseif ( $value < 240 ) {
-					$num_octets = 3;
-				} else {
-					$num_octets = 4;
-				}
-			}
 
-			$values[] = $value;
+			// Flush ASCII byte ranges.
+			$was_at = $at;
+			$max_at = min( $end, $at + $max_length - strlen( $buffer ) );
+			while ( $at < $max_at && $utf8_string[ $at ] < "\x80" && '%' !== $utf8_string[ $at ] ) {
+				++$at;
+			}
 
-			if ( $length && ( $unicode_length + ( $num_octets * 3 ) ) > $length ) {
-				break;
+			if ( $at > $was_at ) {
+				$chunk   = substr( $utf8_string, $was_at, $at - $was_at );
+				$buffer .= $encode_ascii_characters ? rawurlencode( $chunk ) : $chunk;
 			}
-			if ( count( $values ) === $num_octets ) {
-				for ( $j = 0; $j < $num_octets; $j++ ) {
-					$unicode .= '%' . dechex( $values[ $j ] );
-				}
 
-				$unicode_length += $num_octets * 3;
+			if ( $at + 2 < $max_at && '%' === $utf8_string[ $at ] ) {
+				$high = $utf8_string[ $at + 1 ];
+				$low  = $utf8_string[ $at + 2 ];
+
+				if ( ctype_xdigit( $high ) && ctype_xdigit( $low ) ) {
+					$buffer .= substr( $utf8_string, $at, 3 );
+					$at     += 3;
+					continue;
+				}
+			}
 
-				$values     = array();
-				$num_octets = 1;
+			// Escape the next chunk.
+			$was_at = $at;
+			$max_at = min( $end, $at + $max_length - strlen( $buffer ) );
+			while ( $at < $max_at && ( $utf8_string[ $at ] >= "\x80" || '%' === $utf8_string[ $at ] ) ) {
+				++$at;
 			}
+
+			$buffer .= rawurlencode( substr( $utf8_string, $was_at, $at - $was_at ) );
 		}
 	}
 
-	return $unicode;
+	// If it fits then nothing else needs to be done.
+	if ( strlen( $buffer ) <= $max_length ) {
+		reset_mbstring_encoding();
+
+		return $buffer;
+	}
+
+	/*
+	 * If the buffer is too long, it will be necessary to truncate
+	 * at the nearest boundary before the limit.
+	 *
+	 * If there's no escaped byte within distance of the max length
+	 * then it's safe to truncate the buffer at the max length.
+	 *
+	 * If there is an escaped byte, however, it's important to not
+	 * only back up to before the escaped byte, but also to the start
+	 * of the UTF-8 code point that the escaped byte is a part of.
+	 */
+
+	if (
+		'%' !== $buffer[ max( 0, $max_length - 1 ) ] &&
+		'%' !== $buffer[ max( 0, $max_length - 2 ) ] &&
+		'%' !== $buffer[ max( 0, $max_length - 3 ) ]
+	) {
+		reset_mbstring_encoding();
+
+		return substr( $buffer, 0, $max_length );
+	}
+
+	// Find the first boundary which could represent an initial UTF-8 byte.
+	$at = $max_length;
+	while ( $at >= 0 ) {
+		// `strrpos()` needs a negative offset to perform right-to-left searching.
+		$at = strrpos( $buffer, '%', $at - strlen( $buffer ) );
+
+		// For some invalid UTF-8 byte sequences no escape precedes the limit; fall through to the shared exit.
+		if ( false === $at ) {
+			break;
+		}
+
+		$high_bits = intval( $buffer[ $at + 1 ], 16 );
+
+		// Start bytes are either ASCII (0xxx xxxx) or (110x xxxx) or (1110 xxxx) or (1111 0xxx).
+		if (
+			$high_bits < 0x8 ||
+			( 0xC === ( $high_bits & 0xE ) ) ||
+			( 0xE === ( $high_bits & 0xF ) ) ||
+			( 0xF === $high_bits && 0x8 > intval( $buffer[ $at + 2 ], 16 ) )
+		) {
+			reset_mbstring_encoding();
+
+			return substr( $buffer, 0, $at );
+		}
+
+		--$at;
+	}
+
+	// No code point boundary exists before the limit (invalid UTF-8), so nothing fits; always restore the encoding.
+	reset_mbstring_encoding();
+
+	return '';
 }
 
 /**
@@ -2011,6 +2090,7 @@ function remove_accents( $text, $locale = '' ) {
  * filename that is allowed to be uploaded.
  *
  * @since 2.1.0
+ * @since {WP_VERSION} Relies on custom UTF-8 decoder to remove dependency on PCRE.
  *
  * @param string $filename The filename to be sanitized.
  * @return string The sanitized filename.
@@ -2021,22 +2101,14 @@ function sanitize_file_name( $filename ) {
 
 	$special_chars = array( '?', '[', ']', '/', '\\', '=', '<', '>', ':', ';', ',', "'", '"', '&', '$', '#', '*', '(', ')', '|', '~', '`', '!', '{', '}', '%', '+', '’', '«', '»', '”', '“', chr( 0 ) );
 
-	// Check for support for utf8 in the installed PCRE library once and store the result in a static.
-	static $utf8_pcre = null;
-	if ( ! isset( $utf8_pcre ) ) {
-		// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
-		$utf8_pcre = @preg_match( '/^./u', 'a' );
-	}
-
 	if ( ! seems_utf8( $filename ) ) {
 		$_ext     = pathinfo( $filename, PATHINFO_EXTENSION );
 		$_name    = pathinfo( $filename, PATHINFO_FILENAME );
 		$filename = sanitize_title_with_dashes( $_name ) . '.' . $_ext;
 	}
 
-	if ( $utf8_pcre ) {
-		$filename = preg_replace( "#\x{00a0}#siu", ' ', $filename );
-	}
+	// Replace non-breaking space with a normal space (U+00A0 === 0xC2 0xA0 in UTF-8).
+	$filename = str_replace( "\xC2\xA0", ' ', $filename );
 
 	/**
 	 * Filters the list of characters to remove from a filename.
@@ -2120,6 +2192,7 @@ function sanitize_file_name( $filename ) {
  * for the {@see 'sanitize_user'} filter.
  *
  * @since 2.0.0
+ * @since {WP_VERSION} Relies on HTML API for decoding character references.
  *
  * @param string $username The username to be sanitized.
  * @param bool   $strict   Optional. If set to true, limits $username to specific characters.
@@ -2761,14 +2834,17 @@ function format_to_edit( $content, $rich_text = false ) {
  * and the size of the number. If the number is large enough, then no zeros will
  * be appended.
  *
+ * @deprecated {WP_VERSION} Use str_pad() instead.
+ *
  * @since 0.71
+ * @since {WP_VERSION} Replaced sprintf() with str_pad().
  *
  * @param int $number     Number to append zeros to if not greater than threshold.
  * @param int $threshold  Digit places number needs to be to not have zeros added.
  * @return string Adds leading zeros to number if needed.
  */
 function zeroise( $number, $threshold ) {
-	return sprintf( '%0' . $threshold . 's', $number );
+	return str_pad( (string) $number, $threshold, '0', STR_PAD_LEFT );
 }
 
 /**
@@ -4048,26 +4124,151 @@ function wp_trim_words( $text, $num_words = 55, $more = null ) {
 		$more = __( '&hellip;' );
 	}
 
-	$original_text = $text;
-	$text          = wp_strip_all_tags( $text );
-	$num_words     = (int) $num_words;
+	if ( str_starts_with( wp_get_word_count_type(), 'characters' ) && is_utf8_charset() ) {
+		$at        = 0;
+		$output    = '';
+		$buffer    = '';
+		$length    = 0;
+		$processor = new WP_HTML_Tag_Processor( $text );
+		while ( $processor->next_token() && $length <= $num_words ) {
+			switch ( $processor->get_token_name() ) {
+				case 'BR':
+					$buffer .= "\n";
+					continue 2;
 
-	if ( str_starts_with( wp_get_word_count_type(), 'characters' ) && preg_match( '/^utf\-?8$/i', get_option( 'blog_charset' ) ) ) {
-		$text = trim( preg_replace( "/[\n\r\t ]+/", ' ', $text ), ' ' );
-		preg_match_all( '/./u', $text, $words_array );
-		$words_array = array_slice( $words_array[0], 0, $num_words + 1 );
-		$sep         = '';
-	} else {
-		$words_array = preg_split( "/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY );
-		$sep         = ' ';
+				case '#text':
+					break;
+
+				default:
+					continue 2;
+			}
+
+			$buffer .= $processor->get_modifiable_text();
+			$end     = strlen( $buffer );
+
+			while ( $at < $end && $length <= $num_words ) {
+				// Skip whitespace.
+				$at += strspn( $buffer, " \t\f\r\n", $at );
+				if ( $at >= $end ) {
+					$at     = 0;
+					$buffer = '';
+					continue 2;
+				}
+
+				$text_span = strcspn( $buffer, " \t\f\r\n", $at );
+				if ( 0 === $text_span ) {
+					continue 2;
+				}
+
+				// Start decoding code points.
+				$state = UTF8_DECODER_ACCEPT;
+				$i_end = $at + $text_span;
+				for ( $i = $at; $i < $i_end; $i++ ) {
+					$state = utf8_decoder_apply_byte( $buffer[ $i ], $state );
+
+					// @todo Skip over invalid UTF-8 bytes and do not copy them.
+					if ( UTF8_DECODER_REJECT === $state ) {
+						$at = $i;
+						continue 2;
+					}
+
+					if ( UTF8_DECODER_ACCEPT !== $state ) {
+						continue;
+					}
+
+					++$length;
+
+					if ( $length === $num_words ) {
+						$output .= substr( $buffer, $at, $i - $at );
+						continue;
+					}
+
+					if ( $length > $num_words ) {
+						$output .= $more;
+						break 3;
+					}
+				}
+			}
+		}
+
+		/**
+		 * Filters the text content after words have been trimmed.
+		 *
+		 * @since 3.3.0
+		 *
+		 * @param string $text          The trimmed text.
+		 * @param int    $num_words     The number of words to trim the text to. Default 55.
+		 * @param string $more          An optional string to append to the end of the trimmed text, e.g. &hellip;.
+		 * @param string $original_text The text before it was trimmed.
+		 */
+		return apply_filters( 'wp_trim_words', $output, $num_words, $more, $text );
 	}
 
-	if ( count( $words_array ) > $num_words ) {
-		array_pop( $words_array );
-		$text = implode( $sep, $words_array );
-		$text = $text . $more;
-	} else {
-		$text = implode( $sep, $words_array );
+	$at         = 0;
+	$output     = '';
+	$buffer     = '';
+	$word_count = 0;
+	$processor  = new WP_HTML_Tag_Processor( $text );
+	while ( $processor->next_token() && $word_count < $num_words ) {
+		switch ( $processor->get_token_name() ) {
+			case 'BR':
+				$buffer .= "\n";
+				continue 2;
+
+			case '#text':
+				break;
+
+			default:
+				continue 2;
+		}
+
+		$buffer .= $processor->get_modifiable_text();
+		$end     = strlen( $buffer );
+
+		while ( $at < $end && $word_count < $num_words ) {
+			// Skip whitespace.
+			$at += strspn( $buffer, " \t\f\r\n", $at );
+			if ( $at >= $end ) {
+				$at     = 0;
+				$buffer = '';
+				continue 2;
+			}
+
+			$word_length = strcspn( $buffer, " \t\f\r\n", $at );
+			if ( 0 === $word_length ) {
+				continue 2;
+			}
+
+			if ( $at + $word_length >= $end ) {
+				$buffer = substr( $buffer, $at );
+				$at     = 0;
+				continue 2;
+			}
+
+			++$word_count;
+			if ( $word_count > 1 ) {
+				$output .= ' ';
+			}
+			$output .= substr( $buffer, $at, $word_length );
+			$at     += $word_length;
+		}
+	}
+
+	// A final word may have crossed the last token boundary.
+	if ( $at < strlen( $buffer ) && $word_count < $num_words ) {
+		$at         += strspn( $buffer, " \t\f\r\n", $at );
+		$word_length = strcspn( $buffer, " \t\f\r\n", $at );
+		if ( $word_length > 0 ) {
+			++$word_count;
+			if ( $word_count > 1 ) {
+				$output .= ' ';
+			}
+			$output .= substr( $buffer, $at, $word_length );
+		}
+	}
+
+	if ( $word_count >= $num_words ) {
+		$output .= $more;
 	}
 
 	/**
@@ -4080,7 +4281,7 @@ function wp_trim_words( $text, $num_words = 55, $more = null ) {
 	 * @param string $more          An optional string to append to the end of the trimmed text, e.g. &hellip;.
 	 * @param string $original_text The text before it was trimmed.
 	 */
-	return apply_filters( 'wp_trim_words', $text, $num_words, $more, $original_text );
+	return apply_filters( 'wp_trim_words', $output, $num_words, $more, $text );
 }
 
 /**
diff --git a/src/wp-includes/functions.php b/src/wp-includes/functions.php
index 32d6739518b00..d74af18275da0 100644
--- a/src/wp-includes/functions.php
+++ b/src/wp-includes/functions.php
@@ -7549,6 +7549,154 @@ function _canonical_charset( $charset ) {
 	return $charset;
 }
 
+if ( ! defined( 'UTF8_DECODER_ACCEPT' ) ) {
+	define( 'UTF8_DECODER_ACCEPT', 0 );
+}
+
+if ( ! defined( 'UTF8_DECODER_REJECT' ) ) {
+	define( 'UTF8_DECODER_REJECT', 1 );
+}
+
+/**
+ * Indicates if a given byte stream represents valid UTF-8.
+ *
+ * Note that unpaired surrogate halves are not valid UTF-8 and will be rejected.
+ *
+ * Example:
+ *
+ *     true  === utf8_is_valid_byte_stream( 'Hello, World! 🌎' );
+ *
+ *     false === utf8_is_valid_byte_stream( "Latin1 is n\xF6t valid UTF-8.", 0, $error_at );
+ *     11    === $error_at;
+ *
+ *     false === utf8_is_valid_byte_stream( "Surrogate halves like '\xED\xA0\x80' are not permitted.", 0, $error_at );
+ *     23    === $error_at;
+ *
+ *     false === utf8_is_valid_byte_stream( "Broken stream: \xC2\xC2", 0, $error_at );
+ *     15    === $error_at;
+ *
+ * @since {WP_VERSION}
+ *
+ * @param string   $bytes               Text to validate as UTF-8 bytes.
+ * @param int      $starting_byte       Optional. Byte offset in string where decoding should begin. Default 0.
+ * @param int|null $first_error_byte_at Optional. If provided and byte stream fails to validate,
+ *                                      will be set to the zero-based byte offset where the first
+ *                                      invalid sequence starts. Otherwise, will not be set.
+ * @return bool Whether the given byte stream represents valid UTF-8.
+ */
+function utf8_is_valid_byte_stream( string $bytes, int $starting_byte = 0, ?int &$first_error_byte_at = null ): bool {
+	$state         = UTF8_DECODER_ACCEPT;
+	$last_start_at = $starting_byte;
+
+	mbstring_binary_safe_encoding();
+
+	for ( $at = $starting_byte, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) {
+		if ( UTF8_DECODER_ACCEPT === $state ) {
+			$last_start_at = $at;
+		}
+
+		$state = utf8_decoder_apply_byte( $bytes[ $at ], $state );
+	}
+
+	reset_mbstring_encoding();
+
+	if ( UTF8_DECODER_ACCEPT === $state ) {
+		return true;
+	} else {
+		$first_error_byte_at = $last_start_at;
+		return false;
+	}
+}
+
+/**
+ * Returns number of code points found within a UTF-8 string, similar to `strlen()`.
+ *
+ * If the byte stream fails to properly decode as UTF-8 this function will set the
+ * byte index of the first error byte and report the number of decoded code points.
+ *
+ * @since {WP_VERSION}
+ *
+ * @param string   $bytes               Text for which to count code points.
+ * @param int|null $first_error_byte_at Optional. If provided, will be set to the byte offset at
+ *                                      which the first invalid or incomplete sequence starts.
+ * @return int How many code points were decoded in the given byte stream before an error
+ *             or before reaching the end of the string.
+ */
+function utf8_code_point_count( string $bytes, ?int &$first_error_byte_at = null ): int {
+	$state         = UTF8_DECODER_ACCEPT;
+	$last_start_at = 0;
+	$count         = 0;
+	$code_point    = 0;
+
+	mbstring_binary_safe_encoding();
+
+	for ( $at = 0, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) {
+		if ( UTF8_DECODER_ACCEPT === $state ) {
+			$last_start_at = $at;
+		}
+
+		$state = utf8_decoder_apply_byte( $bytes[ $at ], $state, $code_point );
+
+		if ( UTF8_DECODER_ACCEPT === $state ) {
+			++$count;
+		}
+	}
+
+	reset_mbstring_encoding();
+
+	if ( UTF8_DECODER_ACCEPT !== $state ) {
+		$first_error_byte_at = $last_start_at;
+	}
+
+	return $count;
+}
+
+/**
+ * Inner loop for a number of UTF-8 decoding-related functions.
+ *
+ * You probably don't need this! This is highly-specific and optimized
+ * code for UTF-8 operations used in other functions.
+ *
+ * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ *
+ * @since {WP_VERSION}
+ *
+ * @access private
+ *
+ * @param string   $byte       Next byte to be applied in UTF-8 decoding or validation.
+ * @param int      $state      UTF-8 decoding state, one of the following values:<br>
+ *                             `UTF8_DECODER_ACCEPT`: Decoder is ready for a new code point.<br>
+ *                             `UTF8_DECODER_REJECT`: An error has occurred.<br>
+ *                             Any other positive value: Decoder is waiting for additional bytes.
+ * @param int|null $code_point Optional. If provided, will accumulate the decoded code point as
+ *                             each byte is processed. If not provided or unable to decode, will
+ *                             not be set, or will be set to invalid and unusable data.
+ * @return int Next decoder state after processing the current byte.
+ */
+function utf8_decoder_apply_byte( string $byte, int $state, int &$code_point = null ): int {
+	/**
+	 * State classification and transition table for UTF-8 validation.
+	 *
+	 * > The first part of the table maps bytes to character classes that
+	 * > to reduce the size of the transition table and create bitmasks.
+	 * >
+	 * > The second part is a transition table that maps a combination
+	 * > of a state of the automaton and a character class to a state.
+	 *
+	 * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+	 */
+	static $state_table = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x0a\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03\x0b\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x00\x0c\x18\x24\x3c\x60\x54\x0c\x0c\x0c\x30\x48\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x00\x0c\x0c\x0c\x0c\x0c\x00\x0c\x00\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x18\x0c\x18\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x24\x0c\x24\x0c\x0c\x0c\x24\x0c\x0c\x0c\x0c\x0c\x24\x0c\x24\x0c\x0c\x0c\x24\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c";
+
+	$byte_value = ord( $byte );
+
+	$classification = ord( $state_table[ $byte_value ] );
+	$code_point     = ( UTF8_DECODER_ACCEPT === $state )
+		? ( ( 0xFF >> $classification ) & $byte_value )
+		: ( ( $byte_value & 0x3F ) | ( $code_point << 6 ) );
+
+	return ord( $state_table[ 256 + $state + $classification ] );
+}
+
 /**
  * Sets the mbstring internal encoding to a binary safe encoding when func_overload
  * is enabled.
diff --git a/tests/phpunit/tests/formatting/utf8UriEncode.php b/tests/phpunit/tests/formatting/utf8UriEncode.php
index cb6f513c8edef..c1598e787af72 100644
--- a/tests/phpunit/tests/formatting/utf8UriEncode.php
+++ b/tests/phpunit/tests/formatting/utf8UriEncode.php
@@ -12,9 +12,25 @@ class Tests_Formatting_Utf8UriEncode extends WP_UnitTestCase {
 	 * are dealt with elsewhere.
 	 *
 	 * @dataProvider data
+	 *
+	 * @param string $utf8       String encoded in UTF-8 bytes.
+	 * @param string $urlencoded Expected percent-escaped form of input text.
 	 */
 	public function test_percent_encodes_non_reserved_characters( $utf8, $urlencoded ) {
-		$this->assertSame( $urlencoded, utf8_uri_encode( $utf8 ) );
+		/**
+		 * Casing of percent-encoding shouldn't matter; upper-case is nominal.
+		 *
+		 * @see https://url.spec.whatwg.org/#percent-encoded-bytes
+		 */
+		$comparable = preg_replace_callback(
+			'~%[A-F0-9]{2}~',
+			static function ( $escaped_match ) {
+				return strtolower( $escaped_match[0] );
+			},
+			utf8_uri_encode( $utf8 )
+		);
+
+		$this->assertSame( $urlencoded, $comparable );
 	}
 
 	/**
diff --git a/tests/phpunit/tests/formatting/wpTrimWords.php b/tests/phpunit/tests/formatting/wpTrimWords.php
index b918fd1642b4b..fb3063ef8f9a3 100644
--- a/tests/phpunit/tests/formatting/wpTrimWords.php
+++ b/tests/phpunit/tests/formatting/wpTrimWords.php
@@ -50,7 +50,7 @@ public function test_strips_script_and_style_content() {
 		$this->assertSame( $trimmed, wp_trim_words( $text ) );
 	}
 
-	public function test_doesnt_trim_short_text() {
+	public function test_does_not_trim_short_text() {
 		$text = 'This is some short text.';
 		$this->assertSame( $text, wp_trim_words( $text ) );
 	}

From e6b7b797583d9279603a0e4e44ae2c8c11b5099e Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Mon, 23 Sep 2024 19:21:21 -0700
Subject: [PATCH 2/3] Fix state table and some decoding logic.

---
 src/wp-includes/functions.php | 95 +++++++++++++++++++++++++++++++----
 1 file changed, 84 insertions(+), 11 deletions(-)

diff --git a/src/wp-includes/functions.php b/src/wp-includes/functions.php
index 23fc505724d31..43fb5849baec6 100644
--- a/src/wp-includes/functions.php
+++ b/src/wp-includes/functions.php
@@ -7670,16 +7670,16 @@ function utf8_code_point_count( string $bytes, int &$first_error_byte_at = null
  * @access private
  *
  * @param string   $byte       Next byte to be applied in UTF-8 decoding or validation.
- * @param int      $state      UTF-8 decoding state, one of the following values:<br>
- *                             `UTF8_DECODER_ACCEPT`: Decoder is ready for a new code point.<br>
- *                             `UTF8_DECODER_REJECT`: An error has occurred.<br>
+ * @param int      $state      UTF-8 decoding state, one of the following values:<br><ul>
+ *                             <li>`UTF8_DECODER_ACCEPT`: Decoder is ready for a new code point.<br>
+ *                             <li>`UTF8_DECODER_REJECT`: An error has occurred.<br>
  *                             Any other positive value: Decoder is waiting for additional bytes.
  * @param int|null $code_point Optional. If provided, will accumulate the decoded code point as
  *                             each byte is processed. If not provided or unable to decode, will
  *                             not be set, or will be set to invalid and unusable data.
  * @return int Next decoder state after processing the current byte.
  */
-function utf8_decoder_apply_byte( string $byte, int $state, int &$code_point = null ): int {
+function utf8_decoder_apply_byte( string $byte, int $state, int &$code_point = 0 ): int {
 	/**
 	 * State classification and transition table for UTF-8 validation.
 	 *
@@ -7691,16 +7691,89 @@ function utf8_decoder_apply_byte( string $byte, int $state, int &$code_point = n
 	 *
 	 * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 	 */
-	static $state_table = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x0a\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03\x0b\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x00\x0c\x18\x24\x3c\x60\x54\x0c\x0c\x0c\x30\x48\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x00\x0c\x0c\x0c\x0c\x0c\x00\x0c\x00\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x18\x0c\x18\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x18\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x24\x0c\x24\x0c\x0c\x0c\x24\x0c\x0c\x0c\x0c\x0c\x24\x0c\x24\x0c\x0c\x0c\x24\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c";
+	static $state_table = (
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" . // Byte classes for 0x00..0x1F.
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" . // 0x20..0x3F.
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" . // 0x40..0x5F.
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" . // 0x60..0x7F.
+		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09" . // 0x80..0x9F.
+		"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" . // 0xA0..0xBF.
+		"\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02" . // 0xC0..0xDF.
+		"\x0a\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03" . // 0xE0..0xEF: 0xE0 is class 0x0a (ten), which must stay below 16 to index a transition row.
+		"\x0b\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" . // 0xF0..0xFF: 0xF0 is class 0x0b (eleven).
+		"\x00\x01\x02\x03\x05\x08\x07\x01\x01\x01\x04\x06\x01\x01\x01\x01" . // Transitions out of state 0, indexed by byte class.
+		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x01\x01\x01\x01\x01\x00\x01\x00\x01\x01\x01\x01\x01\x01" . // States 1..2; every class keeps state 1 in state 1.
+		"\x01\x02\x01\x01\x01\x01\x01\x02\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01" . // States 3..4.
+		"\x01\x02\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01" . // States 5..6.
+		"\x01\x03\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01\x01\x03\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01" // States 7..8.
+	);
+
+	$byte       = ord( $byte );
+	$type       = ord( $state_table[ $byte ] ); // Byte class, read from the first 256 entries.
+	$code_point = ( UTF8_DECODER_ACCEPT === $state )
+		? ( ( 0xFF >> $type ) & $byte ) // First byte of a code point: keep only its low bits.
+		: ( ( $byte & 0x3F ) | ( $code_point << 6 ) ); // Later byte: shift in its six low bits.
+
+	return ord( $state_table[ 256 + ( $state * 16 ) + $type ] ); // Rows of 16 transitions follow the class entries.
+}
+
+/**
+ * Extracts a slice of a text by code point, where each run of invalid bytes
+ * counts as a single code point, U+FFFD (the Unicode replacement character `�`).
+ *
+ * Negative values are not supported: the original text is returned if given one.
+ *
+ * @param string   $text   Input text from which to extract.
+ * @param int      $from   Optional. Start extracting after this many code points. Default 0.
+ * @param int|null $length Optional. Extract at most this many code points. Default null (to the end).
+ * @return string Extracted slice of input string.
+ */
+function utf8_substr( string $text, int $from = 0, int $length = null ): string {
+	if ( $from < 0 || ( isset( $length ) && $length < 0 ) ) {
+		return $text;
+	}
 
-	$byte_value = ord( $byte );
+	$position_in_input = 0;
+	$code_point_at     = 0;
+	$end_byte          = strlen( $text );
+	$buffer            = '';
+	$seen_code_points  = 0;
+	$stop_at           = isset( $length ) ? $from + $length : PHP_INT_MAX;
+	$decoder_state     = UTF8_DECODER_ACCEPT;
 
-	$classification = ord( $state_table[ $byte_value ] );
-	$code_point     = ( UTF8_DECODER_ACCEPT === $state )
-		? ( ( 0xFF >> $classification ) & $byte_value )
-		: ( ( $byte_value & 0x3F ) | ( $code_point << 6 ) );
+	// Decode until the input ends or the last requested code point is reached.
+	while ( $position_in_input < $end_byte && $seen_code_points < $stop_at ) {
+		$decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );
+		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
+			++$position_in_input;
+			$chunk = substr( $text, $code_point_at, $position_in_input - $code_point_at );
+		} elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
+			$chunk = "\u{FFFD}";
+			// Re-examine a byte rejected mid-sequence; then skip bytes that cannot start a sequence.
+			$position_in_input = max( $position_in_input, $code_point_at + 1 );
+			while ( $position_in_input < $end_byte && UTF8_DECODER_REJECT === utf8_decoder_apply_byte( $text[ $position_in_input ], UTF8_DECODER_ACCEPT ) ) {
+				++$position_in_input;
+			}
+			$decoder_state = UTF8_DECODER_ACCEPT;
+		} else {
+			++$position_in_input;
+			continue;
+		}
+
+		if ( $seen_code_points >= $from ) {
+			$buffer .= $chunk;
+		}
+
+		++$seen_code_points;
+		$code_point_at = $position_in_input;
+	}
 
-	return ord( $state_table[ 256 + $state + $classification ] );
+	// A sequence cut short by the end of the input is invalid as well.
+	if ( $code_point_at < $position_in_input && $seen_code_points >= $from ) {
+		$buffer .= "\u{FFFD}";
+	}
+
+	return $buffer;
 }
 
 /**

From 144df5a3847b7655a1d5ff6a99375193b0bd697a Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Mon, 23 Sep 2024 19:47:51 -0700
Subject: [PATCH 3/3] Play with decoding.

---
 src/wp-includes/formatting.php | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 36c8a9394b08c..122b4665566bc 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -4166,10 +4166,17 @@ function wp_trim_words( $text, $num_words = 55, $more = null ) {
 				for ( $i = $at; $i < $i_end; $i++ ) {
 					$state = utf8_decoder_apply_byte( $buffer[ $i ], $state );
 
-					// @todo Skip over invalid UTF-8 bytes and do not copy them.
+					// Replace sequence of invalid bytes as U+FFFD `�`.
 					if ( UTF8_DECODER_REJECT === $state ) {
-						$at = $i;
-						continue 2;
+						$output .= "\u{FFFD}";
+
+						// Skip to the start of the next code point.
+						while ( UTF8_DECODER_REJECT === $state && $i < $i_end ) {
+							$state = utf8_decoder_apply_byte( $text[ ++$i ], UTF8_DECODER_ACCEPT );
+						}
+
+						$at = --$i;
+						continue;
 					}
 
 					if ( UTF8_DECODER_ACCEPT !== $state ) {
@@ -4179,7 +4186,7 @@ function wp_trim_words( $text, $num_words = 55, $more = null ) {
 					++$length;
 
 					if ( $length === $num_words ) {
-						$output .= substr( $buffer, $at, $i - $at );
+						$output .= substr( $buffer, $at, $i - $at  + 1 );
 						continue;
 					}