From 81d01567eb4f07d849562b0005a298d8fccd5841 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 17 Jul 2024 09:56:13 -0700 Subject: [PATCH 1/2] Fix: Provide `_is_utf8_charset()` in `compat.php` for early use. When `is_utf8_charset()` was introduced, the `mb_strlen()` and `mb_substr()` compat functions were modified to call it, but they are defined in `compat.php` before `is_utf8_charset()` is defined in `functions.php`. Certain code calling these compat functions early in the boot process before `functions.php` is included and on hosts without the multi-byte extension would thus crash. In this patch the `is_utf8_charset()` function is split into pure and stateful components. The pure version is recreated as `_is_utf8_charset()` and defined in `compat.php` while the existing function (which defaults to calling `get_option( 'blog_charset' )`) is left in place in `functions.php`. This ensures that code calling it will be able to call a form of the function even in early sequences. Follow-up to [58169]. Props dmsnell, donncha, hellofromTonya, jeherve, slyall, spacedmonkey. Fixes #61680. --- src/wp-includes/compat.php | 38 +++++++++++++++++++++++++++++++++-- src/wp-includes/functions.php | 15 +------------- 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php index c50fc69a047e1..796aacd5a5e5f 100644 --- a/src/wp-includes/compat.php +++ b/src/wp-includes/compat.php @@ -40,6 +40,40 @@ function _wp_can_use_pcre_u( $set = null ) { return $utf8_pcre; } +/** + * Indicates if a given slug for a character set represents the UTF-8 text encoding. + * + * A charset is considered to represent UTF-8 if it is a case-insensitive match + * of "UTF-8" with or without the hyphen. + * + * Example: + * + * true === _is_utf8_charset( 'UTF-8' ); + * true === _is_utf8_charset( 'utf8' ); + * false === _is_utf8_charset( 'latin1' ); + * false === _is_utf8_charset( 'UTF 8' ); + * + * // Only strings match. 
+ * false === _is_utf8_charset( [ 'charset' => 'utf-8' ] ); + * + * @since 6.6.1 + * + * @param string $charset_slug Slug representing a text character encoding, or "charset". + * E.g. "UTF-8", "Windows-1252", "ISO-8859-1", "SJIS". + * + * @return bool Whether the slug represents the UTF-8 encoding. + */ +function _is_utf8_charset( $charset_slug ) { + if ( ! is_string( $charset_slug ) ) { + return false; + } + + return ( + 0 === strcasecmp( 'UTF-8', $charset_slug ) || + 0 === strcasecmp( 'UTF8', $charset_slug ) + ); +} + if ( ! function_exists( 'mb_substr' ) ) : /** * Compat function to mimic mb_substr(). @@ -91,7 +125,7 @@ function _mb_substr( $str, $start, $length = null, $encoding = null ) { * The solution below works only for UTF-8, so in case of a different * charset just use built-in substr(). */ - if ( ! is_utf8_charset( $encoding ) ) { + if ( ! _is_utf8_charset( $encoding ) ) { return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length ); } @@ -176,7 +210,7 @@ function _mb_strlen( $str, $encoding = null ) { * The solution below works only for UTF-8, so in case of a different charset * just use built-in strlen(). */ - if ( ! is_utf8_charset( $encoding ) ) { + if ( ! _is_utf8_charset( $encoding ) ) { return strlen( $str ); } diff --git a/src/wp-includes/functions.php b/src/wp-includes/functions.php index 94155249fef0d..20d288e378b68 100644 --- a/src/wp-includes/functions.php +++ b/src/wp-includes/functions.php @@ -7503,20 +7503,7 @@ function get_tag_regex( $tag ) { * @return bool Whether the slug represents the UTF-8 encoding. */ function is_utf8_charset( $blog_charset = null ) { - $charset_to_examine = $blog_charset ?? get_option( 'blog_charset' ); - - /* - * Only valid string values count: the absence of a charset - * does not imply any charset, let alone UTF-8. - */ - if ( ! 
is_string( $charset_to_examine ) ) { - return false; - } - - return ( - 0 === strcasecmp( 'UTF-8', $charset_to_examine ) || - 0 === strcasecmp( 'UTF8', $charset_to_examine ) - ); + return _is_utf8_charset( $blog_charset ?? get_option( 'blog_charset' ) ); } /** From 0de0a8173574799aef8093631a0ab256780f2b9d Mon Sep 17 00:00:00 2001 From: Aaron Jorbin Date: Thu, 18 Jul 2024 13:41:49 -0400 Subject: [PATCH 2/2] Update documentation to make clearer which function should be used where. --- src/wp-includes/compat.php | 3 +++ src/wp-includes/functions.php | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php index 796aacd5a5e5f..900a7994a1eae 100644 --- a/src/wp-includes/compat.php +++ b/src/wp-includes/compat.php @@ -56,6 +56,9 @@ function _wp_can_use_pcre_u( $set = null ) { * // Only strings match. * false === _is_utf8_charset( [ 'charset' => 'utf-8' ] ); * + * `is_utf8_charset` should be used outside of this file. + * + * @ignore * @since 6.6.1 * * @param string $charset_slug Slug representing a text character encoding, or "charset". diff --git a/src/wp-includes/functions.php b/src/wp-includes/functions.php index 20d288e378b68..e821f6f2b08be 100644 --- a/src/wp-includes/functions.php +++ b/src/wp-includes/functions.php @@ -7496,6 +7496,9 @@ function get_tag_regex( $tag ) { * $is_utf8 = is_utf8_charset(); * * @since 6.6.0 + * @since 6.6.1 A wrapper for _is_utf8_charset + * + * @see _is_utf8_charset * * @param string|null $blog_charset Optional. Slug representing a text character encoding, or "charset". * E.g. "UTF-8", "Windows-1252", "ISO-8859-1", "SJIS".