Skip to content

Commit

Permalink
fixup! WIP: Rely on Unicode processing within a RegExp pattern for title sanitization.
Browse files Browse the repository at this point in the history
  • Loading branch information
dmsnell committed Jul 1, 2024
1 parent 54d8afe commit 3b332b3
Showing 1 changed file with 87 additions and 26 deletions.
113 changes: 87 additions & 26 deletions src/wp-includes/formatting.php
Original file line number Diff line number Diff line change
Expand Up @@ -2712,50 +2712,111 @@ function slugify( string $sluggee ): string {
*/
function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'display' ) {
$title = strip_tags( $title );
// Preserve escaped octets.
$title = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $title );
// Remove percent signs that are not part of an octet.
$title = str_replace( '%', '', $title );
// Restore octets.
$title = preg_replace( '|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title );

if ( seems_utf8( $title ) ) {
if ( function_exists( 'mb_strtolower' ) ) {
$title = mb_strtolower( $title, 'UTF-8' );
}

$title = utf8_uri_encode( $title, 200 );
}

$title = strtolower( $title );

if ( 'save' === $context ) {
$title = html_entity_decode( $title, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 );
$title = rawurldecode( $title );
// Dash/hyphen symbols plus whitespace turn into a dash.
$title = preg_replace( '~[\p{Pd}\s/]+~u', '-', $title );
// Control characters, combining marks, symbols, punctuation, and invisible characters are removed.
$title = preg_replace_callback(
'~-[_-]+|[\p{C}\p{M}\p{S}\p{P}\p{Z}]+~u',
static function ( $chunk ) {
switch ( $chunk[0] ) {
case '-':
return '-';

case '_':
return '_';

default:
return '';
}
},
$title
);
// Convert &nbsp, &ndash, and &mdash to hyphens.
$title = str_replace( array( '%c2%a0', '%e2%80%93', '%e2%80%94' ), '-', $title );
// Convert &nbsp, &ndash, and &mdash HTML entities to hyphens.
$title = str_replace( array( ' ', ' ', '–', '–', '—', '—' ), '-', $title );
// Convert forward slash to hyphen.
$title = str_replace( '/', '-', $title );

// Strip these characters entirely.
$title = str_replace(
array(
"\xD7", // Replace multiplication sign "×" with "x".
'', // Remove invalid decoded characters.
// Soft hyphens.
'%c2%ad',
// &iexcl and &iquest.
'%c2%a1',
'%c2%bf',
// Angle quotes.
'%c2%ab',
'%c2%bb',
'%e2%80%b9',
'%e2%80%ba',
// Curly quotes.
'%e2%80%98',
'%e2%80%99',
'%e2%80%9c',
'%e2%80%9d',
'%e2%80%9a',
'%e2%80%9b',
'%e2%80%9e',
'%e2%80%9f',
// Bullet.
'%e2%80%a2',
// &copy, &reg, &deg, &hellip, and &trade.
'%c2%a9',
'%c2%ae',
'%c2%b0',
'%e2%80%a6',
'%e2%84%a2',
// Acute accents.
'%c2%b4',
'%cb%8a',
'%cc%81',
'%cd%81',
// Grave accent, macron, caron.
'%cc%80',
'%cc%84',
'%cc%8c',
// Non-visible characters that display without a width.
'%e2%80%8b', // Zero width space.
'%e2%80%8c', // Zero width non-joiner.
'%e2%80%8d', // Zero width joiner.
'%e2%80%8e', // Left-to-right mark.
'%e2%80%8f', // Right-to-left mark.
'%e2%80%aa', // Left-to-right embedding.
'%e2%80%ab', // Right-to-left embedding.
'%e2%80%ac', // Pop directional formatting.
'%e2%80%ad', // Left-to-right override.
'%e2%80%ae', // Right-to-left override.
'%ef%bb%bf', // Byte order mark.
'%ef%bf%bc', // Object replacement character.
),
'',
$title
);

// Convert non-visible characters that display with a width to hyphen.
$title = str_replace(
array(
'x',
'',
'%e2%80%80', // En quad.
'%e2%80%81', // Em quad.
'%e2%80%82', // En space.
'%e2%80%83', // Em space.
'%e2%80%84', // Three-per-em space.
'%e2%80%85', // Four-per-em space.
'%e2%80%86', // Six-per-em space.
'%e2%80%87', // Figure space.
'%e2%80%88', // Punctuation space.
'%e2%80%89', // Thin space.
'%e2%80%8a', // Hair space.
'%e2%80%a8', // Line separator.
'%e2%80%a9', // Paragraph separator.
'%e2%80%af', // Narrow no-break space.
),
'-',
$title
);

// Convert &times to 'x'.
$title = str_replace( '%c3%97', 'x', $title );
}

// Remove HTML entities.
Expand Down

0 comments on commit 3b332b3

Please sign in to comment.