Skip to content

Commit

Permalink
Escape invalid UTF-8 in strings
Browse files: browse the repository at this point in the history
To be friendlier to tooling that expects PHP files to be UTF-8
encoded, escape any sequences that are not legal under UTF-8.
  • Loading branch information
nikic committed Apr 25, 2021
1 parent 6b409b9 commit 33d7c8d
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
20 changes: 18 additions & 2 deletions lib/PhpParser/PrettyPrinter/Standard.php
Original file line number Diff line number Diff line change
Expand Up @@ -998,8 +998,24 @@ protected function escapeString($string, $quote) {
$escaped = addcslashes($string, "\n\r\t\f\v$" . $quote . "\\");
}

// Escape other control characters
return preg_replace_callback('/[\x00-\x08\x0e-\x1f]/', function ($matches) {
// Escape control characters and non-UTF-8 characters.
// Regex taken from https://stackoverflow.com/a/11709412/385378.
$regex = '/(
[\x00-\x08\x0E-\x1F] # Control characters
| [\xC0-\xC1] # Invalid UTF-8 Bytes
| [\xF5-\xFF] # Invalid UTF-8 Bytes
| \xE0[\x80-\x9F] # Overlong encoding of prior code point
| \xF0[\x80-\x8F] # Overlong encoding of prior code point
| [\xC2-\xDF](?![\x80-\xBF]) # Invalid UTF-8 Sequence Start
| [\xE0-\xEF](?![\x80-\xBF]{2}) # Invalid UTF-8 Sequence Start
| [\xF0-\xF4](?![\x80-\xBF]{3}) # Invalid UTF-8 Sequence Start
| (?<=[\x00-\x7F\xF5-\xFF])[\x80-\xBF] # Invalid UTF-8 Sequence Middle
| (?<![\xC2-\xDF]|[\xE0-\xEF]|[\xE0-\xEF][\x80-\xBF]|[\xF0-\xF4]|[\xF0-\xF4][\x80-\xBF]|[\xF0-\xF4][\x80-\xBF]{2})[\x80-\xBF] # Overlong Sequence
| (?<=[\xE0-\xEF])[\x80-\xBF](?![\x80-\xBF]) # Short 3 byte sequence
| (?<=[\xF0-\xF4])[\x80-\xBF](?![\x80-\xBF]{2}) # Short 4 byte sequence
| (?<=[\xF0-\xF4][\x80-\xBF])[\x80-\xBF](?![\x80-\xBF]) # Short 4 byte sequence (2)
)/x';
return preg_replace_callback($regex, function ($matches) {
$hex = dechex(ord($matches[0]));;
return '\\x' . str_pad($hex, 2, '0', \STR_PAD_LEFT);
}, $escaped);
Expand Down
4 changes: 4 additions & 0 deletions test/code/prettyPrinter/expr/stringEscaping.test
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ Escape sequences in double-quoted strings
"@@{ implode(range("\0", "\37")) }@@";
"\0000\0001";
"äöü";
"\xc0\x80";
"\xd0\x01";

<<<DOC
\n\r\t\f\v\$\"\\
Expand All @@ -18,6 +20,8 @@ DOC;
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f";
"\x000\x001";
"äöü";
"\xc0\x80";
"\xd0\x01";
<<<DOC
@@{ "\n\r" }@@\t\f\v\$\\"\\
\x00\x01\x02\x03\x04\x05\x06\x07\x08\t@@{ "\n" }@@\v\f@@{ "\r" }@@\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f
Expand Down

0 comments on commit 33d7c8d

Please sign in to comment.