From 14c1cc3585ff9509344db504a07b2748f5a15afc Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Tue, 26 Dec 2023 11:47:20 +0800 Subject: [PATCH] Normalise Hex and unicode escape sequences in string --- .../test/fixtures/ruff/expression/bytes.py | 2 + .../test/fixtures/ruff/expression/string.py | 5 + .../src/other/bytes_literal.rs | 2 + .../src/other/f_string.rs | 2 + .../src/other/string_literal.rs | 2 + crates/ruff_python_formatter/src/preview.rs | 5 + .../ruff_python_formatter/src/string/mod.rs | 218 ++++++++++++++++-- ..._preview_format_unicode_escape_seq.py.snap | 97 -------- .../format@expression__bytes.py.snap | 6 + .../format@expression__string.py.snap | 37 +++ 10 files changed, 263 insertions(+), 113 deletions(-) delete mode 100644 crates/ruff_python_formatter/tests/snapshots/black_compatibility@cases__preview_format_unicode_escape_seq.py.snap diff --git a/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/bytes.py b/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/bytes.py index f505c5945a805..6cbb196a3fff9 100644 --- a/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/bytes.py +++ b/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/bytes.py @@ -118,3 +118,5 @@ b'c' ) } + +b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}" diff --git a/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/string.py b/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/string.py index 3988b6ab85f7c..ce01296d5f7e7 100644 --- a/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/string.py +++ b/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/string.py @@ -133,3 +133,8 @@ # https://github.com/astral-sh/ruff/issues/7460 trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"'''] + +a = f"""\x1F""" +a = """\x1F""" +a = """\\x1F""" +a = """\\\x1F""" diff --git 
a/crates/ruff_python_formatter/src/other/bytes_literal.rs b/crates/ruff_python_formatter/src/other/bytes_literal.rs index c6445c8d6adc3..542928ced38ce 100644 --- a/crates/ruff_python_formatter/src/other/bytes_literal.rs +++ b/crates/ruff_python_formatter/src/other/bytes_literal.rs @@ -2,6 +2,7 @@ use ruff_python_ast::BytesLiteral; use ruff_text_size::Ranged; use crate::prelude::*; +use crate::preview::is_hex_codes_in_unicode_sequences_enabled; use crate::string::{Quoting, StringPart}; #[derive(Default)] @@ -17,6 +18,7 @@ impl FormatNodeRule for FormatBytesLiteral { &locator, f.options().quote_style(), f.context().docstring(), + is_hex_codes_in_unicode_sequences_enabled(f.context()), ) .fmt(f) } diff --git a/crates/ruff_python_formatter/src/other/f_string.rs b/crates/ruff_python_formatter/src/other/f_string.rs index da81162c2ef54..c3e8ac4ebfc4d 100644 --- a/crates/ruff_python_formatter/src/other/f_string.rs +++ b/crates/ruff_python_formatter/src/other/f_string.rs @@ -2,6 +2,7 @@ use ruff_python_ast::FString; use ruff_text_size::Ranged; use crate::prelude::*; +use crate::preview::is_hex_codes_in_unicode_sequences_enabled; use crate::string::{Quoting, StringPart}; /// Formats an f-string which is part of a larger f-string expression. 
@@ -31,6 +32,7 @@ impl Format> for FormatFString<'_> { &locator, f.options().quote_style(), f.context().docstring(), + is_hex_codes_in_unicode_sequences_enabled(f.context()), ) .fmt(f); diff --git a/crates/ruff_python_formatter/src/other/string_literal.rs b/crates/ruff_python_formatter/src/other/string_literal.rs index e23db85707830..3071f37098692 100644 --- a/crates/ruff_python_formatter/src/other/string_literal.rs +++ b/crates/ruff_python_formatter/src/other/string_literal.rs @@ -2,6 +2,7 @@ use ruff_python_ast::StringLiteral; use ruff_text_size::Ranged; use crate::prelude::*; +use crate::preview::is_hex_codes_in_unicode_sequences_enabled; use crate::string::{docstring, Quoting, StringPart}; use crate::QuoteStyle; @@ -61,6 +62,7 @@ impl Format> for FormatStringLiteral<'_> { &locator, quote_style, f.context().docstring(), + is_hex_codes_in_unicode_sequences_enabled(f.context()), ); if self.layout.is_docstring() { diff --git a/crates/ruff_python_formatter/src/preview.rs b/crates/ruff_python_formatter/src/preview.rs index e34343e582c00..1e2b8ae36bff6 100644 --- a/crates/ruff_python_formatter/src/preview.rs +++ b/crates/ruff_python_formatter/src/preview.rs @@ -57,3 +57,8 @@ pub(crate) const fn is_module_docstring_newlines_enabled(context: &PyFormatConte pub(crate) const fn is_dummy_implementations_enabled(context: &PyFormatContext) -> bool { context.is_preview() } + +/// Returns `true` if the [`hex_codes_in_unicode_sequences`](https://github.com/psf/black/pull/2916) preview style is enabled. 
+pub(crate) const fn is_hex_codes_in_unicode_sequences_enabled(context: &PyFormatContext) -> bool { + context.is_preview() +} diff --git a/crates/ruff_python_formatter/src/string/mod.rs b/crates/ruff_python_formatter/src/string/mod.rs index 57c11cd622900..f74e8002077d3 100644 --- a/crates/ruff_python_formatter/src/string/mod.rs +++ b/crates/ruff_python_formatter/src/string/mod.rs @@ -253,6 +253,7 @@ impl StringPart { locator: &'a Locator, configured_style: QuoteStyle, parent_docstring_quote_char: Option, + normalize_hex: bool, ) -> NormalizedString<'a> { // Per PEP 8, always prefer double quotes for triple-quoted strings. let preferred_style = if self.quotes.triple { @@ -310,7 +311,7 @@ impl StringPart { configured_style }; - let raw_content = locator.slice(self.content_range); + let raw_content = &locator.slice(self.content_range); let quotes = match quoting { Quoting::Preserve => self.quotes, @@ -327,7 +328,7 @@ impl StringPart { } }; - let normalized = normalize_string(locator.slice(self.content_range), quotes, self.prefix); + let normalized = normalize_string(raw_content, quotes, self.prefix, normalize_hex); NormalizedString { prefix: self.prefix, @@ -423,6 +424,10 @@ impl StringPrefix { pub(super) const fn is_fstring(self) -> bool { self.contains(StringPrefix::F_STRING) } + + pub(super) const fn is_byte(self) -> bool { + self.contains(StringPrefix::BYTE) + } } impl Format> for StringPrefix { @@ -722,7 +727,12 @@ impl TryFrom for QuoteChar { /// with the provided [`StringQuotes`] style. /// /// Returns the normalized string and whether it contains new lines. -fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> Cow { +fn normalize_string( + input: &str, + quotes: StringQuotes, + prefix: StringPrefix, + normalize_hex: bool, +) -> Cow { // The normalized string if `input` is not yet normalized. // `output` must remain empty if `input` is already normalized. 
let mut output = String::new(); @@ -766,24 +776,50 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> } last_index = index + '\r'.len_utf8(); - } else if !quotes.triple && !is_raw { + } else if !is_raw { if c == '\\' { - if let Some((_, next)) = chars.peek().copied() { - #[allow(clippy::if_same_then_else)] - if next == opposite_quote && formatted_value_nesting == 0 { - // Remove the escape by ending before the backslash and starting again with the quote - chars.next(); - output.push_str(&input[last_index..index]); - last_index = index + '\\'.len_utf8(); - } else if next == preferred_quote { - // Quote is already escaped, skip over it. - chars.next(); - } else if next == '\\' { + if let Some((_, next)) = chars.clone().next() { + if next == '\\' { // Skip over escaped backslashes chars.next(); + } else if normalize_hex { + if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte()) + .and_then(|escape| { + escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..]) + }) + { + // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x` | `N`) + let escape_start_len = '\\'.len_utf8() + next.len_utf8(); + let escape_start_offset = index + escape_start_len; + if let Cow::Owned(normalised) = &normalised { + output.push_str(&input[last_index..escape_start_offset]); + output.push_str(normalised); + last_index = escape_start_offset + normalised.len(); + }; + + // Move the `chars` iterator past the escape sequence. + // Simply reassigning `chars` doesn't work because the indices would + // then be off.
+ for _ in 0..next.len_utf8() + normalised.len() { + chars.next(); + } + } + } + + if !quotes.triple { + #[allow(clippy::if_same_then_else)] + if next == opposite_quote && formatted_value_nesting == 0 { + // Remove the escape by ending before the backslash and starting again with the quote + chars.next(); + output.push_str(&input[last_index..index]); + last_index = index + '\\'.len_utf8(); + } else if next == preferred_quote { + // Quote is already escaped, skip over it. + chars.next(); + } } } - } else if c == preferred_quote && formatted_value_nesting == 0 { + } else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 { // Escape the quote output.push_str(&input[last_index..index]); output.push('\\'); @@ -802,3 +838,153 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> normalized } + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum UnicodeEscape { + /// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters. + Hex(usize), + + /// An escaped unicode name (`\N{name}`) + CharacterName, +} + +impl UnicodeEscape { + fn new(first: char, allow_unicode: bool) -> Option { + Some(match first { + 'x' => UnicodeEscape::Hex(2), + 'u' if allow_unicode => UnicodeEscape::Hex(4), + 'U' if allow_unicode => UnicodeEscape::Hex(8), + 'N' if allow_unicode => UnicodeEscape::CharacterName, + _ => return None, + }) + } + + /// Normalises `\u..`, `\U..`, `\x..` and `\N{..}` escape sequences to: + /// + /// * `\u`, `\U` and `\x`: To use lower case for the characters `a-f`. + /// * `\N`: To use uppercase letters + fn normalize(self, input: &str) -> Option> { + let mut normalised = String::new(); + + let len = match self { + UnicodeEscape::Hex(len) => { + // It's not a valid escape sequence if the input string has fewer characters + // left than required by the escape sequence.
+ if input.len() < len { + return None; + } + + for (index, c) in input.char_indices().take(len) { + match c { + '0'..='9' | 'a'..='f' => { + if !normalised.is_empty() { + normalised.push(c); + } + } + 'A'..='F' => { + if normalised.is_empty() { + normalised.reserve(len); + normalised.push_str(&input[..index]); + normalised.push(c.to_ascii_lowercase()); + } else { + normalised.push(c.to_ascii_lowercase()); + } + } + _ => { + // not a valid escape sequence + return None; + } + } + } + + len + } + UnicodeEscape::CharacterName => { + let mut char_indices = input.char_indices(); + + if !matches!(char_indices.next(), Some((_, '{'))) { + return None; + } + + loop { + if let Some((index, c)) = char_indices.next() { + match c { + '}' => { + if !normalised.is_empty() { + normalised.push('}'); + } + + // Name must be at least two characters long. + if index < 3 { + return None; + } + + break index + '}'.len_utf8(); + } + '0'..='9' | 'A'..='Z' | ' ' | '-' => { + if !normalised.is_empty() { + normalised.push(c); + } + } + 'a'..='z' => { + if normalised.is_empty() { + normalised.reserve(c.len_utf8() + '}'.len_utf8()); + normalised.push_str(&input[..index]); + normalised.push(c.to_ascii_uppercase()); + } else { + normalised.push(c.to_ascii_uppercase()); + } + } + _ => { + // Seems like an invalid escape sequence, don't normalise it. + return None; + } + } + } else { + // Unterminated escape sequence, don't normalise it.
+ return None; + } + } + } + }; + + Some(if normalised.is_empty() { + Cow::Borrowed(&input[..len]) + } else { + Cow::Owned(normalised) + }) + } +} + +#[cfg(test)] +mod tests { + use crate::string::{normalize_string, QuoteChar, StringPrefix, StringQuotes, UnicodeEscape}; + use std::borrow::Cow; + + #[test] + fn normalize_32_escape() { + let escape_sequence = UnicodeEscape::new('U', true).unwrap(); + + assert_eq!( + Some(Cow::Owned("0001f60e".to_string())), + escape_sequence.normalize("0001F60E") + ); + } + + #[test] + fn normalize_hex_in_byte_string() { + let input = r"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"; + + let normalized = normalize_string( + input, + StringQuotes { + triple: false, + quote_char: QuoteChar::Double, + }, + StringPrefix::BYTE, + true, + ); + + assert_eq!(r"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", &normalized); + } +} diff --git a/crates/ruff_python_formatter/tests/snapshots/black_compatibility@cases__preview_format_unicode_escape_seq.py.snap b/crates/ruff_python_formatter/tests/snapshots/black_compatibility@cases__preview_format_unicode_escape_seq.py.snap deleted file mode 100644 index c523ecc557c70..0000000000000 --- a/crates/ruff_python_formatter/tests/snapshots/black_compatibility@cases__preview_format_unicode_escape_seq.py.snap +++ /dev/null @@ -1,97 +0,0 @@ ---- -source: crates/ruff_python_formatter/tests/fixtures.rs -input_file: crates/ruff_python_formatter/resources/test/fixtures/black/cases/preview_format_unicode_escape_seq.py ---- -## Input - -```python -x = "\x1F" -x = "\\x1B" -x = "\\\x1B" -x = "\U0001F60E" -x = "\u0001F60E" -x = r"\u0001F60E" -x = "don't format me" -x = "\xA3" -x = "\u2717" -x = "\uFaCe" -x = "\N{ox}\N{OX}" -x = "\N{lAtIn smaLL letteR x}" -x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}" -x = b"\x1Fdon't byte" -x = rb"\x1Fdon't format" -``` - -## Black Differences - -```diff ---- Black -+++ Ruff -@@ -1,15 +1,15 @@ --x = "\x1f" -+x = "\x1F" - x = "\\x1B" --x = "\\\x1b" --x = "\U0001f60e" -+x = "\\\x1B" -+x = "\U0001F60E" 
- x = "\u0001F60E" - x = r"\u0001F60E" - x = "don't format me" --x = "\xa3" -+x = "\xA3" - x = "\u2717" --x = "\uface" --x = "\N{OX}\N{OX}" --x = "\N{LATIN SMALL LETTER X}" --x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" --x = b"\x1fdon't byte" -+x = "\uFaCe" -+x = "\N{ox}\N{OX}" -+x = "\N{lAtIn smaLL letteR x}" -+x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}" -+x = b"\x1Fdon't byte" - x = rb"\x1Fdon't format" -``` - -## Ruff Output - -```python -x = "\x1F" -x = "\\x1B" -x = "\\\x1B" -x = "\U0001F60E" -x = "\u0001F60E" -x = r"\u0001F60E" -x = "don't format me" -x = "\xA3" -x = "\u2717" -x = "\uFaCe" -x = "\N{ox}\N{OX}" -x = "\N{lAtIn smaLL letteR x}" -x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}" -x = b"\x1Fdon't byte" -x = rb"\x1Fdon't format" -``` - -## Black Output - -```python -x = "\x1f" -x = "\\x1B" -x = "\\\x1b" -x = "\U0001f60e" -x = "\u0001F60E" -x = r"\u0001F60E" -x = "don't format me" -x = "\xa3" -x = "\u2717" -x = "\uface" -x = "\N{OX}\N{OX}" -x = "\N{LATIN SMALL LETTER X}" -x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" -x = b"\x1fdon't byte" -x = rb"\x1Fdon't format" -``` - - diff --git a/crates/ruff_python_formatter/tests/snapshots/format@expression__bytes.py.snap b/crates/ruff_python_formatter/tests/snapshots/format@expression__bytes.py.snap index b6c249c609557..06b5d4775be28 100644 --- a/crates/ruff_python_formatter/tests/snapshots/format@expression__bytes.py.snap +++ b/crates/ruff_python_formatter/tests/snapshots/format@expression__bytes.py.snap @@ -124,6 +124,8 @@ test_particular = [ b'c' ) } + +b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}" ``` ## Outputs @@ -277,6 +279,8 @@ test_particular = [ # Parenthesized string continuation with messed up indentation {"key": ([], b"a" b"b" b"c")} + +b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}" ``` @@ -430,6 +434,8 @@ test_particular = [ # Parenthesized string continuation with messed up indentation 
{'key': ([], b'a' b'b' b'c')} + +b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}" ``` diff --git a/crates/ruff_python_formatter/tests/snapshots/format@expression__string.py.snap b/crates/ruff_python_formatter/tests/snapshots/format@expression__string.py.snap index 3f6b55d014394..42519a8e5ae08 100644 --- a/crates/ruff_python_formatter/tests/snapshots/format@expression__string.py.snap +++ b/crates/ruff_python_formatter/tests/snapshots/format@expression__string.py.snap @@ -139,6 +139,11 @@ x = (b"""aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa""" b"""bbbbbbbbbbbbbbbbbbbbbbbbbbb # https://github.com/astral-sh/ruff/issues/7460 trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"'''] + +a = f"""\x1F""" +a = """\x1F""" +a = """\\x1F""" +a = """\\\x1F""" ``` ## Outputs @@ -316,6 +321,11 @@ x = ( # https://github.com/astral-sh/ruff/issues/7460 trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"'''] + +a = f"""\x1F""" +a = """\x1F""" +a = """\\x1F""" +a = """\\\x1F""" ``` @@ -329,6 +339,17 @@ trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"'''] '" test' '" test' +@@ -158,7 +159,7 @@ + # https://github.com/astral-sh/ruff/issues/7460 + trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"'''] + +-a = f"""\x1F""" +-a = """\x1F""" ++a = f"""\x1f""" ++a = """\x1f""" + a = """\\x1F""" +-a = """\\\x1F""" ++a = """\\\x1f""" ``` @@ -506,6 +527,11 @@ x = ( # https://github.com/astral-sh/ruff/issues/7460 trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"'''] + +a = f"""\x1F""" +a = """\x1F""" +a = """\\x1F""" +a = """\\\x1F""" ``` @@ -519,6 +545,17 @@ trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"'''] '" test' '" test' +@@ -158,7 +159,7 @@ + # https://github.com/astral-sh/ruff/issues/7460 + trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"'''] + +-a = f"""\x1F""" +-a = """\x1F""" ++a = 
f"""\x1f""" ++a = """\x1f""" + a = """\\x1F""" +-a = """\\\x1F""" ++a = """\\\x1f""" ```