From 14c1cc3585ff9509344db504a07b2748f5a15afc Mon Sep 17 00:00:00 2001
From: Micha Reiser <micha@reiser.io>
Date: Tue, 26 Dec 2023 11:47:20 +0800
Subject: [PATCH] Normalise Hex and unicode escape sequences in string

---
 .../test/fixtures/ruff/expression/bytes.py    |   2 +
 .../test/fixtures/ruff/expression/string.py   |   5 +
 .../src/other/bytes_literal.rs                |   2 +
 .../src/other/f_string.rs                     |   2 +
 .../src/other/string_literal.rs               |   2 +
 crates/ruff_python_formatter/src/preview.rs   |   5 +
 .../ruff_python_formatter/src/string/mod.rs   | 218 ++++++++++++++++--
 ..._preview_format_unicode_escape_seq.py.snap |  97 --------
 .../format@expression__bytes.py.snap          |   6 +
 .../format@expression__string.py.snap         |  37 +++
 10 files changed, 263 insertions(+), 113 deletions(-)
 delete mode 100644 crates/ruff_python_formatter/tests/snapshots/black_compatibility@cases__preview_format_unicode_escape_seq.py.snap
diff --git a/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/bytes.py b/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/bytes.py
index f505c5945a805..6cbb196a3fff9 100644
--- a/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/bytes.py
+++ b/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/bytes.py
@@ -118,3 +118,5 @@
     b'c'
     )
 }
+
+b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
diff --git a/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/string.py b/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/string.py
index 3988b6ab85f7c..ce01296d5f7e7 100644
--- a/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/string.py
+++ b/crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/string.py
@@ -133,3 +133,8 @@
 
 # https://github.com/astral-sh/ruff/issues/7460
 trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
+
+a = f"""\x1F"""
+a = """\x1F"""
+a = """\\x1F"""
+a = """\\\x1F"""
diff --git a/crates/ruff_python_formatter/src/other/bytes_literal.rs b/crates/ruff_python_formatter/src/other/bytes_literal.rs
index c6445c8d6adc3..542928ced38ce 100644
--- a/crates/ruff_python_formatter/src/other/bytes_literal.rs
+++ b/crates/ruff_python_formatter/src/other/bytes_literal.rs
@@ -2,6 +2,7 @@ use ruff_python_ast::BytesLiteral;
 use ruff_text_size::Ranged;
 
 use crate::prelude::*;
+use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
 use crate::string::{Quoting, StringPart};
 
 #[derive(Default)]
@@ -17,6 +18,7 @@ impl FormatNodeRule<BytesLiteral> for FormatBytesLiteral {
                 &locator,
                 f.options().quote_style(),
                 f.context().docstring(),
+                is_hex_codes_in_unicode_sequences_enabled(f.context()),
             )
             .fmt(f)
     }
diff --git a/crates/ruff_python_formatter/src/other/f_string.rs b/crates/ruff_python_formatter/src/other/f_string.rs
index da81162c2ef54..c3e8ac4ebfc4d 100644
--- a/crates/ruff_python_formatter/src/other/f_string.rs
+++ b/crates/ruff_python_formatter/src/other/f_string.rs
@@ -2,6 +2,7 @@ use ruff_python_ast::FString;
 use ruff_text_size::Ranged;
 
 use crate::prelude::*;
+use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
 use crate::string::{Quoting, StringPart};
 
 /// Formats an f-string which is part of a larger f-string expression.
@@ -31,6 +32,7 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
                 &locator,
                 f.options().quote_style(),
                 f.context().docstring(),
+                is_hex_codes_in_unicode_sequences_enabled(f.context()),
             )
             .fmt(f);
 
diff --git a/crates/ruff_python_formatter/src/other/string_literal.rs b/crates/ruff_python_formatter/src/other/string_literal.rs
index e23db85707830..3071f37098692 100644
--- a/crates/ruff_python_formatter/src/other/string_literal.rs
+++ b/crates/ruff_python_formatter/src/other/string_literal.rs
@@ -2,6 +2,7 @@ use ruff_python_ast::StringLiteral;
 use ruff_text_size::Ranged;
 
 use crate::prelude::*;
+use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
 use crate::string::{docstring, Quoting, StringPart};
 use crate::QuoteStyle;
 
@@ -61,6 +62,7 @@ impl Format<PyFormatContext<'_>> for FormatStringLiteral<'_> {
             &locator,
             quote_style,
             f.context().docstring(),
+            is_hex_codes_in_unicode_sequences_enabled(f.context()),
         );
 
         if self.layout.is_docstring() {
diff --git a/crates/ruff_python_formatter/src/preview.rs b/crates/ruff_python_formatter/src/preview.rs
index e34343e582c00..1e2b8ae36bff6 100644
--- a/crates/ruff_python_formatter/src/preview.rs
+++ b/crates/ruff_python_formatter/src/preview.rs
@@ -57,3 +57,8 @@ pub(crate) const fn is_module_docstring_newlines_enabled(context: &PyFormatConte
 pub(crate) const fn is_dummy_implementations_enabled(context: &PyFormatContext) -> bool {
     context.is_preview()
 }
+
+/// Returns `true` if the [`hex_codes_in_unicode_sequences`](https://github.com/psf/black/pull/2916) preview style is enabled, which is currently the case whenever the context is in preview mode.
+pub(crate) const fn is_hex_codes_in_unicode_sequences_enabled(context: &PyFormatContext) -> bool {
+    context.is_preview()
+}
diff --git a/crates/ruff_python_formatter/src/string/mod.rs b/crates/ruff_python_formatter/src/string/mod.rs
index 57c11cd622900..f74e8002077d3 100644
--- a/crates/ruff_python_formatter/src/string/mod.rs
+++ b/crates/ruff_python_formatter/src/string/mod.rs
@@ -253,6 +253,7 @@ impl StringPart {
         locator: &'a Locator,
         configured_style: QuoteStyle,
         parent_docstring_quote_char: Option<QuoteChar>,
+        normalize_hex: bool,
     ) -> NormalizedString<'a> {
         // Per PEP 8, always prefer double quotes for triple-quoted strings.
         let preferred_style = if self.quotes.triple {
@@ -310,7 +311,7 @@ impl StringPart {
             configured_style
         };
 
-        let raw_content = locator.slice(self.content_range);
+        let raw_content = &locator.slice(self.content_range);
 
         let quotes = match quoting {
             Quoting::Preserve => self.quotes,
@@ -327,7 +328,7 @@ impl StringPart {
             }
         };
 
-        let normalized = normalize_string(locator.slice(self.content_range), quotes, self.prefix);
+        let normalized = normalize_string(raw_content, quotes, self.prefix, normalize_hex);
 
         NormalizedString {
             prefix: self.prefix,
@@ -423,6 +424,10 @@ impl StringPrefix {
     pub(super) const fn is_fstring(self) -> bool {
         self.contains(StringPrefix::F_STRING)
     }
+
+    pub(super) const fn is_byte(self) -> bool {
+        self.contains(StringPrefix::BYTE) // `true` when the `BYTE` flag is set on this prefix
+    }
 }
 
 impl Format<PyFormatContext<'_>> for StringPrefix {
@@ -722,7 +727,12 @@ impl TryFrom<char> for QuoteChar {
 /// with the provided [`StringQuotes`] style.
 ///
 /// Returns the normalized string and whether it contains new lines.
-fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> Cow<str> {
+fn normalize_string(
+    input: &str,
+    quotes: StringQuotes,
+    prefix: StringPrefix,
+    normalize_hex: bool,
+) -> Cow<str> {
     // The normalized string if `input` is not yet normalized.
     // `output` must remain empty if `input` is already normalized.
     let mut output = String::new();
@@ -766,24 +776,50 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
             }
 
             last_index = index + '\r'.len_utf8();
-        } else if !quotes.triple && !is_raw {
+        } else if !is_raw {
             if c == '\\' {
-                if let Some((_, next)) = chars.peek().copied() {
-                    #[allow(clippy::if_same_then_else)]
-                    if next == opposite_quote && formatted_value_nesting == 0 {
-                        // Remove the escape by ending before the backslash and starting again with the quote
-                        chars.next();
-                        output.push_str(&input[last_index..index]);
-                        last_index = index + '\\'.len_utf8();
-                    } else if next == preferred_quote {
-                        // Quote is already escaped, skip over it.
-                        chars.next();
-                    } else if next == '\\' {
+                if let Some((_, next)) = chars.clone().next() {
+                    if next == '\\' {
                         // Skip over escaped backslashes
                         chars.next();
+                    } else if normalize_hex {
+                        if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
+                            .and_then(|escape| {
+                                escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..])
+                            })
+                        {
+                            // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x` | `N`)
+                            let escape_start_len = '\\'.len_utf8() + next.len_utf8();
+                            let escape_start_offset = index + escape_start_len;
+                            if let Cow::Owned(normalised) = &normalised {
+                                output.push_str(&input[last_index..escape_start_offset]);
+                                output.push_str(normalised);
+                                last_index = escape_start_offset + normalised.len();
+                            };
+
+                            // Move the `chars` iterator past the escape sequence.
+                            // Simply reassigning `chars` doesn't work because the indices would
+                            // then be off.
+                            for _ in 0..next.len_utf8() + normalised.len() {
+                                chars.next();
+                            }
+                        }
+                    }
+
+                    if !quotes.triple {
+                        #[allow(clippy::if_same_then_else)]
+                        if next == opposite_quote && formatted_value_nesting == 0 {
+                            // Remove the escape by ending before the backslash and starting again with the quote
+                            chars.next();
+                            output.push_str(&input[last_index..index]);
+                            last_index = index + '\\'.len_utf8();
+                        } else if next == preferred_quote {
+                            // Quote is already escaped, skip over it.
+                            chars.next();
+                        }
                     }
                 }
-            } else if c == preferred_quote && formatted_value_nesting == 0 {
+            } else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 {
                 // Escape the quote
                 output.push_str(&input[last_index..index]);
                 output.push('\\');
@@ -802,3 +838,153 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
 
     normalized
 }
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+enum UnicodeEscape {
+    /// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex digits. The payload is the number of digits expected after the escape character.
+    Hex(usize),
+
+    /// An escaped Unicode character name (`\N{name}`).
+    CharacterName,
+}
+
+impl UnicodeEscape {
+    fn new(first: char, allow_unicode: bool) -> Option<UnicodeEscape> {
+        Some(match first {
+            'x' => UnicodeEscape::Hex(2), // `\x` is recognised even when `allow_unicode` is `false`
+            'u' if allow_unicode => UnicodeEscape::Hex(4),
+            'U' if allow_unicode => UnicodeEscape::Hex(8),
+            'N' if allow_unicode => UnicodeEscape::CharacterName,
+            _ => return None, // any other character: nothing for this type to normalise
+        })
+    }
+
+    /// Normalises the part of a `\u..`, `\U..`, `\x..` or `\N{..}` escape sequence that follows the escape character (`input` starts right after it).
+    /// Returns `None` for an invalid sequence, `Cow::Borrowed` if it is already normalised, and `Cow::Owned` with the rewritten text otherwise.
+    /// * `\u`, `\U` and `\x`: To use lower case for the characters `a-f`.
+    /// * `\N`: To use uppercase letters
+    fn normalize(self, input: &str) -> Option<Cow<str>> {
+        let mut normalised = String::new(); // stays empty (no allocation) until the first character that needs changing
+
+        let len = match self {
+            UnicodeEscape::Hex(len) => {
+                // It's not a valid escape sequence if the input string has fewer bytes left
+                // than the escape sequence requires (only ASCII hex digits are accepted below).
+                if input.len() < len {
+                    return None;
+                }
+
+                for (index, c) in input.char_indices().take(len) {
+                    match c {
+                        '0'..='9' | 'a'..='f' => {
+                            if !normalised.is_empty() {
+                                normalised.push(c);
+                            }
+                        }
+                        'A'..='F' => {
+                            if normalised.is_empty() {
+                                normalised.reserve(len);
+                                normalised.push_str(&input[..index]);
+                                normalised.push(c.to_ascii_lowercase());
+                            } else {
+                                normalised.push(c.to_ascii_lowercase());
+                            }
+                        }
+                        _ => {
+                            // Not a hex digit, so not a valid escape sequence: don't normalise it.
+                            return None;
+                        }
+                    }
+                }
+
+                len
+            }
+            UnicodeEscape::CharacterName => {
+                let mut char_indices = input.char_indices();
+
+                if !matches!(char_indices.next(), Some((_, '{'))) {
+                    return None;
+                }
+
+                loop {
+                    if let Some((index, c)) = char_indices.next() {
+                        match c {
+                            '}' => {
+                                if !normalised.is_empty() {
+                                    normalised.push('}');
+                                }
+
+                                // Name must be at least two characters long (`{` is at offset 0, so `}` must be at offset 3 or later).
+                                if index < 3 {
+                                    return None;
+                                }
+
+                                break index + '}'.len_utf8(); // byte length of `{name}`
+                            }
+                            '0'..='9' | 'A'..='Z' | ' ' | '-' => {
+                                if !normalised.is_empty() {
+                                    normalised.push(c);
+                                }
+                            }
+                            'a'..='z' => {
+                                if normalised.is_empty() {
+                                    normalised.reserve(c.len_utf8() + '}'.len_utf8());
+                                    normalised.push_str(&input[..index]);
+                                    normalised.push(c.to_ascii_uppercase());
+                                } else {
+                                    normalised.push(c.to_ascii_uppercase());
+                                }
+                            }
+                            _ => {
+                                // Seems like an invalid escape sequence, don't normalise it.
+                                return None;
+                            }
+                        }
+                    } else {
+                        // Unterminated escape sequence, don't normalise it.
+                        return None;
+                    }
+                }
+            }
+        };
+
+        Some(if normalised.is_empty() {
+            Cow::Borrowed(&input[..len])
+        } else {
+            Cow::Owned(normalised)
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::string::{normalize_string, QuoteChar, StringPrefix, StringQuotes, UnicodeEscape};
+    use std::borrow::Cow;
+
+    #[test]
+    fn normalize_32_escape() {
+        let escape_sequence = UnicodeEscape::new('U', true).unwrap(); // `\U` expects 8 hex digits
+
+        assert_eq!(
+            Some(Cow::Owned("0001f60e".to_string())),
+            escape_sequence.normalize("0001F60E")
+        );
+    }
+
+    #[test]
+    fn normalize_hex_in_byte_string() {
+        let input = r"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"; // PNG signature written with upper-case hex digits
+
+        let normalized = normalize_string(
+            input,
+            StringQuotes {
+                triple: false,
+                quote_char: QuoteChar::Double,
+            },
+            StringPrefix::BYTE,
+            true, // normalize_hex
+        );
+
+        assert_eq!(r"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", &normalized);
+    }
+}
diff --git a/crates/ruff_python_formatter/tests/snapshots/black_compatibility@cases__preview_format_unicode_escape_seq.py.snap b/crates/ruff_python_formatter/tests/snapshots/black_compatibility@cases__preview_format_unicode_escape_seq.py.snap
deleted file mode 100644
index c523ecc557c70..0000000000000
--- a/crates/ruff_python_formatter/tests/snapshots/black_compatibility@cases__preview_format_unicode_escape_seq.py.snap
+++ /dev/null
@@ -1,97 +0,0 @@
----
-source: crates/ruff_python_formatter/tests/fixtures.rs
-input_file: crates/ruff_python_formatter/resources/test/fixtures/black/cases/preview_format_unicode_escape_seq.py
----
-## Input
-
-```python
-x = "\x1F"
-x = "\\x1B"
-x = "\\\x1B"
-x = "\U0001F60E"
-x = "\u0001F60E"
-x = r"\u0001F60E"
-x = "don't format me"
-x = "\xA3"
-x = "\u2717"
-x = "\uFaCe"
-x = "\N{ox}\N{OX}"
-x = "\N{lAtIn smaLL letteR x}"
-x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
-x = b"\x1Fdon't byte"
-x = rb"\x1Fdon't format"
-```
-
-## Black Differences
-
-```diff
---- Black
-+++ Ruff
-@@ -1,15 +1,15 @@
--x = "\x1f"
-+x = "\x1F"
- x = "\\x1B"
--x = "\\\x1b"
--x = "\U0001f60e"
-+x = "\\\x1B"
-+x = "\U0001F60E"
- x = "\u0001F60E"
- x = r"\u0001F60E"
- x = "don't format me"
--x = "\xa3"
-+x = "\xA3"
- x = "\u2717"
--x = "\uface"
--x = "\N{OX}\N{OX}"
--x = "\N{LATIN SMALL LETTER X}"
--x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
--x = b"\x1fdon't byte"
-+x = "\uFaCe"
-+x = "\N{ox}\N{OX}"
-+x = "\N{lAtIn smaLL letteR x}"
-+x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
-+x = b"\x1Fdon't byte"
- x = rb"\x1Fdon't format"
-```
-
-## Ruff Output
-
-```python
-x = "\x1F"
-x = "\\x1B"
-x = "\\\x1B"
-x = "\U0001F60E"
-x = "\u0001F60E"
-x = r"\u0001F60E"
-x = "don't format me"
-x = "\xA3"
-x = "\u2717"
-x = "\uFaCe"
-x = "\N{ox}\N{OX}"
-x = "\N{lAtIn smaLL letteR x}"
-x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
-x = b"\x1Fdon't byte"
-x = rb"\x1Fdon't format"
-```
-
-## Black Output
-
-```python
-x = "\x1f"
-x = "\\x1B"
-x = "\\\x1b"
-x = "\U0001f60e"
-x = "\u0001F60E"
-x = r"\u0001F60E"
-x = "don't format me"
-x = "\xa3"
-x = "\u2717"
-x = "\uface"
-x = "\N{OX}\N{OX}"
-x = "\N{LATIN SMALL LETTER X}"
-x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
-x = b"\x1fdon't byte"
-x = rb"\x1Fdon't format"
-```
-
-
diff --git a/crates/ruff_python_formatter/tests/snapshots/format@expression__bytes.py.snap b/crates/ruff_python_formatter/tests/snapshots/format@expression__bytes.py.snap
index b6c249c609557..06b5d4775be28 100644
--- a/crates/ruff_python_formatter/tests/snapshots/format@expression__bytes.py.snap
+++ b/crates/ruff_python_formatter/tests/snapshots/format@expression__bytes.py.snap
@@ -124,6 +124,8 @@ test_particular = [
     b'c'
     )
 }
+
+b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
 ```
 
 ## Outputs
@@ -277,6 +279,8 @@ test_particular = [
 
 # Parenthesized string continuation with messed up indentation
 {"key": ([], b"a" b"b" b"c")}
+
+b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
 ```
 
 
@@ -430,6 +434,8 @@ test_particular = [
 
 # Parenthesized string continuation with messed up indentation
 {'key': ([], b'a' b'b' b'c')}
+
+b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
 ```
 
 
diff --git a/crates/ruff_python_formatter/tests/snapshots/format@expression__string.py.snap b/crates/ruff_python_formatter/tests/snapshots/format@expression__string.py.snap
index 3f6b55d014394..42519a8e5ae08 100644
--- a/crates/ruff_python_formatter/tests/snapshots/format@expression__string.py.snap
+++ b/crates/ruff_python_formatter/tests/snapshots/format@expression__string.py.snap
@@ -139,6 +139,11 @@ x = (b"""aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa""" b"""bbbbbbbbbbbbbbbbbbbbbbbbbbb
 
 # https://github.com/astral-sh/ruff/issues/7460
 trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
+
+a = f"""\x1F"""
+a = """\x1F"""
+a = """\\x1F"""
+a = """\\\x1F"""
 ```
 
 ## Outputs
@@ -316,6 +321,11 @@ x = (
 
 # https://github.com/astral-sh/ruff/issues/7460
 trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
+
+a = f"""\x1F"""
+a = """\x1F"""
+a = """\\x1F"""
+a = """\\\x1F"""
 ```
 
 
@@ -329,6 +339,17 @@ trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
  '" test'
  
  '" test'
+@@ -158,7 +159,7 @@
+ # https://github.com/astral-sh/ruff/issues/7460
+ trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
+ 
+-a = f"""\x1F"""
+-a = """\x1F"""
++a = f"""\x1f"""
++a = """\x1f"""
+ a = """\\x1F"""
+-a = """\\\x1F"""
++a = """\\\x1f"""
 ```
 
 
@@ -506,6 +527,11 @@ x = (
 
 # https://github.com/astral-sh/ruff/issues/7460
 trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
+
+a = f"""\x1F"""
+a = """\x1F"""
+a = """\\x1F"""
+a = """\\\x1F"""
 ```
 
 
@@ -519,6 +545,17 @@ trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
  '" test'
  
  '" test'
+@@ -158,7 +159,7 @@
+ # https://github.com/astral-sh/ruff/issues/7460
+ trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
+ 
+-a = f"""\x1F"""
+-a = """\x1F"""
++a = f"""\x1f"""
++a = """\x1f"""
+ a = """\\x1F"""
+-a = """\\\x1F"""
++a = """\\\x1f"""
 ```