From b87fbfb69d28b11ebb1bc6e184049786c163afa3 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Sun, 25 Feb 2024 11:06:26 +0100 Subject: [PATCH] Perf: Skip string normalization when possible --- .../src/other/f_string.rs | 8 +- .../src/other/f_string_element.rs | 1 + .../src/string/normalize.rs | 240 ++++++++++++------ 3 files changed, 167 insertions(+), 82 deletions(-) diff --git a/crates/ruff_python_formatter/src/other/f_string.rs b/crates/ruff_python_formatter/src/other/f_string.rs index 52d4497719995c..aa78f7520bf81c 100644 --- a/crates/ruff_python_formatter/src/other/f_string.rs +++ b/crates/ruff_python_formatter/src/other/f_string.rs @@ -59,16 +59,16 @@ impl Format> for FormatFString<'_> { return result; } - let quotes = normalizer.choose_quotes(&string, &locator); + let quote_selection = normalizer.choose_quotes(&string, &locator); let context = FStringContext::new( string.prefix(), - quotes, + quote_selection.quotes(), FStringLayout::from_f_string(self.value, &locator), ); // Starting prefix and quote - write!(f, [string.prefix(), quotes])?; + write!(f, [string.prefix(), quote_selection.quotes()])?; f.join() .entries( @@ -80,7 +80,7 @@ impl Format> for FormatFString<'_> { .finish()?; // Ending quote - quotes.fmt(f) + quote_selection.quotes().fmt(f) } } diff --git a/crates/ruff_python_formatter/src/other/f_string_element.rs b/crates/ruff_python_formatter/src/other/f_string_element.rs index c581413705f043..f95ce285ff9603 100644 --- a/crates/ruff_python_formatter/src/other/f_string_element.rs +++ b/crates/ruff_python_formatter/src/other/f_string_element.rs @@ -59,6 +59,7 @@ impl Format> for FormatFStringLiteralElement<'_> { let literal_content = f.context().locator().slice(self.element.range()); let normalized = normalize_string( literal_content, + 0, self.context.quotes(), self.context.prefix(), is_hex_codes_in_unicode_sequences_enabled(f.context()), diff --git a/crates/ruff_python_formatter/src/string/normalize.rs b/crates/ruff_python_formatter/src/string/normalize.rs index 0047c58981989f..9d5c49f87d388c 100644 --- a/crates/ruff_python_formatter/src/string/normalize.rs +++ b/crates/ruff_python_formatter/src/string/normalize.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::iter::FusedIterator; use ruff_formatter::FormatContext; use ruff_source_file::Locator; @@ -42,68 +43,8 @@ impl StringNormalizer { self } - /// Computes the strings preferred quotes. - pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> StringQuotes { - // Per PEP 8, always prefer double quotes for triple-quoted strings. - // Except when using quote-style-preserve. - let preferred_style = if string.quotes().triple { - // ... unless we're formatting a code snippet inside a docstring, - // then we specifically want to invert our quote style to avoid - // writing out invalid Python. - // - // It's worth pointing out that we can actually wind up being - // somewhat out of sync with PEP8 in this case. Consider this - // example: - // - // def foo(): - // ''' - // Something. - // - // >>> """tricksy""" - // ''' - // pass - // - // Ideally, this would be reformatted as: - // - // def foo(): - // """ - // Something. - // - // >>> '''tricksy''' - // """ - // pass - // - // But the logic here results in the original quoting being - // preserved. This is because the quoting style of the outer - // docstring is determined, in part, by looking at its contents. In - // this case, it notices that it contains a `"""` and thus infers - // that using `'''` would overall read better because it avoids - // the need to escape the interior `"""`. Except... in this case, - // the `"""` is actually part of a code snippet that could get - // reformatted to using a different quoting style itself. - // - // Fixing this would, I believe, require some fairly seismic - // changes to how formatting strings works. Namely, we would need - // to look for code snippets before normalizing the docstring, and - // then figure out the quoting style more holistically by looking - // at the various kinds of quotes used in the code snippets and - // what reformatting them might look like. - // - // Overall this is a bit of a corner case and just inverting the - // style from what the parent ultimately decided upon works, even - // if it doesn't have perfect alignment with PEP8. - if let Some(quote) = self.parent_docstring_quote_char { - QuoteStyle::from(quote.invert()) - } else if self.preferred_quote_style.is_preserve() { - QuoteStyle::Preserve - } else { - QuoteStyle::Double - } - } else { - self.preferred_quote_style - }; - - let quoting = if let FStringState::InsideExpressionElement(context) = self.f_string_state { + fn quoting(&self, string: &StringPart) -> Quoting { + if let FStringState::InsideExpressionElement(context) = self.f_string_state { // If we're inside an f-string, we need to make sure to preserve the // existing quotes unless we're inside a triple-quoted f-string and // the inner string itself isn't triple-quoted. For example: @@ -127,22 +68,110 @@ impl StringNormalizer { } } else { self.quoting - }; + } + } - match quoting { + /// Computes the strings preferred quotes. + pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> QuoteSelection { + let raw_content = locator.slice(string.content_range()); + let first_quote_or_normalized_char_offset = raw_content + .bytes() + .position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{')); + + let quotes = match self.quoting(string) { Quoting::Preserve => string.quotes(), Quoting::CanChange => { + // Per PEP 8, always prefer double quotes for triple-quoted strings. + // Except when using quote-style-preserve. + let preferred_style = if string.quotes().triple { + // ... unless we're formatting a code snippet inside a docstring, + // then we specifically want to invert our quote style to avoid + // writing out invalid Python. + // + // It's worth pointing out that we can actually wind up being + // somewhat out of sync with PEP8 in this case. Consider this + // example: + // + // def foo(): + // ''' + // Something. + // + // >>> """tricksy""" + // ''' + // pass + // + // Ideally, this would be reformatted as: + // + // def foo(): + // """ + // Something. + // + // >>> '''tricksy''' + // """ + // pass + // + // But the logic here results in the original quoting being + // preserved. This is because the quoting style of the outer + // docstring is determined, in part, by looking at its contents. In + // this case, it notices that it contains a `"""` and thus infers + // that using `'''` would overall read better because it avoids + // the need to escape the interior `"""`. Except... in this case, + // the `"""` is actually part of a code snippet that could get + // reformatted to using a different quoting style itself. + // + // Fixing this would, I believe, require some fairly seismic + // changes to how formatting strings works. Namely, we would need + // to look for code snippets before normalizing the docstring, and + // then figure out the quoting style more holistically by looking + // at the various kinds of quotes used in the code snippets and + // what reformatting them might look like. + // + // Overall this is a bit of a corner case and just inverting the + // style from what the parent ultimately decided upon works, even + // if it doesn't have perfect alignment with PEP8. + if let Some(quote) = self.parent_docstring_quote_char { + QuoteStyle::from(quote.invert()) + } else if self.preferred_quote_style.is_preserve() { + QuoteStyle::Preserve + } else { + QuoteStyle::Double + } + } else { + self.preferred_quote_style + }; + if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) { - let raw_content = locator.slice(string.content_range()); - if string.prefix().is_raw_string() { - choose_quotes_for_raw_string(raw_content, string.quotes(), preferred_quote) + if let Some(first_quote_or_normalized_char_offset) = + first_quote_or_normalized_char_offset + { + if string.prefix().is_raw_string() { + choose_quotes_for_raw_string( + &raw_content[first_quote_or_normalized_char_offset..], + string.quotes(), + preferred_quote, + ) + } else { + choose_quotes_impl( + &raw_content[first_quote_or_normalized_char_offset..], + string.quotes(), + preferred_quote, + ) + } } else { - choose_quotes_impl(raw_content, string.quotes(), preferred_quote) + StringQuotes { + quote_char: preferred_quote, + triple: string.quotes().is_triple(), + } } } else { string.quotes() } } + }; + + QuoteSelection { + quotes, + first_quote_or_normalized_char_offset, } } @@ -154,19 +183,45 @@ impl StringNormalizer { ) -> NormalizedString<'a> { let raw_content = locator.slice(string.content_range()); - let quotes = self.choose_quotes(string, locator); - - let normalized = normalize_string(raw_content, quotes, string.prefix(), self.normalize_hex); + let quote_selection = self.choose_quotes(string, locator); + + let normalized = if let Some(first_quote_or_escape_offset) = + quote_selection.first_quote_or_normalized_char_offset + { + normalize_string( + raw_content, + first_quote_or_escape_offset, + quote_selection.quotes, + string.prefix(), + self.normalize_hex, + ) + } else { + Cow::Borrowed(raw_content) + }; NormalizedString { prefix: string.prefix(), content_range: string.content_range(), text: normalized, - quotes, + quotes: quote_selection.quotes, } } } +#[derive(Debug)] +pub(crate) struct QuoteSelection { + quotes: StringQuotes, + + /// Offset to the first quote character or character that needs special handling in [`normalize_string`]. + first_quote_or_normalized_char_offset: Option, +} + +impl QuoteSelection { + pub(crate) fn quotes(&self) -> StringQuotes { + self.quotes + } +} + #[derive(Debug)] pub(crate) struct NormalizedString<'a> { prefix: crate::string::StringPrefix, @@ -391,6 +446,7 @@ fn choose_quotes_impl( /// Returns the normalized string and whether it contains new lines. pub(crate) fn normalize_string( input: &str, + start_offset: usize, quotes: StringQuotes, prefix: StringPrefix, normalize_hex: bool, @@ -406,7 +462,7 @@ pub(crate) fn normalize_string( let preferred_quote = quote.as_char(); let opposite_quote = quote.invert().as_char(); - let mut chars = input.char_indices().peekable(); + let mut chars = CharIndicesWithOffset::new(input, start_offset).peekable(); let is_raw = prefix.is_raw_string(); let is_fstring = prefix.is_fstring(); @@ -445,13 +501,11 @@ pub(crate) fn normalize_string( // Skip over escaped backslashes chars.next(); } else if normalize_hex { + // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`) + let escape_start_len = '\\'.len_utf8() + next.len_utf8(); if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte()) - .and_then(|escape| { - escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..]) - }) + .and_then(|escape| escape.normalize(&input[index + escape_start_len..])) { - // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`) - let escape_start_len = '\\'.len_utf8() + next.len_utf8(); let escape_start_offset = index + escape_start_len; if let Cow::Owned(normalised) = &normalised { output.push_str(&input[last_index..escape_start_offset]); @@ -501,6 +555,35 @@ pub(crate) fn normalize_string( normalized } +#[derive(Clone, Debug)] +struct CharIndicesWithOffset<'str> { + chars: std::str::Chars<'str>, + next_offset: usize, +} + +impl<'str> CharIndicesWithOffset<'str> { + fn new(input: &'str str, start_offset: usize) -> Self { + Self { + chars: input[start_offset..].chars(), + next_offset: start_offset, + } + } +} + +impl<'str> Iterator for CharIndicesWithOffset<'str> { + type Item = (usize, char); + + fn next(&mut self) -> Option { + self.chars.next().map(|c| { + let index = self.next_offset; + self.next_offset += c.len_utf8(); + (index, c) + }) + } +} + +impl FusedIterator for CharIndicesWithOffset<'_> {} + #[derive(Copy, Clone, Debug, PartialEq, Eq)] enum UnicodeEscape { /// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters. @@ -642,6 +725,7 @@ mod tests { let normalized = normalize_string( input, + 0, StringQuotes { triple: false, quote_char: QuoteChar::Double,