From 2f1bb50cd5932e50051a2c40d4f493e4a549d2cd Mon Sep 17 00:00:00 2001
From: Dhruv Manilawala
Date: Mon, 27 May 2024 22:23:52 +0530
Subject: [PATCH] Update parser API to merge lexing and parsing (#11494)

## Summary

This PR updates the parser API within the `ruff_python_parser` crate. It doesn't
update any of the existing references to the old API; that is left to follow-up PRs.
The final API looks like:

```rs
pub fn parse_module(source: &str) -> Result<Program<ModModule>, ParseError> {}

pub fn parse_expression(source: &str) -> Result<Program<ModExpression>, ParseError> {}

pub fn parse_expression_range(
    source: &str,
    range: TextRange,
) -> Result<Program<ModExpression>, ParseError> {}

pub fn parse(source: &str, mode: Mode) -> Result<Program<Mod>, ParseError> {}

// Temporary. The `parse` function will replace this one once we update the downstream
// tools to work with programs containing syntax errors.
pub fn parse_unchecked(source: &str, mode: Mode) -> Program<Mod> {}
```

Following is a detailed list of changes:

* Make `Program` generic over `T`, which can be either `Mod` (enum), `ModModule`, or
  `ModExpression`
* Add helper methods to cast `Mod` into `ModModule` or `ModExpression`
* Add a helper method `Program::into_result` which converts a `Program<T>` into a
  `Result<Program<T>, ParseError>` where the `Err` variant contains the first
  `ParseError`
* Update `TokenSource` to store the comment ranges
* The parser crate now depends on `ruff_python_trivia` because of `CommentRanges`.
  This struct could possibly be moved into the parser crate itself in the end
* Move from `parse_expression_starts_at` to `parse_expression_range`, which parses the
  source code at the given range using `Mode::Expression`. Unlike the `starts_at`
  variant, this accepts the entire source code
* Remove all access to the `Lexer`
* Remove all `parse_*` functions which work on tokens provided by the caller

## Test Plan

The tests in `ruff_python_parser` can still be run:

```
cargo insta test --package ruff_python_parser
```

---
 Cargo.lock                                    |   1 +
 crates/ruff_python_parser/Cargo.toml          |   1 +
 crates/ruff_python_parser/src/lexer.rs        |  21 +-
 crates/ruff_python_parser/src/lib.rs          | 492 ++++++++----------
 crates/ruff_python_parser/src/parser/mod.rs   |  92 +---
 crates/ruff_python_parser/src/parser/tests.rs |  30 +-
 crates/ruff_python_parser/src/string.rs       | 192 ++++---
 crates/ruff_python_parser/src/token_source.rs |  31 +-
 crates/ruff_python_parser/src/typing.rs       |  19 +-
 crates/ruff_python_parser/tests/fixtures.rs   |  18 +-
 10 files changed, 383 insertions(+), 514 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 96e0ebde771bf..74e3fb12ed4d6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2297,6 +2297,7 @@ dependencies = [
  "itertools 0.12.1",
  "memchr",
  "ruff_python_ast",
+ "ruff_python_trivia",
  "ruff_source_file",
  "ruff_text_size",
  "rustc-hash",

diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml
index fc064e6f0a84d..00ac193efedf0 100644
--- a/crates/ruff_python_parser/Cargo.toml
+++ b/crates/ruff_python_parser/Cargo.toml
@@ -14,6 +14,7 @@ license = { workspace = true }

 [dependencies]
 ruff_python_ast = { workspace = true }
+ruff_python_trivia = { workspace = true }
 ruff_text_size = { workspace = true }

 anyhow = { workspace = true }

diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
index 1606d02081882..0c1455923a089 100644
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@@ -9,23 +9,6 @@
 //! as a `Result<Spanned, LexicalError>`, where [`Spanned`] is a tuple containing the
 //! start and end [`TextSize`] and a [`Tok`] denoting the token.
 //!
-//! # Example
-//!
-//! ```
-//! use ruff_python_parser::{lexer::lex, Tok, Mode};
-//!
-//! let source = "x = 'RustPython'";
-//! let tokens = lex(source, Mode::Module)
-//!     .map(|tok| tok.expect("Failed to lex"))
-//!     .collect::<Vec<_>>();
-//!
-//! for (token, range) in tokens {
-//!     println!(
-//!         "{token:?}@{range:?}",
-//!     );
-//! }
-//! ```
-//!
 //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html

 use std::{char, cmp::Ordering, str::FromStr};

@@ -1381,6 +1364,10 @@ impl Token {
         self.kind
     }

+    pub(crate) const fn is_comment(self) -> bool {
+        matches!(self.kind, TokenKind::Comment)
+    }
+
     pub(crate) const fn is_trivia(self) -> bool {
         matches!(self.kind, TokenKind::Comment | TokenKind::NonLogicalNewline)
     }

diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs
index 1c24b9764f2a1..cf232e8efca53 100644
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@@ -60,67 +60,23 @@
 //! - parser: This module contains an interface to the [Program] and is responsible for generating the AST.
 //! - mode: This module contains the definition of the different modes that the `ruff_python_parser` can be in.
 //!
-//! # Examples
-//!
-//! For example, to get a stream of tokens from a given string, one could do this:
-//!
-//! ```
-//! use ruff_python_parser::{lexer::lex, Mode};
-//!
-//! let python_source = r#"
-//! def is_odd(i):
-//!    return bool(i & 1)
-//! "#;
-//! let mut tokens = lex(python_source, Mode::Module);
-//! assert!(tokens.all(|t| t.is_ok()));
-//! ```
-//!
-//! These tokens can be directly fed into the `ruff_python_parser` to generate an AST:
-//!
-//! ```
-//! use ruff_python_parser::lexer::lex;
-//! use ruff_python_parser::{Mode, parse_tokens};
-//!
-//! let python_source = r#"
-//! def is_odd(i):
-//!     return bool(i & 1)
-//! "#;
-//! let tokens = lex(python_source, Mode::Module);
-//! let ast = parse_tokens(tokens.collect(), python_source, Mode::Module);
-//!
-//! assert!(ast.is_ok());
-//! ```
-//!
-//! Alternatively, you can use one of the other `parse_*` functions to parse a string directly without using a specific
-//! mode or tokenizing the source beforehand:
-//!
-//! ```
-//! use ruff_python_parser::parse_suite;
-//!
-//! let python_source = r#"
-//! def is_odd(i):
-//!     return bool(i & 1)
-//! "#;
-//! let ast = parse_suite(python_source);
-//!
-//! assert!(ast.is_ok());
-//! ```
-//!
 //! [lexical analysis]: https://en.wikipedia.org/wiki/Lexical_analysis
 //! [parsing]: https://en.wikipedia.org/wiki/Parsing
 //! [lexer]: crate::lexer

-use std::iter::FusedIterator;
+use std::cell::OnceCell;
 use std::ops::Deref;

-use crate::lexer::{lex, lex_starts_at, LexResult};
-
 pub use crate::error::{FStringErrorType, ParseError, ParseErrorType};
-pub use crate::parser::Program;
+pub use crate::lexer::Token;
 pub use crate::token::{Tok, TokenKind};
-use ruff_python_ast::{Expr, Mod, ModModule, PySourceType, Suite};
-use ruff_text_size::{Ranged, TextRange, TextSize};
+use crate::parser::Parser;
+
+use itertools::Itertools;
+use ruff_python_ast::{Expr, Mod, ModExpression, ModModule, PySourceType, Suite};
+use ruff_python_trivia::CommentRanges;
+use ruff_text_size::{Ranged, TextRange};

 mod error;
 pub mod lexer;
@@ -131,7 +87,7 @@ mod token_set;
 mod token_source;
 pub mod typing;

-/// Parse a full Python program usually consisting of multiple lines.
+/// Parse a full Python module usually consisting of multiple lines.
 ///
 /// This is a convenience function that can be used to parse a full Python program without having to
 /// specify the [`Mode`] or the location. It is probably what you want to use most of the time.
 ///
 /// # Example
 ///
 /// For example, parsing a simple function definition and a call to that function:
 ///
 /// ```
-/// use ruff_python_parser::parse_program;
-///
-/// let source = r#"
-/// def foo():
-///    return 42
-///
-/// print(foo())
-/// "#;
-///
-/// let program = parse_program(source);
-/// assert!(program.is_ok());
-/// ```
-pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
-    let lexer = lex(source, Mode::Module);
-    match parse_tokens(lexer.collect(), source, Mode::Module)? {
-        Mod::Module(m) => Ok(m),
-        Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
-    }
-}
-
-/// Parse a full Python program into a [`Suite`].
-///
-/// This function is similar to [`parse_program`] except that it returns the module body
-/// instead of the module itself.
-///
-/// # Example
-///
-/// For example, parsing a simple function definition and a call to that function:
-///
-/// ```
-/// use ruff_python_parser::parse_suite;
+/// use ruff_python_parser::parse_module;
 ///
 /// let source = r#"
 /// def foo():
@@ -180,11 +106,15 @@ pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
 /// print(foo())
 /// "#;
 ///
-/// let body = parse_suite(source);
-/// assert!(body.is_ok());
+/// let module = parse_module(source);
+/// assert!(module.is_ok());
 /// ```
-pub fn parse_suite(source: &str) -> Result<Suite, ParseError> {
-    parse_program(source).map(|m| m.body)
+pub fn parse_module(source: &str) -> Result<Program<ModModule>, ParseError> {
+    Parser::new(source, Mode::Module)
+        .parse()
+        .try_into_module()
+        .unwrap()
+        .into_result()
 }

 /// Parses a single Python expression.
@@ -202,37 +132,40 @@ pub fn parse_suite(source: &str) -> Result<Suite, ParseError> {
 /// let expr = parse_expression("1 + 2");
 /// assert!(expr.is_ok());
 /// ```
-pub fn parse_expression(source: &str) -> Result<Expr, ParseError> {
-    let lexer = lex(source, Mode::Expression).collect();
-    match parse_tokens(lexer, source, Mode::Expression)? {
-        Mod::Expression(expression) => Ok(*expression.body),
-        Mod::Module(_m) => unreachable!("Mode::Expression doesn't return other variant"),
-    }
+pub fn parse_expression(source: &str) -> Result<Program<ModExpression>, ParseError> {
+    Parser::new(source, Mode::Expression)
+        .parse()
+        .try_into_expression()
+        .unwrap()
+        .into_result()
 }

-/// Parses a Python expression from a given location.
+/// Parses a Python expression for the given range in the source.
 ///
-/// This function allows to specify the location of the expression in the source code, other than
+/// This function allows specifying the range of the expression in the source code, other than
 /// that, it behaves exactly like [`parse_expression`].
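+///
+/// Note that the given `range` is assumed to lie within the bounds of `source`: the
+/// implementation only considers the source code up to the end of the given range and
+/// starts parsing at its start.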
 ///
 /// # Example
 ///
-/// Parsing a single expression denoting the addition of two numbers, but this time specifying a different,
-/// somewhat silly, location:
+/// Parsing one of the numeric literals that is part of an addition expression:
 ///
 /// ```
-/// use ruff_python_parser::parse_expression_starts_at;
-/// # use ruff_text_size::TextSize;
+/// use ruff_python_parser::parse_expression_range;
+/// # use ruff_text_size::{TextRange, TextSize};
 ///
-/// let expr = parse_expression_starts_at("1 + 2", TextSize::from(400));
-/// assert!(expr.is_ok());
+/// let program = parse_expression_range("11 + 22 + 33", TextRange::new(TextSize::new(5), TextSize::new(7)));
+/// assert!(program.is_ok());
 /// ```
-pub fn parse_expression_starts_at(source: &str, offset: TextSize) -> Result<Expr, ParseError> {
-    let lexer = lex_starts_at(source, Mode::Module, offset).collect();
-    match parse_tokens(lexer, source, Mode::Expression)? {
-        Mod::Expression(expression) => Ok(*expression.body),
-        Mod::Module(_m) => unreachable!("Mode::Expression doesn't return other variant"),
-    }
+pub fn parse_expression_range(
+    source: &str,
+    range: TextRange,
+) -> Result<Program<ModExpression>, ParseError> {
+    let source = &source[..range.end().to_usize()];
+    Parser::new_starts_at(source, Mode::Expression, range.start())
+        .parse()
+        .try_into_expression()
+        .unwrap()
+        .into_result()
 }

 /// Parse the given Python source code using the specified [`Mode`].
@@ -249,8 +182,8 @@ pub fn parse_expression_starts_at(source: &str, offset: TextSize) -> Result<Exp
-pub fn parse(source: &str, mode: Mode) -> Result<Mod, ParseError> {
-    let lxr = lexer::lex(source, mode);
-    parse_tokens(lxr.collect(), source, mode)
+pub fn parse(source: &str, mode: Mode) -> Result<Program<Mod>, ParseError> {
+    parse_unchecked(source, mode).into_result()
 }

-/// Parse the given Python source code using the specified [`Mode`] and [`TextSize`].
-///
-/// This function allows to specify the location of the source code, other than
-/// that, it behaves exactly like [`parse`].
-///
-/// # Example
-///
-/// ```
-/// # use ruff_text_size::TextSize;
-/// use ruff_python_parser::{Mode, parse_starts_at};
-///
-/// let source = r#"
-/// def fib(i):
-///   a, b = 0, 1
-///   for _ in range(i):
-///     a, b = b, a + b
-///   return a
+/// Parse the given Python source code using the specified [`Mode`].
 ///
-/// print(fib(42))
-/// "#;
-/// let program = parse_starts_at(source, Mode::Module, TextSize::from(0));
-/// assert!(program.is_ok());
-/// ```
-pub fn parse_starts_at(source: &str, mode: Mode, offset: TextSize) -> Result<Mod, ParseError> {
-    let lxr = lexer::lex_starts_at(source, mode, offset);
-    parse_tokens(lxr.collect(), source, mode)
+/// This is the same as the [`parse`] function except that it doesn't check for any [`ParseError`]
+/// and returns the [`Program`] as is.
+pub fn parse_unchecked(source: &str, mode: Mode) -> Program<Mod> {
+    Parser::new(source, mode).parse()
 }

-/// Parse an iterator of [`LexResult`]s using the specified [`Mode`].
-///
-/// This could allow you to perform some preprocessing on the tokens before parsing them.
-///
-/// # Example
-///
-/// As an example, instead of parsing a string, we can parse a list of tokens after we generate
-/// them using the [`lexer::lex`] function:
-///
-/// ```
-/// use ruff_python_parser::lexer::lex;
-/// use ruff_python_parser::{Mode, parse_tokens};
-///
-/// let source = "1 + 2";
-/// let tokens = lex(source, Mode::Expression);
-/// let expr = parse_tokens(tokens.collect(), source, Mode::Expression);
-/// assert!(expr.is_ok());
-/// ```
-pub fn parse_tokens(tokens: Vec<LexResult>, source: &str, mode: Mode) -> Result<Mod, ParseError> {
-    let program = Program::parse_tokens(source, tokens, mode);
-    if program.is_valid() {
-        Ok(program.into_ast())
-    } else {
-        Err(program.into_errors().into_iter().next().unwrap())
-    }
+/// Parse the given Python source code using the specified [`PySourceType`].
+pub fn parse_unchecked_source(source: &str, source_type: PySourceType) -> Program<ModModule> {
+    // SAFETY: Safe because `PySourceType` always parses to a `ModModule`
+    Parser::new(source, source_type.as_mode())
+        .parse()
+        .try_into_module()
+        .unwrap()
 }

-/// Tokens represents a vector of [`LexResult`].
-///
-/// This should only include tokens up to and including the first error. This struct is created
-/// by the [`tokenize`] function.
-#[derive(Debug, Clone)]
-pub struct Tokens(Vec<LexResult>);
+/// Represents the parsed source code.
+#[derive(Debug)]
+pub struct Program<T> {
+    syntax: T,
+    tokens: Tokens,
+    errors: Vec<ParseError>,
+    comment_ranges: CommentRanges,
+}

-impl Tokens {
-    /// Returns an iterator over the [`TokenKind`] and the range corresponding to the tokens.
-    pub fn kinds(&self) -> TokenKindIter {
-        TokenKindIter::new(&self.0)
+impl<T> Program<T> {
+    /// Returns the syntax node represented by this program.
+    pub fn syntax(&self) -> &T {
+        &self.syntax
     }

-    /// Returns an iterator over the [`TokenKind`] and its range for all the tokens that are
-    /// within the given `range`.
-    ///
-    /// The start and end position of the given range should correspond to the start position of
-    /// the first token and the end position of the last token in the returned iterator.
-    ///
-    /// For example, if the struct contains the following tokens:
-    /// ```txt
-    /// (Def, 0..3)
-    /// (Name, 4..7)
-    /// (Lpar, 7..8)
-    /// (Rpar, 8..9)
-    /// (Colon, 9..10)
-    /// (Ellipsis, 11..14)
-    /// (Newline, 14..14)
-    /// ```
-    ///
-    /// Then, the range `4..10` returns an iterator which yields `Name`, `Lpar`, `Rpar`, and
-    /// `Colon` token. But, if the given position doesn't match any of the tokens, an empty
-    /// iterator is returned.
-    pub fn kinds_within_range<T: Ranged>(&self, ranged: T) -> TokenKindIter {
-        let Ok(start_index) = self.binary_search_by_key(&ranged.start(), |result| match result {
-            Ok((_, range)) => range.start(),
-            Err(error) => error.location().start(),
-        }) else {
-            return TokenKindIter::default();
-        };
+    /// Returns all the tokens for the program.
+    pub fn tokens(&self) -> &Tokens {
+        &self.tokens
+    }

-        let Ok(end_index) = self.binary_search_by_key(&ranged.end(), |result| match result {
-            Ok((_, range)) => range.end(),
-            Err(error) => error.location().end(),
-        }) else {
-            return TokenKindIter::default();
-        };
+    /// Returns a list of syntax errors found during parsing.
+    pub fn errors(&self) -> &[ParseError] {
+        &self.errors
+    }

-        TokenKindIter::new(self.get(start_index..=end_index).unwrap_or(&[]))
+    /// Returns the comment ranges for the program.
+    pub fn comment_ranges(&self) -> &CommentRanges {
+        &self.comment_ranges
     }

-    /// Consumes the [`Tokens`], returning the underlying vector of [`LexResult`].
-    pub fn into_inner(self) -> Vec<LexResult> {
-        self.0
+    /// Consumes the [`Program`] and returns the syntax node represented by this program.
+    pub fn into_syntax(self) -> T {
+        self.syntax
     }
-}

-impl Deref for Tokens {
-    type Target = [LexResult];
+    /// Consumes the [`Program`] and returns a list of syntax errors found during parsing.
+    pub fn into_errors(self) -> Vec<ParseError> {
+        self.errors
+    }

-    fn deref(&self) -> &Self::Target {
-        &self.0
+    /// Returns `true` if the program is valid i.e., it has no syntax errors.
+    pub fn is_valid(&self) -> bool {
+        self.errors.is_empty()
     }
-}

-/// An iterator over the [`TokenKind`] and the corresponding range.
-///
-/// This struct is created by the [`Tokens::kinds`] method.
-#[derive(Clone, Default)]
-pub struct TokenKindIter<'a> {
-    inner: std::iter::Flatten<std::slice::Iter<'a, LexResult>>,
+    /// Transforms the [`Program`] into a [`Result`], returning [`Ok`] if the program has no syntax
+    /// errors, or [`Err`] containing the first [`ParseError`] encountered.
+    pub fn into_result(self) -> Result<Program<T>, ParseError> {
+        if self.is_valid() {
+            Ok(self)
+        } else {
+            Err(self.into_errors().into_iter().next().unwrap())
+        }
+    }
 }

-impl<'a> TokenKindIter<'a> {
-    /// Create a new iterator from a slice of [`LexResult`].
-    pub fn new(tokens: &'a [LexResult]) -> Self {
-        Self {
-            inner: tokens.iter().flatten(),
+impl Program<Mod> {
+    /// Attempts to convert the [`Program<Mod>`] into a [`Program<ModModule>`].
+    ///
+    /// This method checks if the `syntax` field of the program is a [`Mod::Module`]. If it is, the
+    /// method returns [`Some(Program<ModModule>)`] with the contained module. Otherwise, it
+    /// returns [`None`].
+    ///
+    /// [`Some(Program<ModModule>)`]: Some
+    fn try_into_module(self) -> Option<Program<ModModule>> {
+        match self.syntax {
+            Mod::Module(module) => Some(Program {
+                syntax: module,
+                tokens: self.tokens,
+                errors: self.errors,
+                comment_ranges: self.comment_ranges,
+            }),
+            Mod::Expression(_) => None,
         }
     }

-    /// Return the next value without advancing the iterator.
-    pub fn peek(&mut self) -> Option<(TokenKind, TextRange)> {
-        self.clone().next()
+    /// Attempts to convert the [`Program<Mod>`] into a [`Program<ModExpression>`].
+    ///
+    /// This method checks if the `syntax` field of the program is a [`Mod::Expression`]. If it is,
+    /// the method returns [`Some(Program<ModExpression>)`] with the contained expression.
+    /// Otherwise, it returns [`None`].
+    ///
+    /// [`Some(Program<ModExpression>)`]: Some
+    fn try_into_expression(self) -> Option<Program<ModExpression>> {
+        match self.syntax {
+            Mod::Module(_) => None,
+            Mod::Expression(expression) => Some(Program {
+                syntax: expression,
+                tokens: self.tokens,
+                errors: self.errors,
+                comment_ranges: self.comment_ranges,
+            }),
+        }
     }
 }

-impl Iterator for TokenKindIter<'_> {
-    type Item = (TokenKind, TextRange);
+impl Program<ModModule> {
+    /// Returns the module body contained in this program as a [`Suite`].
+    pub fn suite(&self) -> &Suite {
+        &self.syntax.body
+    }

-    fn next(&mut self) -> Option<Self::Item> {
-        let &(ref tok, range) = self.inner.next()?;
-        Some((TokenKind::from_token(tok), range))
+    /// Consumes the [`Program`] and returns the module body as a [`Suite`].
+    pub fn into_suite(self) -> Suite {
+        self.syntax.body
     }
 }

-impl FusedIterator for TokenKindIter<'_> {}
+impl Program<ModExpression> {
+    /// Returns the expression contained in this program.
+    pub fn expr(&self) -> &Expr {
+        &self.syntax.body
+    }

-impl DoubleEndedIterator for TokenKindIter<'_> {
-    fn next_back(&mut self) -> Option<Self::Item> {
-        let &(ref tok, range) = self.inner.next_back()?;
-        Some((TokenKind::from_token(tok), range))
+    /// Consumes the [`Program`] and returns the parsed [`Expr`].
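+    ///
+    /// A minimal doctest-style sketch of the intended usage (the `1 + 2` snippet is
+    /// illustrative):
+    ///
+    /// ```
+    /// use ruff_python_parser::parse_expression;
+    ///
+    /// let program = parse_expression("1 + 2").unwrap();
+    /// let expr = program.into_expr();
+    /// ```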
+    pub fn into_expr(self) -> Expr {
+        *self.syntax.body
+    }
 }

-/// Collect tokens up to and including the first error.
-pub fn tokenize(contents: &str, mode: Mode) -> Tokens {
-    let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
-    for tok in lexer::lex(contents, mode) {
-        let is_err = tok.is_err();
-        tokens.push(tok);
-        if is_err {
-            break;
-        }
-    }
+/// Tokens represents a vector of lexed [`Token`].
+#[derive(Debug)]
+pub struct Tokens {
+    raw: Vec<Token>,

-    Tokens(tokens)
+    /// Index of the first [`TokenKind::Unknown`] token or the length of the token vector.
+    first_unknown_or_len: OnceCell<usize>,
 }

-/// Tokenizes all tokens.
-///
-/// It differs from [`tokenize`] in that it tokenizes all tokens and doesn't stop
-/// after the first `Err`.
-pub fn tokenize_all(contents: &str, mode: Mode) -> Vec<LexResult> {
-    let mut tokens = allocate_tokens_vec(contents);
-    for token in lexer::lex(contents, mode) {
-        tokens.push(token);
+impl Tokens {
+    pub(crate) fn new(tokens: Vec<Token>) -> Tokens {
+        Tokens {
+            raw: tokens,
+            first_unknown_or_len: OnceCell::new(),
+        }
     }
-    tokens
-}

-/// Allocates a [`Vec`] with an approximated capacity to fit all tokens
-/// of `contents`.
-///
-/// See [#9546](https://github.com/astral-sh/ruff/pull/9546) for a more detailed explanation.
-pub fn allocate_tokens_vec(contents: &str) -> Vec<LexResult> {
-    Vec::with_capacity(approximate_tokens_lower_bound(contents))
-}
+    /// Returns a slice of tokens up to (and excluding) the first [`TokenKind::Unknown`] token or
+    /// all the tokens if there is none.
+    pub fn up_to_first_unknown(&self) -> &[Token] {
+        let end = *self.first_unknown_or_len.get_or_init(|| {
+            self.raw
+                .iter()
+                .find_position(|token| token.kind() == TokenKind::Unknown)
+                .map(|(idx, _)| idx)
+                .unwrap_or_else(|| self.raw.len())
+        });
+        &self.raw[..end]
+    }

-/// Approximates the number of tokens when lexing `contents`.
-fn approximate_tokens_lower_bound(contents: &str) -> usize {
-    contents.len().saturating_mul(15) / 100
+    /// Returns a slice of the [`Token`] that are within the given `range`.
+    ///
+    /// The start and end position of the given range should correspond to the start position of
+    /// the first token and the end position of the last token in the returned slice.
+    ///
+    /// For example, considering the following tokens and their corresponding range:
+    ///
+    /// ```txt
+    /// Def        0..3
+    /// Name       4..7
+    /// Lpar       7..8
+    /// Rpar       8..9
+    /// Colon      9..10
+    /// Ellipsis   11..14
+    /// Newline    14..14
+    /// ```
+    ///
+    /// The range `4..10` would return a slice of `Name`, `Lpar`, `Rpar`, and `Colon` tokens. But,
+    /// if either the start or end position of the given range doesn't match any of the tokens
+    /// (like `5..10` or `4..12`), the returned slice will be empty.
+    pub fn tokens_in_range(&self, range: TextRange) -> &[Token] {
+        let Ok(start) = self.binary_search_by_key(&range.start(), Ranged::start) else {
+            return &[];
+        };
+        let Ok(end) = self[start..].binary_search_by_key(&range.end(), Ranged::end) else {
+            return &[];
+        };
+        &self[start..=start + end]
+    }
 }

-/// Parse a full Python program from its tokens.
-pub fn parse_program_tokens(
-    tokens: Tokens,
-    source: &str,
-    is_jupyter_notebook: bool,
-) -> anyhow::Result<Suite> {
-    let mode = if is_jupyter_notebook {
-        Mode::Ipython
-    } else {
-        Mode::Module
-    };
-    match parse_tokens(tokens.into_inner(), source, mode)?
 {
-        Mod::Module(m) => Ok(m.body),
-        Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
+impl Deref for Tokens {
+    type Target = [Token];
+
+    fn deref(&self) -> &Self::Target {
+        &self.raw
     }
 }

diff --git a/crates/ruff_python_parser/src/parser/mod.rs b/crates/ruff_python_parser/src/parser/mod.rs
index 8dc54ac9de6cc..a2e1ef687c054 100644
--- a/crates/ruff_python_parser/src/parser/mod.rs
+++ b/crates/ruff_python_parser/src/parser/mod.rs
@@ -2,15 +2,16 @@ use std::cmp::Ordering;

 use bitflags::bitflags;

-use ruff_python_ast as ast;
+use ruff_python_ast::{Mod, ModExpression, ModModule};
 use ruff_text_size::{Ranged, TextRange, TextSize};

-use crate::lexer::{Token, TokenValue};
+use crate::lexer::TokenValue;
 use crate::parser::expression::ExpressionContext;
 use crate::parser::progress::{ParserProgress, TokenId};
 use crate::token_set::TokenSet;
 use crate::token_source::{TokenSource, TokenSourceCheckpoint};
 use crate::{Mode, ParseError, ParseErrorType, TokenKind};
+use crate::{Program, Tokens};

 mod expression;
 mod helpers;
@@ -21,57 +22,6 @@ mod statement;
 #[cfg(test)]
 mod tests;

-/// Represents the parsed source code.
-///
-/// This includes the AST and all of the errors encountered during parsing.
-#[derive(Debug)]
-pub struct Program {
-    ast: ast::Mod,
-    tokens: Vec<Token>,
-    parse_errors: Vec<ParseError>,
-}
-
-impl Program {
-    /// Returns the parsed AST.
-    pub fn ast(&self) -> &ast::Mod {
-        &self.ast
-    }
-
-    /// Returns all the tokens for the program.
-    pub fn tokens(&self) -> &[Token] {
-        &self.tokens
-    }
-
-    /// Returns a list of syntax errors found during parsing.
-    pub fn errors(&self) -> &[ParseError] {
-        &self.parse_errors
-    }
-
-    /// Consumes the [`Program`] and returns the parsed AST.
-    pub fn into_ast(self) -> ast::Mod {
-        self.ast
-    }
-
-    /// Consumes the [`Program`] and returns a list of syntax errors found during parsing.
-    pub fn into_errors(self) -> Vec<ParseError> {
-        self.parse_errors
-    }
-
-    /// Returns `true` if the program is valid i.e., it has no syntax errors.
-    pub fn is_valid(&self) -> bool {
-        self.parse_errors.is_empty()
-    }
-
-    /// Parse the given Python source code using the specified [`Mode`].
-    pub fn parse(source: &str, mode: Mode) -> Program {
-        Parser::new(source, mode).parse_program()
-    }
-
-    pub fn parse_starts_at(source: &str, mode: Mode, start_offset: TextSize) -> Program {
-        Parser::new_starts_at(source, mode, start_offset).parse_program()
-    }
-}
-
 #[derive(Debug)]
 pub(crate) struct Parser<'src> {
     source: &'src str,
@@ -122,13 +72,13 @@ impl<'src> Parser<'src> {
     }

     /// Consumes the [`Parser`] and returns the parsed [`Program`].
-    pub(crate) fn parse_program(mut self) -> Program {
-        let ast = match self.mode {
-            Mode::Expression => ast::Mod::Expression(self.parse_single_expression()),
-            Mode::Module | Mode::Ipython => ast::Mod::Module(self.parse_module()),
+    pub(crate) fn parse(mut self) -> Program<Mod> {
+        let syntax = match self.mode {
+            Mode::Expression => Mod::Expression(self.parse_single_expression()),
+            Mode::Module | Mode::Ipython => Mod::Module(self.parse_module()),
         };

-        self.finish(ast)
+        self.finish(syntax)
     }

     /// Parses a single expression.
     ///
     /// After parsing a single expression, an error is reported and all remaining tokens are
     /// dropped by the parser.
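     ///
     /// For example, parsing `1 + 2 foo` in `Mode::Expression` yields the `1 + 2` expression
     /// and reports a syntax error for the unexpected `foo` token.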
-    fn parse_single_expression(&mut self) -> ast::ModExpression {
+    fn parse_single_expression(&mut self) -> ModExpression {
         let start = self.node_start();
         let parsed_expr = self.parse_expression_list(ExpressionContext::default());
@@ -165,7 +115,7 @@ impl<'src> Parser<'src> {

         self.bump(TokenKind::EndOfFile);

-        ast::ModExpression {
+        ModExpression {
             body: Box::new(parsed_expr.expr),
             range: self.node_range(start),
         }
@@ -174,7 +124,7 @@ impl<'src> Parser<'src> {
     /// Parses a Python module.
     ///
     /// This is to be used for [`Mode::Module`] and [`Mode::Ipython`].
-    fn parse_module(&mut self) -> ast::ModModule {
+    fn parse_module(&mut self) -> ModModule {
         let body = self.parse_list_into_vec(
             RecoveryContextKind::ModuleStatements,
             Parser::parse_statement,
@@ -182,13 +132,13 @@ impl<'src> Parser<'src> {

         self.bump(TokenKind::EndOfFile);

-        ast::ModModule {
+        ModModule {
             body,
             range: TextRange::new(self.start_offset, self.current_token_range().end()),
         }
     }

-    fn finish(self, ast: ast::Mod) -> Program {
+    fn finish(self, syntax: Mod) -> Program<Mod> {
         assert_eq!(
             self.current_token_kind(),
             TokenKind::EndOfFile,

         // TODO consider re-integrating lexical error handling into the parser?
         let parse_errors = self.errors;
-        let (tokens, lex_errors) = self.tokens.finish();
+        let (tokens, comment_ranges, lex_errors) = self.tokens.finish();

         // Fast path for when there are no lex errors.
         // There's no fast path for when there are no parse errors because a lex error
         // always results in a parse error.
         if lex_errors.is_empty() {
             return Program {
-                ast,
-                tokens,
-                parse_errors,
+                syntax,
+                tokens: Tokens::new(tokens),
+                comment_ranges,
+                errors: parse_errors,
             };
         }
@@ -235,9 +186,10 @@ impl<'src> Parser<'src> {
         merged.extend(lex_errors.map(ParseError::from));

         Program {
-            ast,
-            tokens,
-            parse_errors: merged,
+            syntax,
+            tokens: Tokens::new(tokens),
+            comment_ranges,
+            errors: merged,
         }
     }

diff --git a/crates/ruff_python_parser/src/parser/tests.rs b/crates/ruff_python_parser/src/parser/tests.rs
index ec23d01d277f5..8de198b8eb253 100644
--- a/crates/ruff_python_parser/src/parser/tests.rs
+++ b/crates/ruff_python_parser/src/parser/tests.rs
@@ -1,4 +1,4 @@
-use crate::{lex, parse, parse_expression, parse_suite, parse_tokens, Mode};
+use crate::{parse, parse_expression, parse_module, Mode};

 #[test]
 fn test_modes() {
@@ -45,23 +45,23 @@ fn test_expr_mode_valid_syntax() {
     let source = "first

";

-    let expr = parse_expression(source).unwrap();
+    let program = parse_expression(source).unwrap();

-    insta::assert_debug_snapshot!(expr);
+    insta::assert_debug_snapshot!(program.expr());
 }

 #[test]
 fn test_unicode_aliases() {
     // https://github.com/RustPython/RustPython/issues/4566
     let source = r#"x = "\N{BACKSPACE}another cool trick""#;
-    let parse_ast = parse_suite(source).unwrap();
+    let suite = parse_module(source).unwrap().into_suite();

-    insta::assert_debug_snapshot!(parse_ast);
+    insta::assert_debug_snapshot!(suite);
 }

 #[test]
 fn test_ipython_escape_commands() {
-    let parse_ast = parse(
+    let program = parse(
         r"
 # Normal Python code
 (
@@ -132,21 +132,5 @@ foo.bar[0].baz[2].egg??
 Mode::Ipython,
     )
     .unwrap();
-    insta::assert_debug_snapshot!(parse_ast);
-}
-
-#[test]
-fn test_ipython_escape_command_parse_error() {
-    let source = r"
-a = 1
-%timeit a == 1
-    "
-    .trim();
-    let lxr = lex(source, Mode::Ipython);
-    let parse_err = parse_tokens(lxr.collect(), source, Mode::Module).unwrap_err();
-    assert_eq!(
-        parse_err.to_string(),
-        "IPython escape commands are only allowed in `Mode::Ipython` at byte range 6..20"
-            .to_string()
-    );
+    insta::assert_debug_snapshot!(program.syntax());
 }

diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs
index bd206d5e46573..772034527a218 100644
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@@ -469,13 +469,19 @@ pub(crate) fn parse_fstring_literal_element(

 #[cfg(test)]
 mod tests {
+    use ruff_python_ast::Suite;
+
     use crate::lexer::LexicalErrorType;
-    use crate::{parse_suite, FStringErrorType, ParseErrorType, Suite};
+    use crate::{parse_module, FStringErrorType, ParseError, ParseErrorType, Program};

     const WINDOWS_EOL: &str = "\r\n";
     const MAC_EOL: &str = "\r";
     const UNIX_EOL: &str = "\n";

+    fn parse_suite(source: &str) -> Result<Suite, ParseError> {
+        parse_module(source).map(Program::into_suite)
+    }
+
     fn string_parser_escaped_eol(eol: &str) -> Suite {
         let source = format!(r"'text \{eol}more text'");
         parse_suite(&source).unwrap()
     }
@@ -483,73 +489,69 @@ mod tests {

 #[test]
 fn test_string_parser_escaped_unix_eol() {
-    let parse_ast = string_parser_escaped_eol(UNIX_EOL);
-    insta::assert_debug_snapshot!(parse_ast);
+    let suite = string_parser_escaped_eol(UNIX_EOL);
+    insta::assert_debug_snapshot!(suite);
 }

 #[test]
 fn test_string_parser_escaped_mac_eol() {
-    let parse_ast = string_parser_escaped_eol(MAC_EOL);
-    insta::assert_debug_snapshot!(parse_ast);
+    let suite = string_parser_escaped_eol(MAC_EOL);
+    insta::assert_debug_snapshot!(suite);
 }

 #[test]
 fn test_string_parser_escaped_windows_eol() {
-    let parse_ast = string_parser_escaped_eol(WINDOWS_EOL);
-    insta::assert_debug_snapshot!(parse_ast);
+    let suite = string_parser_escaped_eol(WINDOWS_EOL);
+    insta::assert_debug_snapshot!(suite);
 }

 #[test]
 fn test_parse_fstring() {
     let source = r#"f"{a}{ b }{{foo}}""#;
-    let parse_ast = parse_suite(source).unwrap();
-
-    insta::assert_debug_snapshot!(parse_ast);
+    let suite = parse_suite(source).unwrap();
+    insta::assert_debug_snapshot!(suite);
 }

 #[test]
 fn test_parse_fstring_nested_spec() {
     let source = r#"f"{foo:{spec}}""#;
-    let parse_ast = parse_suite(source).unwrap();
-
-    insta::assert_debug_snapshot!(parse_ast);
+    let suite = parse_suite(source).unwrap();
+    insta::assert_debug_snapshot!(suite);
 }

 #[test]
 fn test_parse_fstring_not_nested_spec() {
     let source = r#"f"{foo:spec}""#;
-    let parse_ast = parse_suite(source).unwrap();
-
-    insta::assert_debug_snapshot!(parse_ast);
+    let suite = parse_suite(source).unwrap();
+    insta::assert_debug_snapshot!(suite);
 }

 #[test]
 fn test_parse_empty_fstring() {
-    insta::assert_debug_snapshot!(parse_suite(r#"f"""#,).unwrap());
+    let source = r#"f"""#;
+    let suite = parse_suite(source).unwrap();
+    insta::assert_debug_snapshot!(suite);
 }

 #[test]
 fn test_fstring_parse_self_documenting_base() {
     let source = r#"f"{user=}""#;
-    let parse_ast = parse_suite(source).unwrap();
-
-    insta::assert_debug_snapshot!(parse_ast);
+    let suite = parse_suite(source).unwrap();
+    insta::assert_debug_snapshot!(suite);
 }

 #[test]
 fn test_fstring_parse_self_documenting_base_more() {
     let source = r#"f"mix {user=} with text and {second=}""#;
-    let parse_ast =
parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_fstring_parse_self_documenting_format() { let source = r#"f"{user=:>10}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } fn parse_fstring_error(source: &str) -> FStringErrorType { @@ -577,240 +579,236 @@ mod tests { // error appears after the unexpected `FStringMiddle` token, which is between the // `:` and the `{`. // assert_eq!(parse_fstring_error("f'{lambda x: {x}}'"), LambdaWithoutParentheses); - assert!(parse_suite(r#"f"{class}""#,).is_err()); + assert!(parse_suite(r#"f"{class}""#).is_err()); } #[test] fn test_parse_fstring_not_equals() { let source = r#"f"{1 != 2}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_fstring_equals() { let source = r#"f"{42 == 42}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_fstring_self_doc_prec_space() { let source = r#"f"{x =}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_fstring_self_doc_trailing_space() { let source = r#"f"{x= }""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_fstring_yield_expr() { let source = r#"f"{yield}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_string_concat() { let source = "'Hello ' 'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_u_string_concat_1() { let source = "'Hello ' u'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_u_string_concat_2() { let source = "u'Hello ' 'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_f_string_concat_1() { let source = "'Hello ' f'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_f_string_concat_2() { let source = "'Hello ' f'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_f_string_concat_3() { let source = "'Hello ' f'world{\"!\"}'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + 
let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_f_string_concat_4() { let source = "'Hello ' f'world{\"!\"}' 'again!'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_u_f_string_concat_1() { let source = "u'Hello ' f'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_u_f_string_concat_2() { let source = "u'Hello ' f'world' '!'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_string_triple_quotes_with_kind() { let source = "u'''Hello, world!'''"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_single_quoted_byte() { // single quote let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_double_quoted_byte() { // double quote let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_escape_char_in_byte_literal() { // backslash does not escape let source = r#"b"omkmok\Xaa""#; // spell-checker:ignore omkmok - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_raw_byte_literal_1() { let source = r"rb'\x1z'"; - let parse_ast = 
parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_raw_byte_literal_2() { let source = r"rb'\\'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_escape_octet() { let source = r"b'\43a\4\1234'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_fstring_escaped_newline() { let source = r#"f"\n{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_fstring_constant_range() { let source = r#"f"aaa{bbb}ccc{ddd}eee""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_fstring_unescaped_newline() { let source = r#"f""" {x}""""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_fstring_escaped_character() { let source = r#"f"\\{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_raw_fstring() { let source = r#"rf"{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_triple_quoted_raw_fstring() { let source = r#"rf"""{x}""""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_fstring_line_continuation() { let source = r#"rf"\ {x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_fstring_nested_string_spec() { let source = r#"f"{foo:{''}}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_parse_fstring_nested_concatenation_string_spec() { let source = r#"f"{foo:{'' ''}}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } /// #[test] fn test_dont_panic_on_8_in_octal_escape() { let source = r"bold = '\038[1m'"; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); + let suite = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(suite); } #[test] fn test_invalid_unicode_literal() { let source = r"'\x1ó34'"; let error = parse_suite(source).unwrap_err(); - insta::assert_debug_snapshot!(error); } @@ -818,7 +816,6 @@ mod tests { fn test_missing_unicode_lbrace_error() { let source = r"'\N '"; let error = parse_suite(source).unwrap_err(); - 
 insta::assert_debug_snapshot!(error);
 }

@@ -826,7 +823,6 @@ mod tests {
 fn test_missing_unicode_rbrace_error() {
     let source = r"'\N{SPACE'";
     let error = parse_suite(source).unwrap_err();
-
     insta::assert_debug_snapshot!(error);
 }

@@ -834,7 +830,6 @@ mod tests {
 fn test_invalid_unicode_name_error() {
     let source = r"'\N{INVALID}'";
     let error = parse_suite(source).unwrap_err();
-
     insta::assert_debug_snapshot!(error);
 }

@@ -842,7 +837,6 @@ mod tests {
 fn test_invalid_byte_literal_error() {
     let source = r"b'123a𝐁c'";
     let error = parse_suite(source).unwrap_err();
-
     insta::assert_debug_snapshot!(error);
 }

@@ -852,8 +846,8 @@ mod tests {
         #[test]
         fn $name() {
             let source = format!(r#""\N{{{0}}}""#, $alias);
-            let parse_ast = parse_suite(&source).unwrap();
-            insta::assert_debug_snapshot!(parse_ast);
+            let suite = parse_suite(&source).unwrap();
+            insta::assert_debug_snapshot!(suite);
         }
     )*
     }

diff --git a/crates/ruff_python_parser/src/token_source.rs b/crates/ruff_python_parser/src/token_source.rs
index e2828be891e4a..4ccebc8eb7992 100644
--- a/crates/ruff_python_parser/src/token_source.rs
+++ b/crates/ruff_python_parser/src/token_source.rs
@@ -1,4 +1,5 @@
-use ruff_text_size::{TextRange, TextSize};
+use ruff_python_trivia::CommentRanges;
+use ruff_text_size::{Ranged, TextRange, TextSize};

 use crate::lexer::{Lexer, LexerCheckpoint, LexicalError, Token, TokenValue};
 use crate::{Mode, TokenKind};
@@ -13,14 +14,19 @@ pub(crate) struct TokenSource<'src> {
     /// is finished consuming all the tokens. Note that unlike the emitted tokens, this vector
     /// holds both the trivia and non-trivia tokens.
     tokens: Vec<Token>,
+
+    /// A vector containing the range of all the comment tokens emitted by the lexer.
+    comments: Vec<TextRange>,
 }

 impl<'src> TokenSource<'src> {
     /// Create a new token source for the given lexer.
     pub(crate) fn new(lexer: Lexer<'src>) -> Self {
+        // TODO(dhruvmanila): Use `allocate_tokens_vec`
         TokenSource {
             lexer,
             tokens: vec![],
+            comments: vec![],
         }
     }

@@ -85,6 +91,9 @@ impl<'src> TokenSource<'src> {
         loop {
             let next = self.lexer.next_token();
             if next.is_trivia() {
+                if next.is_comment() {
+                    self.comments.push(next.range());
+                }
                 self.tokens.push(next);
                 continue;
             }
         }
     }

-    /// Returns the next non-trivia token without adding it to the token vector.
+    /// Returns the next non-trivia token without adding it to any vector.
     fn next_non_trivia_token(&mut self) -> TokenKind {
         loop {
             let next = self.lexer.next_token();
@@ -108,6 +117,7 @@ impl<'src> TokenSource<'src> {
         TokenSourceCheckpoint {
             lexer: self.lexer.checkpoint(),
             tokens_position: self.tokens.len(),
+            comments_position: self.comments.len(),
         }
     }

@@ -115,22 +125,35 @@ impl<'src> TokenSource<'src> {
     pub(crate) fn rewind(&mut self, checkpoint: TokenSourceCheckpoint<'src>) {
         self.lexer.rewind(checkpoint.lexer);
         self.tokens.truncate(checkpoint.tokens_position);
+        self.comments.truncate(checkpoint.comments_position);
     }

     /// Consumes the token source, returning the collected tokens and any errors encountered during
     /// lexing. The token collection includes both the trivia and non-trivia tokens.
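+    /// The returned tuple also includes the [`CommentRanges`] built up from the comment
+    /// tokens collected during lexing.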
-    pub(crate) fn finish(self) -> (Vec<Token>, Vec<LexicalError>) {
+    pub(crate) fn finish(self) -> (Vec<Token>, CommentRanges, Vec<LexicalError>) {
         assert_eq!(
             self.current_kind(),
             TokenKind::EndOfFile,
             "TokenSource was not fully consumed"
         );

-        (self.tokens, self.lexer.finish())
+        let comment_ranges = CommentRanges::new(self.comments);
+        (self.tokens, comment_ranges, self.lexer.finish())
     }
 }

 pub(crate) struct TokenSourceCheckpoint<'src> {
     lexer: LexerCheckpoint<'src>,
     tokens_position: usize,
+    comments_position: usize,
+}
+
+/// Allocates a [`Vec`] with an approximated capacity to fit all tokens
+/// of `contents`.
+///
+/// See [#9546](https://github.com/astral-sh/ruff/pull/9546) for a more detailed explanation.
+#[allow(dead_code)]
+fn allocate_tokens_vec(contents: &str) -> Vec<Token> {
+    let lower_bound = contents.len().saturating_mul(15) / 100;
+    Vec::with_capacity(lower_bound)
 }

diff --git a/crates/ruff_python_parser/src/typing.rs b/crates/ruff_python_parser/src/typing.rs
index c8d82304e90ca..02ebf3243c0b3 100644
--- a/crates/ruff_python_parser/src/typing.rs
+++ b/crates/ruff_python_parser/src/typing.rs
@@ -6,7 +6,7 @@ use ruff_python_ast::relocate::relocate_expr;
 use ruff_python_ast::{str, Expr};
 use ruff_text_size::{TextLen, TextRange};

-use crate::{parse_expression, parse_expression_starts_at};
+use crate::{parse_expression, parse_expression_range};

 #[derive(is_macro::Is, Copy, Clone, Debug)]
 pub enum AnnotationKind {
@@ -22,25 +22,30 @@ pub enum AnnotationKind {
     Complex,
 }

-/// Parse a type annotation from a string.
+/// Parses the value of a string literal node (`parsed_contents`) with `range` as a type
+/// annotation. The given `source` is the entire source code.
 pub fn parse_type_annotation(
-    value: &str,
+    parsed_contents: &str,
     range: TextRange,
     source: &str,
 ) -> Result<(Expr, AnnotationKind)> {
     let expression = &source[range];

-    if str::raw_contents(expression).is_some_and(|body| body == value) {
+    if str::raw_contents(expression).is_some_and(|raw_contents| raw_contents == parsed_contents) {
         // The annotation is considered "simple" if and only if the raw representation (e.g.,
         // `List[int]` within "List[int]") exactly matches the parsed representation. This
         // isn't the case, e.g., for implicit concatenations, or for annotations that contain
         // escaped quotes.
-        let leading_quote = str::leading_quote(expression).unwrap();
-        let expr = parse_expression_starts_at(value, range.start() + leading_quote.text_len())?;
+        let leading_quote_len = str::leading_quote(expression).unwrap().text_len();
+        let trailing_quote_len = str::trailing_quote(expression).unwrap().text_len();
+        let range = range
+            .add_start(leading_quote_len)
+            .sub_end(trailing_quote_len);
+        let expr = parse_expression_range(source, range)?.into_expr();
         Ok((expr, AnnotationKind::Simple))
     } else {
         // Otherwise, consider this a "complex" annotation.
- let mut expr = parse_expression(value)?; + let mut expr = parse_expression(parsed_contents)?.into_expr(); relocate_expr(&mut expr, range); Ok((expr, AnnotationKind::Complex)) } diff --git a/crates/ruff_python_parser/tests/fixtures.rs b/crates/ruff_python_parser/tests/fixtures.rs index 8e77242881825..5d52f94493545 100644 --- a/crates/ruff_python_parser/tests/fixtures.rs +++ b/crates/ruff_python_parser/tests/fixtures.rs @@ -8,7 +8,7 @@ use annotate_snippets::snippet::{AnnotationType, Slice, Snippet, SourceAnnotatio use ruff_python_ast::visitor::preorder::{walk_module, PreorderVisitor, TraversalSignal}; use ruff_python_ast::{AnyNodeRef, Mod}; -use ruff_python_parser::{Mode, ParseErrorType, Program}; +use ruff_python_parser::{parse_unchecked, Mode, ParseErrorType}; use ruff_source_file::{LineIndex, OneIndexed, SourceCode}; use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; @@ -36,7 +36,7 @@ fn inline_err() { /// Snapshots the AST. fn test_valid_syntax(input_path: &Path) { let source = fs::read_to_string(input_path).expect("Expected test file to exist"); - let program = Program::parse(&source, Mode::Module); + let program = parse_unchecked(&source, Mode::Module); if !program.is_valid() { let line_index = LineIndex::from_source_text(&source); @@ -60,11 +60,11 @@ fn test_valid_syntax(input_path: &Path) { panic!("{input_path:?}: {message}"); } - validate_ast(program.ast(), source.text_len(), input_path); + validate_ast(program.syntax(), source.text_len(), input_path); let mut output = String::new(); writeln!(&mut output, "## AST").unwrap(); - writeln!(&mut output, "\n```\n{:#?}\n```", program.ast()).unwrap(); + writeln!(&mut output, "\n```\n{:#?}\n```", program.syntax()).unwrap(); insta::with_settings!({ omit_expression => true, @@ -79,18 +79,18 @@ fn test_valid_syntax(input_path: &Path) { /// Snapshots the AST and the error messages. fn test_invalid_syntax(input_path: &Path) { let source = fs::read_to_string(input_path).expect("Expected test file to exist"); - let program = Program::parse(&source, Mode::Module); + let program = parse_unchecked(&source, Mode::Module); assert!( !program.is_valid(), "{input_path:?}: Expected parser to generate at least one syntax error for a program containing syntax errors." ); - validate_ast(program.ast(), source.text_len(), input_path); + validate_ast(program.syntax(), source.text_len(), input_path); let mut output = String::new(); writeln!(&mut output, "## AST").unwrap(); - writeln!(&mut output, "\n```\n{:#?}\n```", program.ast()).unwrap(); + writeln!(&mut output, "\n```\n{:#?}\n```", program.syntax()).unwrap(); writeln!(&mut output, "## Errors\n").unwrap(); @@ -129,9 +129,9 @@ fn parser_quick_test() { data[*x,] "; - let program = Program::parse(source, Mode::Module); + let program = parse_unchecked(source, Mode::Module); - println!("AST:\n----\n{:#?}", program.ast()); + println!("AST:\n----\n{:#?}", program.syntax()); if !program.is_valid() { println!("Errors:\n-------");
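
For reference, a minimal sketch of how the updated API is meant to be used end-to-end (the source snippets and the `main` wrapper are illustrative, not part of the diff):

```rs
use ruff_python_parser::{parse_module, parse_unchecked, Mode};

fn main() {
    // Fallible entry point: returns the first syntax error, if any.
    let program = parse_module("def foo():\n    return 42\n").unwrap();
    println!("{} top-level statements", program.suite().len());

    // Infallible entry point: always returns a `Program`, collecting all
    // syntax errors alongside the recovered AST.
    let program = parse_unchecked("def foo(:\n", Mode::Module);
    assert!(!program.is_valid());
    for error in program.errors() {
        println!("syntax error: {error}");
    }
}
```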