Consider "gap" between tokens for range query (#11610)
## Summary

This PR updates the methods on the `Tokens` struct to consider the "gap"
between tokens. Additionally, it adds a constraint to these methods: the
given offsets must not fall within a token's range, otherwise the methods
panic. See the sketch below for what counts as a valid offset.
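
To make the boundary rule concrete, here is a minimal, self-contained sketch. The token layout is the one used in the doc comment in the diff below; `is_valid_offset` is a hypothetical helper written for illustration, not an API added by this PR:

```rust
/// Token ranges from the `tokens_in_range` doc comment below.
const TOKEN_RANGES: &[(u32, u32)] = &[
    (0, 3),   // Def
    (4, 7),   // Name
    (7, 8),   // Lpar
    (8, 9),   // Rpar
    (9, 10),  // Colon
    (10, 11), // Newline
    (15, 24), // Comment
];

/// An offset is valid if it sits on a token boundary or in a gap between
/// tokens; the updated methods panic if it falls strictly inside a token.
fn is_valid_offset(offset: u32) -> bool {
    !TOKEN_RANGES
        .iter()
        .any(|&(start, end)| start < offset && offset < end)
}

fn main() {
    assert!(is_valid_offset(4)); // start boundary of `Name`
    assert!(is_valid_offset(10)); // end boundary of `Colon`
    assert!(is_valid_offset(13)); // in the gap between `Newline` and `Comment`
    assert!(!is_valid_offset(5)); // strictly inside `Name` (4..7): would panic
    assert!(!is_valid_offset(18)); // strictly inside `Comment` (15..24): would panic
}
```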

## Test Plan

Add unit test cases.
dhruvmanila authored May 30, 2024
1 parent 4da0675 commit d7b180d
Showing 2 changed files with 258 additions and 28 deletions.
4 changes: 2 additions & 2 deletions crates/ruff_python_parser/src/lexer.rs
@@ -1386,7 +1386,7 @@ impl<'src> Lexer<'src> {
}

bitflags! {
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct TokenFlags: u8 {
/// The token is a string with double quotes (`"`).
const DOUBLE_QUOTES = 1 << 0;
@@ -1468,7 +1468,7 @@ impl TokenFlags {
}
}

-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token {
/// The kind of the token.
kind: TokenKind,
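The new `PartialEq`/`Eq` derives are presumably what let the tests added further down compare tokens and token slices with `assert_eq!`. A standalone micro-example of what the derive enables, using a made-up `Tok` type rather than the crate's `Token`:

```rust
// Deriving `PartialEq`/`Eq` gives structural equality, which `assert_eq!` needs.
#[derive(Debug, PartialEq, Eq)]
struct Tok {
    kind: u8,
    start: u32,
    end: u32,
}

fn main() {
    let a = Tok { kind: 1, start: 0, end: 3 };
    let b = Tok { kind: 1, start: 0, end: 3 };
    assert_eq!(a, b); // compiles only because `PartialEq` is derived
    assert_eq!(&[a][..], &[b][..]); // slices of `Tok` compare element-wise
}
```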
282 changes: 256 additions & 26 deletions crates/ruff_python_parser/src/lib.rs
@@ -388,48 +388,107 @@ impl Tokens {
&self.raw[..end]
}

-/// Returns a slice of the [`Token`] that are within the given `range`.
+/// Returns a slice of [`Token`] that are within the given `range`.
///
-/// The start and end position of the given range should correspond to the start position of
-/// the first token and the end position of the last token in the returned slice.
+/// The start and end offset of the given range should be either:
+/// 1. Token boundary
+/// 2. Gap between the tokens
///
/// For example, considering the following tokens and their corresponding range:
///
-/// ```txt
-/// Def 0..3
-/// Name 4..7
-/// Lpar 7..8
-/// Rpar 8..9
-/// Colon 9..10
-/// Ellipsis 11..14
-/// Newline 14..14
-/// ```
+/// | Token               | Range     |
+/// |---------------------|-----------|
+/// | `Def`               | `0..3`    |
+/// | `Name`              | `4..7`    |
+/// | `Lpar`              | `7..8`    |
+/// | `Rpar`              | `8..9`    |
+/// | `Colon`             | `9..10`   |
+/// | `Newline`           | `10..11`  |
+/// | `Comment`           | `15..24`  |
+/// | `NonLogicalNewline` | `24..25`  |
+/// | `Indent`            | `25..29`  |
+/// | `Pass`              | `29..33`  |
///
-/// The range `4..10` would return a slice of `Name`, `Lpar`, `Rpar`, and `Colon` tokens. But,
-/// if either the start or end position of the given range doesn't match any of the tokens
-/// (like `5..10` or `4..12`), the returned slice will be empty.
+/// Here, for (1) a token boundary is either the start or end offset of any of the
+/// above tokens. For (2), the gap is any offset between the `Newline` and `Comment`
+/// tokens, i.e., 12, 13, and 14.
///
+/// Examples:
+/// 1) `4..10` would give `Name`, `Lpar`, `Rpar`, `Colon`
+/// 2) `11..25` would give `Comment`, `NonLogicalNewline`
+/// 3) `12..25` would give the same as (2); offset 12 is in the "gap"
+/// 4) `9..12` would give `Colon`, `Newline`; offset 12 is in the "gap"
+/// 5) `18..27` would panic because both the start and end offsets are within a token
///
/// ## Note
///
/// The returned slice can contain the [`TokenKind::Unknown`] token if there was a lexical
/// error encountered within the given range.
+///
+/// # Panics
+///
+/// If either the start or end offset of the given range is within a token range.
pub fn tokens_in_range(&self, range: TextRange) -> &[Token] {
-let Ok(start) = self.binary_search_by_key(&range.start(), Ranged::start) else {
-return &[];
-};
-let Ok(end) = self[start..].binary_search_by_key(&range.end(), Ranged::end) else {
-return &[];
-};
-&self[start..=start + end]
+let tokens_after_start = self.after(range.start());

+match tokens_after_start.binary_search_by_key(&range.end(), Ranged::end) {
+Ok(idx) => {
+// If we found the token with the end offset, that token should be included in the
+// returned slice.
+&tokens_after_start[..=idx]
+}
+Err(idx) => {
+if let Some(token) = tokens_after_start.get(idx) {
+// If the end offset is equal to this token's start offset, then it's at a token
+// boundary, which is valid. If it's less than the start offset, then it's in the
+// gap between the tokens, which is valid as well.
+assert!(
+range.end() <= token.start(),
+"End offset {:?} is inside a token range {:?}",
+range.end(),
+token.range()
+);
+}
+
+// This index is where the token with the end offset _could_ be, so that token
+// should be excluded from the returned slice.
+&tokens_after_start[..idx]
+}
+}
}

/// Returns a slice of tokens after the given [`TextSize`] offset.
///
-/// If the given offset lies within a token, the returned slice will start from the token after
-/// that. In other words, the returned slice will not include the token containing the offset.
+/// If the given offset is between two tokens, the returned slice will start from the following
+/// token. In other words, if the offset is between the end of the previous token and the start
+/// of the next token, the returned slice will start from the next token.
+///
+/// # Panics
+///
+/// If the given offset is inside a token range.
pub fn after(&self, offset: TextSize) -> &[Token] {
-let start = self.partition_point(|token| token.start() < offset);
-&self[start..]
+match self.binary_search_by(|token| token.start().cmp(&offset)) {
+Ok(idx) => &self[idx..],
+Err(idx) => {
+// We can't use `saturating_sub` here because a file could contain a BOM header, in
+// which case the first token starts at offset 3 for UTF-8 encoded file content.
+if idx > 0 {
+if let Some(prev) = self.get(idx - 1) {
+// If the offset is equal to the previous token's end offset, then it's at a token
+// boundary, which is valid. If it's greater than the end offset, then it's in the
+// gap between the tokens, which is valid as well.
+assert!(
+offset >= prev.end(),
+"Offset {:?} is inside a token range {:?}",
+offset,
+prev.range()
+);
+}
+}

+&self[idx..]
+}
+}
}
}
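
Both methods boil down to a standard-library binary search plus a boundary assertion. The following self-contained sketch mirrors that logic over plain `(start, end)` pairs; all names are illustrative and none of this is the crate's actual API:

```rust
/// Same token layout as the doc comment above.
const TOKENS: &[(u32, u32)] = &[
    (0, 3),   // Def
    (4, 7),   // Name
    (7, 8),   // Lpar
    (8, 9),   // Rpar
    (9, 10),  // Colon
    (10, 11), // Newline
    (15, 24), // Comment
];

/// Mirrors `Tokens::after`: binary-search on the start offsets, and when the
/// offset is not an exact token start, assert it doesn't land inside the
/// previous token (i.e., it's at the previous token's end or in a gap).
fn after(offset: u32) -> &'static [(u32, u32)] {
    match TOKENS.binary_search_by(|&(start, _)| start.cmp(&offset)) {
        Ok(idx) => &TOKENS[idx..],
        Err(idx) => {
            if idx > 0 {
                let (_, prev_end) = TOKENS[idx - 1];
                assert!(offset >= prev_end, "offset {offset} is inside a token");
            }
            &TOKENS[idx..]
        }
    }
}

/// Mirrors `Tokens::tokens_in_range`: narrow with `after`, then binary-search
/// on the end offsets, excluding any token that the end offset would split.
fn tokens_in_range(start: u32, end: u32) -> &'static [(u32, u32)] {
    let tail = after(start);
    match tail.binary_search_by(|&(_, tok_end)| tok_end.cmp(&end)) {
        Ok(idx) => &tail[..=idx],
        Err(idx) => {
            if let Some(&(tok_start, _)) = tail.get(idx) {
                assert!(end <= tok_start, "end offset {end} is inside a token");
            }
            &tail[..idx]
        }
    }
}

fn main() {
    assert_eq!(tokens_in_range(4, 10).len(), 4); // Name, Lpar, Rpar, Colon
    assert_eq!(tokens_in_range(12, 24).len(), 1); // Comment; 12 is in the gap
    assert_eq!(after(13).len(), 1); // Comment onward; 13 is in the gap
}
```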

@@ -506,3 +565,174 @@ impl std::fmt::Display for ModeParseError {
write!(f, r#"mode must be "exec", "eval", "ipython", or "single""#)
}
}

#[cfg(test)]
mod tests {
use std::ops::Range;

use crate::lexer::TokenFlags;

use super::*;

/// Test case containing a "gap" between two tokens.
///
/// Code: <https://play.ruff.rs/a3658340-6df8-42c5-be80-178744bf1193>
const TEST_CASE_WITH_GAP: [(TokenKind, Range<u32>); 10] = [
(TokenKind::Def, 0..3),
(TokenKind::Name, 4..7),
(TokenKind::Lpar, 7..8),
(TokenKind::Rpar, 8..9),
(TokenKind::Colon, 9..10),
(TokenKind::Newline, 10..11),
// Gap ||..||
(TokenKind::Comment, 15..24),
(TokenKind::NonLogicalNewline, 24..25),
(TokenKind::Indent, 25..29),
(TokenKind::Pass, 29..33),
// No newline at the end to keep the token set full of unique tokens
];

/// Test case containing [`TokenKind::Unknown`] token.
///
/// Code: <https://play.ruff.rs/ea722760-9bf5-4d00-be9f-dc441793f88e>
const TEST_CASE_WITH_UNKNOWN: [(TokenKind, Range<u32>); 5] = [
(TokenKind::Name, 0..1),
(TokenKind::Equal, 2..3),
(TokenKind::Unknown, 4..11),
(TokenKind::Plus, 11..12),
(TokenKind::Int, 13..14),
// No newline at the end to keep the token set full of unique tokens
];

/// Helper function to create [`Tokens`] from an iterator of (kind, range).
fn new_tokens(tokens: impl Iterator<Item = (TokenKind, Range<u32>)>) -> Tokens {
Tokens::new(
tokens
.map(|(kind, range)| {
Token::new(
kind,
TextRange::new(TextSize::new(range.start), TextSize::new(range.end)),
TokenFlags::empty(),
)
})
.collect(),
)
}

#[test]
fn tokens_up_to_first_unknown_empty() {
let tokens = Tokens::new(vec![]);
assert_eq!(tokens.up_to_first_unknown(), &[]);
}

#[test]
fn tokens_up_to_first_unknown_noop() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let up_to_first_unknown = tokens.up_to_first_unknown();
assert_eq!(up_to_first_unknown.len(), tokens.len());
}

#[test]
fn tokens_up_to_first_unknown() {
let tokens = new_tokens(TEST_CASE_WITH_UNKNOWN.into_iter());
let up_to_first_unknown = tokens.up_to_first_unknown();
assert_eq!(up_to_first_unknown.len(), 2);
}

#[test]
fn tokens_after_offset_at_token_start() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let after = tokens.after(TextSize::new(8));
assert_eq!(after.len(), 7);
assert_eq!(after.first().unwrap().kind(), TokenKind::Rpar);
}

#[test]
fn tokens_after_offset_at_token_end() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let after = tokens.after(TextSize::new(11));
assert_eq!(after.len(), 4);
assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
}

#[test]
fn tokens_after_offset_between_tokens() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let after = tokens.after(TextSize::new(13));
assert_eq!(after.len(), 4);
assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
}

#[test]
fn tokens_after_offset_at_last_token_end() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let after = tokens.after(TextSize::new(33));
assert_eq!(after.len(), 0);
}

#[test]
#[should_panic(expected = "Offset 5 is inside a token range 4..7")]
fn tokens_after_offset_inside_token() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
tokens.after(TextSize::new(5));
}

#[test]
fn tokens_in_range_at_token_offset() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let in_range = tokens.tokens_in_range(TextRange::new(4.into(), 10.into()));
assert_eq!(in_range.len(), 4);
assert_eq!(in_range.first().unwrap().kind(), TokenKind::Name);
assert_eq!(in_range.last().unwrap().kind(), TokenKind::Colon);
}

#[test]
fn tokens_in_range_start_offset_at_token_end() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let in_range = tokens.tokens_in_range(TextRange::new(11.into(), 29.into()));
assert_eq!(in_range.len(), 3);
assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
}

#[test]
fn tokens_in_range_end_offset_at_token_start() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let in_range = tokens.tokens_in_range(TextRange::new(8.into(), 15.into()));
assert_eq!(in_range.len(), 3);
assert_eq!(in_range.first().unwrap().kind(), TokenKind::Rpar);
assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
}

#[test]
fn tokens_in_range_start_offset_between_tokens() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let in_range = tokens.tokens_in_range(TextRange::new(13.into(), 29.into()));
assert_eq!(in_range.len(), 3);
assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
}

#[test]
fn tokens_in_range_end_offset_between_tokens() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let in_range = tokens.tokens_in_range(TextRange::new(9.into(), 13.into()));
assert_eq!(in_range.len(), 2);
assert_eq!(in_range.first().unwrap().kind(), TokenKind::Colon);
assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
}

#[test]
#[should_panic(expected = "Offset 5 is inside a token range 4..7")]
fn tokens_in_range_start_offset_inside_token() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
tokens.tokens_in_range(TextRange::new(5.into(), 10.into()));
}

#[test]
#[should_panic(expected = "End offset 6 is inside a token range 4..7")]
fn tokens_in_range_end_offset_inside_token() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
tokens.tokens_in_range(TextRange::new(0.into(), 6.into()));
}
}
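
One edge case that the `after` implementation calls out, a UTF-8 BOM pushing the first token's start to offset 3, is not covered by the tests above. A hedged sketch of such a test in the same style (`tokens_after_offset_before_first_token` is a hypothetical addition, not part of this commit):

```rust
#[test]
fn tokens_after_offset_before_first_token() {
    // Hypothetical layout: a UTF-8 BOM occupies offsets 0..3, so the first
    // token starts at offset 3. `after(0)` must not panic: there is no
    // previous token to check the offset against.
    let tokens = new_tokens([(TokenKind::Pass, 3..7)].into_iter());
    let after = tokens.after(TextSize::new(0));
    assert_eq!(after.len(), 1);
}
```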
