Skip to content

Commit

Permalink
Consider "gap" between tokens for range query
Browse files Browse the repository at this point in the history
  • Loading branch information
dhruvmanila committed May 30, 2024
1 parent 06d6feb commit 9d6a0d4
Showing 1 changed file with 200 additions and 25 deletions.
225 changes: 200 additions & 25 deletions crates/ruff_python_parser/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -388,48 +388,86 @@ impl Tokens {
&self.raw[..end]
}

/// Returns a slice of the [`Token`] that are within the given `range`.
/// Returns a slice of [`Token`] that are within the given `range`.
///
/// The start and end position of the given range should correspond to the start position of
/// the first token and the end position of the last token in the returned slice.
/// The start and end offset of the given range should be either:
/// 1. Token boundary
/// 2. Gap between the tokens
///
/// For example, considering the following tokens and their corresponding range:
///
/// ```txt
/// Def 0..3
/// Name 4..7
/// Lpar 7..8
/// Rpar 8..9
/// Colon 9..10
/// Ellipsis 11..14
/// Newline 14..14
/// ```
/// | Token | Range |
/// |---------------------|-----------|
/// | `Def` | `0..3` |
/// | `Name` | `4..7` |
/// | `Lpar` | `7..8` |
/// | `Rpar` | `8..9` |
/// | `Colon` | `9..10` |
/// | `Newline` | `10..11` |
/// | `Comment` | `15..24` |
/// | `NonLogicalNewline` | `24..25` |
/// | `Indent` | `25..29` |
/// | `Pass` | `29..33` |
///
/// The range `4..10` would return a slice of `Name`, `Lpar`, `Rpar`, and `Colon` tokens. But,
/// if either the start or end position of the given range doesn't match any of the tokens
/// (like `5..10` or `4..12`), the returned slice will be empty.
/// Here, for (1) a token boundary is considered either the start or end offset of any of the
/// above tokens. For (2), the gap would be any offset between the `Newline` and `Comment`
/// token which are 12, 13, and 14.
///
/// Examples:
/// 1) `4..10` would give `Name`, `Lpar`, `Rpar`, `Colon`
/// 2) `11..25` would give `Comment`, `NonLogicalNewline`
/// 3) `12..25` would give same as (2) and offset 12 is in the "gap"
/// 4) `9..12` would give `Colon`, `Newline` and offset 12 is in the "gap"
/// 5) `18..27` would panic because both the start and end offset is within a token
///
/// ## Note
///
/// The returned slice can contain the [`TokenKind::Unknown`] token if there was a lexical
/// error encountered within the given range.
///
/// # Panics
///
/// If either the start or end offset of the given range is within a token range.
pub fn tokens_in_range(&self, range: TextRange) -> &[Token] {
let Ok(start) = self.binary_search_by_key(&range.start(), Ranged::start) else {
return &[];
};
let Ok(end) = self[start..].binary_search_by_key(&range.end(), Ranged::end) else {
return &[];
};
&self[start..=start + end]
let tokens_after_start = self.after(range.start());
let end = tokens_after_start.partition_point(|token| token.end() <= range.end());
let result = &tokens_after_start[..end];

if result.last().is_some_and(|last| last.end() != range.end()) {
if tokens_after_start
.get(end)
.is_some_and(|next| next.range().contains(range.end()))
{
panic!("Offset cannot be in between a token")
}
}

result
}

/// Returns a slice of tokens after the given [`TextSize`] offset.
///
/// If the given offset lies within a token, the returned slice will start from the token after
/// that. In other words, the returned slice will not include the token containing the offset.
/// If the given offset is between two tokens, the returned slice will start from the following
/// token. In other words, if the offset is between the end of previous token and start of next
/// token, the returned slice will start from the next token.
///
/// # Panics
///
/// If the given offset is within a token range.
pub fn after(&self, offset: TextSize) -> &[Token] {
let start = self.partition_point(|token| token.start() < offset);
&self[start..]
let result = &self[start..];
// If the offset isn't the start of the first token in slice...
if result.first().is_some_and(|token| token.start() != offset) {
// Panic if it's within the range of the previous token.
if self
.get(start.saturating_sub(1))
.is_some_and(|prev| prev.range().contains(offset))
{
panic!("Offset cannot be in between a token")
}
}
result
}
}

Expand Down Expand Up @@ -506,3 +544,140 @@ impl std::fmt::Display for ModeParseError {
write!(f, r#"mode must be "exec", "eval", "ipython", or "single""#)
}
}

#[cfg(test)]
mod tests {
use std::ops::Range;

use crate::lexer::TokenFlags;

use super::*;

/// Test case containing a "gap" between two tokens.
///
/// Code: <https://play.ruff.rs/a3658340-6df8-42c5-be80-178744bf1193>
const TEST_CASE_WITH_GAP: [(TokenKind, Range<u32>); 10] = [
(TokenKind::Def, 0..3),
(TokenKind::Name, 4..7),
(TokenKind::Lpar, 7..8),
(TokenKind::Rpar, 8..9),
(TokenKind::Colon, 9..10),
(TokenKind::Newline, 10..11),
// Gap ||..||
(TokenKind::Comment, 15..24),
(TokenKind::NonLogicalNewline, 24..25),
(TokenKind::Indent, 25..29),
(TokenKind::Pass, 29..33),
// No newline at the end to keep the token set full of unique tokens
];

/// Test case containing [`TokenKind::Unknown`] token.
///
/// Code: <https://play.ruff.rs/ea722760-9bf5-4d00-be9f-dc441793f88e>
const TEST_CASE_WITH_UNKNOWN: [(TokenKind, Range<u32>); 5] = [
(TokenKind::Name, 0..1),
(TokenKind::Equal, 2..3),
(TokenKind::Unknown, 4..11),
(TokenKind::Plus, 11..12),
(TokenKind::Int, 13..14),
// No newline at the end to keep the token set full of unique tokens
];

/// Helper function to create [`Tokens`] from an iterator of (kind, range).
fn new_tokens(tokens: impl Iterator<Item = (TokenKind, Range<u32>)>) -> Tokens {
Tokens::new(
tokens
.map(|(kind, range)| {
Token::new(
kind,
TextRange::new(TextSize::new(range.start), TextSize::new(range.end)),
TokenFlags::empty(),
)
})
.collect(),
)
}

#[test]
fn test_tokens_up_to_first_unknown_empty() {
let tokens = Tokens::new(vec![]);
assert_eq!(tokens.up_to_first_unknown(), &[]);
}

#[test]
fn test_tokens_up_to_first_unknown() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
let up_to_first_unknown = tokens.up_to_first_unknown();
assert_eq!(up_to_first_unknown.len(), tokens.len());

let tokens = new_tokens(TEST_CASE_WITH_UNKNOWN.into_iter());
let up_to_first_unknown = tokens.up_to_first_unknown();
assert_eq!(up_to_first_unknown.len(), 2);
}

#[test]
fn test_tokens_after() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());

let after = tokens.after(TextSize::new(8));
assert_eq!(after.len(), 7);
assert_eq!(after.first().unwrap().kind(), TokenKind::Rpar);

let after = tokens.after(TextSize::new(11));
assert_eq!(after.len(), 4);
assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);

let after = tokens.after(TextSize::new(13));
assert_eq!(after.len(), 4);
assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);

let after = tokens.after(TextSize::new(33));
assert_eq!(after.len(), 0);
}

#[test]
#[should_panic(expected = "Offset cannot be in between a token")]
fn test_tokens_after_panic() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
tokens.after(TextSize::new(5));
}

#[test]
fn test_tokens_in_range() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());

let in_range = tokens.tokens_in_range(TextRange::new(4.into(), 10.into()));
assert_eq!(in_range.len(), 4);
assert_eq!(in_range.first().unwrap().kind(), TokenKind::Name);
assert_eq!(in_range.last().unwrap().kind(), TokenKind::Colon);

let in_range = tokens.tokens_in_range(TextRange::new(11.into(), 29.into()));
assert_eq!(in_range.len(), 3);
assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);

let in_range = tokens.tokens_in_range(TextRange::new(9.into(), 13.into()));
assert_eq!(in_range.len(), 2);
assert_eq!(in_range.first().unwrap().kind(), TokenKind::Colon);
assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);

let in_range = tokens.tokens_in_range(TextRange::new(13.into(), 29.into()));
assert_eq!(in_range.len(), 3);
assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
}

#[test]
#[should_panic(expected = "Offset cannot be in between a token")]
fn test_tokens_in_range_start_panic() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
tokens.tokens_in_range(TextRange::new(5.into(), 10.into()));
}

#[test]
#[should_panic(expected = "Offset cannot be in between a token")]
fn test_tokens_in_range_end_panic() {
let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
tokens.tokens_in_range(TextRange::new(0.into(), 6.into()));
}
}

0 comments on commit 9d6a0d4

Please sign in to comment.