Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

libsyntax: be more accepting of whitespace in lexer #29734

Merged
merged 3 commits into from
Mar 8, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mk/crates.mk
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ DEPS_serialize := std log
DEPS_term := std log
DEPS_test := std getopts serialize rbml term native:rust_test_helpers

DEPS_syntax := std term serialize log arena libc rustc_bitflags
DEPS_syntax := std term serialize log arena libc rustc_bitflags rustc_unicode
DEPS_syntax_ext := syntax fmt_macros

DEPS_rustc := syntax fmt_macros flate arena serialize getopts rbml rustc_front\
Expand Down
4 changes: 2 additions & 2 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
derived = load_properties("DerivedCoreProperties.txt", want_derived)
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
["White_Space", "Join_Control", "Noncharacter_Code_Point", "Pattern_White_Space"])
norm_props = load_properties("DerivedNormalizationProps.txt",
["Full_Composition_Exclusion"])

Expand All @@ -408,7 +408,7 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
# category tables
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
("derived_property", derived, want_derived), \
("property", props, ["White_Space"]):
("property", props, ["White_Space", "Pattern_White_Space"]):
emit_property_module(rf, name, cat, pfuns)

# normalizations and conversions module
Expand Down
5 changes: 5 additions & 0 deletions src/librustc_unicode/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,8 @@ pub mod str {
pub mod derived_property {
pub use tables::derived_property::{Cased, Case_Ignorable};
}

// For use in libsyntax
pub mod property {
pub use tables::property::Pattern_White_Space;
}
9 changes: 9 additions & 0 deletions src/librustc_unicode/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,15 @@ pub mod derived_property {
}

pub mod property {
pub const Pattern_White_Space_table: &'static [(char, char)] = &[
('\u{9}', '\u{d}'), ('\u{20}', '\u{20}'), ('\u{85}', '\u{85}'), ('\u{200e}', '\u{200f}'),
('\u{2028}', '\u{2029}')
];

pub fn Pattern_White_Space(c: char) -> bool {
super::bsearch_range_table(c, Pattern_White_Space_table)
}

pub const White_Space_table: &'static [(char, char)] = &[
('\u{9}', '\u{d}'), ('\u{20}', '\u{20}'), ('\u{85}', '\u{85}'), ('\u{a0}', '\u{a0}'),
('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), ('\u{2028}', '\u{2029}'), ('\u{202f}',
Expand Down
1 change: 1 addition & 0 deletions src/libsyntax/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ extern crate term;
extern crate libc;
#[macro_use] extern crate log;
#[macro_use] #[no_link] extern crate rustc_bitflags;
extern crate rustc_unicode;

extern crate serialize as rustc_serialize; // used by deriving

Expand Down
4 changes: 2 additions & 2 deletions src/libsyntax/parse/lexer/comments.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos};
use errors;
use parse::lexer::is_block_doc_comment;
use parse::lexer::{StringReader, TokenAndSpan};
use parse::lexer::{is_whitespace, Reader};
use parse::lexer::{is_pattern_whitespace, Reader};
use parse::lexer;
use print::pprust;
use str::char_at;
Expand Down Expand Up @@ -153,7 +153,7 @@ fn push_blank_line_comment(rdr: &StringReader, comments: &mut Vec<Comment>) {
}

fn consume_whitespace_counting_blank_lines(rdr: &mut StringReader, comments: &mut Vec<Comment>) {
while is_whitespace(rdr.curr) && !rdr.is_eof() {
while is_pattern_whitespace(rdr.curr) && !rdr.is_eof() {
if rdr.col == CharPos(0) && rdr.curr_is('\n') {
push_blank_line_comment(rdr, &mut *comments);
}
Expand Down
20 changes: 10 additions & 10 deletions src/libsyntax/parse/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use ext::tt::transcribe::tt_next_token;
use parse::token::str_to_ident;
use parse::token;
use str::char_at;
use rustc_unicode::property::Pattern_White_Space;

use std::borrow::Cow;
use std::char;
Expand Down Expand Up @@ -546,10 +547,10 @@ impl<'a> StringReader<'a> {
let c = self.scan_comment();
debug!("scanning a comment {:?}", c);
c
}
c if is_whitespace(Some(c)) => {
},
c if is_pattern_whitespace(Some(c)) => {
let start_bpos = self.last_pos;
while is_whitespace(self.curr) {
while is_pattern_whitespace(self.curr) {
self.bump();
}
let c = Some(TokenAndSpan {
Expand Down Expand Up @@ -1435,7 +1436,7 @@ impl<'a> StringReader<'a> {
}

fn consume_whitespace(&mut self) {
while is_whitespace(self.curr) && !self.is_eof() {
while is_pattern_whitespace(self.curr) && !self.is_eof() {
self.bump();
}
}
Expand All @@ -1460,7 +1461,7 @@ impl<'a> StringReader<'a> {
}

fn consume_non_eol_whitespace(&mut self) {
while is_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
while is_pattern_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
self.bump();
}
}
Expand Down Expand Up @@ -1591,11 +1592,10 @@ impl<'a> StringReader<'a> {
}
}

pub fn is_whitespace(c: Option<char>) -> bool {
match c.unwrap_or('\x00') { // None can be null for now... it's not whitespace
' ' | '\n' | '\t' | '\r' => true,
_ => false,
}
// This tests the character for the Unicode property 'PATTERN_WHITE_SPACE', which
// is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
pub fn is_pattern_whitespace(c: Option<char>) -> bool {
c.map_or(false, Pattern_White_Space)
}

fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
Expand Down
108 changes: 59 additions & 49 deletions src/libsyntax/util/parser_testing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@

use ast;
use parse::{ParseSess,PResult,filemap_to_tts};
use parse::new_parser_from_source_str;
use parse::{lexer, new_parser_from_source_str};
use parse::parser::Parser;
use parse::token;
use ptr::P;
use str::char_at;
use std::iter::Peekable;

/// Map a string to tts, using a made-up filename:
pub fn string_to_tts(source_str: String) -> Vec<ast::TokenTree> {
Expand Down Expand Up @@ -87,69 +87,62 @@ pub fn strs_to_idents(ids: Vec<&str> ) -> Vec<ast::Ident> {

/// Does the given string match the pattern? whitespace in the first string
/// may be deleted or replaced with other whitespace to match the pattern.
/// this function is Unicode-ignorant; fortunately, the careful design of
/// UTF-8 mitigates this ignorance. In particular, this function only collapses
/// sequences of \n, \r, ' ', and \t, but it should otherwise tolerate Unicode
/// chars. Unsurprisingly, it doesn't do NKF-normalization(?).
/// This function is relatively Unicode-ignorant; fortunately, the careful design
/// of UTF-8 mitigates this ignorance. It doesn't do any Unicode normalization (e.g. NFKC).
pub fn matches_codepattern(a : &str, b : &str) -> bool {
let mut idx_a = 0;
let mut idx_b = 0;
let mut a_iter = a.chars().peekable();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably should leave the rest of this comment in. The function is more unicode-aware now, but presumably still doesn't do any normalisation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/// This function is relatively Unicode-ignorant; fortunately, the careful design
/// of UTF-8 mitigates this ignorance. It doesn't do NKF-normalization(?).

Would be ok for you? Or just the normalization sentence?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That looks fine to me.

let mut b_iter = b.chars().peekable();

loop {
if idx_a == a.len() && idx_b == b.len() {
return true;
}
else if idx_a == a.len() {return false;}
else if idx_b == b.len() {
// maybe the stuff left in a is all ws?
if is_whitespace(char_at(a, idx_a)) {
return scan_for_non_ws_or_end(a,idx_a) == a.len();
} else {
return false;
let (a, b) = match (a_iter.peek(), b_iter.peek()) {
(None, None) => return true,
(None, _) => return false,
(Some(&a), None) => {
if is_pattern_whitespace(a) {
break // trailing whitespace check is out of loop for borrowck
} else {
return false
}
}
}
// ws in both given and pattern:
else if is_whitespace(char_at(a, idx_a))
&& is_whitespace(char_at(b, idx_b)) {
idx_a = scan_for_non_ws_or_end(a,idx_a);
idx_b = scan_for_non_ws_or_end(b,idx_b);
}
// ws in given only:
else if is_whitespace(char_at(a, idx_a)) {
idx_a = scan_for_non_ws_or_end(a,idx_a);
}
// *don't* silently eat ws in expected only.
else if char_at(a, idx_a) == char_at(b, idx_b) {
idx_a += 1;
idx_b += 1;
}
else {
return false;
(Some(&a), Some(&b)) => (a, b)
};

if is_pattern_whitespace(a) && is_pattern_whitespace(b) {
// skip whitespace for a and b
scan_for_non_ws_or_end(&mut a_iter);
scan_for_non_ws_or_end(&mut b_iter);
} else if is_pattern_whitespace(a) {
// skip whitespace for a
scan_for_non_ws_or_end(&mut a_iter);
} else if a == b {
a_iter.next();
b_iter.next();
} else {
return false
}
}

// check if a has *only* trailing whitespace
a_iter.all(is_pattern_whitespace)
}

/// Given a string and an index, return the first usize >= idx
/// that is a non-ws-char or is outside of the legal range of
/// the string.
fn scan_for_non_ws_or_end(a : &str, idx: usize) -> usize {
let mut i = idx;
let len = a.len();
while (i < len) && (is_whitespace(char_at(a, i))) {
i += 1;
/// Advances the given peekable `Iterator` until it reaches a non-whitespace character
fn scan_for_non_ws_or_end<I: Iterator<Item= char>>(iter: &mut Peekable<I>) {
while lexer::is_pattern_whitespace(iter.peek().cloned()) {
iter.next();
}
i
}

/// `char` convenience wrapper around the lexer's `is_pattern_whitespace`.
pub fn is_whitespace(c: char) -> bool {
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
pub fn is_pattern_whitespace(c: char) -> bool {
lexer::is_pattern_whitespace(Some(c))
}

#[cfg(test)]
mod tests {
use super::*;

#[test] fn eqmodws() {
#[test]
fn eqmodws() {
assert_eq!(matches_codepattern("",""),true);
assert_eq!(matches_codepattern("","a"),false);
assert_eq!(matches_codepattern("a",""),false);
Expand All @@ -160,5 +153,22 @@ mod tests {
assert_eq!(matches_codepattern("a b","a b"),true);
assert_eq!(matches_codepattern("ab","a b"),false);
assert_eq!(matches_codepattern("a b","ab"),true);
assert_eq!(matches_codepattern(" a b","ab"),true);
}

#[test]
fn pattern_whitespace() {
assert_eq!(matches_codepattern("","\x0C"), false);
assert_eq!(matches_codepattern("a b ","a \u{0085}\n\t\r b"),true);
assert_eq!(matches_codepattern("a b","a \u{0085}\n\t\r b "),false);
}

#[test]
fn non_pattern_whitespace() {
// These have the property 'White_Space' but not 'Pattern_White_Space'
assert_eq!(matches_codepattern("a b","a\u{2002}b"), false);
assert_eq!(matches_codepattern("a b","a\u{2002}b"), false);
assert_eq!(matches_codepattern("\u{205F}a b","ab"), false);
assert_eq!(matches_codepattern("a \u{3000}b","ab"), false);
}
}
22 changes: 22 additions & 0 deletions src/test/run-pass/parser-unicode-whitespace.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.


// Beware editing: it has numerous whitespace characters which are important.
// It contains characters from each range of the 'PATTERN_WHITE_SPACE' property outlined in
// http://unicode.org/Public/UNIDATA/PropList.txt
//
// The characters in the first expression of the assertion can be generated
// from: "4\u{0C}+\n\t\r7\t*\u{20}2\u{85}/\u{200E}3\u{200F}*\u{2028}2\u{2029}"
pub fn main() {
assert_eq!(4 +

7 * 2…/‎3‏*
2
, 4 + 7 * 2 / 3 * 2);
}