Skip to content

Commit

Permalink
rollup merge of rust-lang#23625: fhahn/issue-23620-ice-unicode-bytest…
Browse files Browse the repository at this point in the history
…ring

closes rust-lang#23620

This PR patches the issue mentioned in rust-lang#23620, but there is also an ICE for invalid escape sequences in byte literals. This is because the `scan_byte` function returns `token::intern("??")` for invalid bytes, which results in an ICE later on. Is there a reason for this behavior? Shouldn't `scan_byte` fail when it encounters an invalid byte?

I also noticed a small inconsistency in the documentation. According to the formal byte literal definition in http://doc.rust-lang.org/reference.html#byte-and-byte-string-literals, a byte string literal contains `string_body *`, but according to the text (and the behavior of the lexer) it should not accept unicode escape sequences. Hence `string_body *` should be replaced by `byte_body *`. If this is valid, I can add this fix to this PR.
  • Loading branch information
alexcrichton committed Mar 27, 2015
2 parents e42521a + afaa3b6 commit b79fbe0
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 12 deletions.
40 changes: 29 additions & 11 deletions src/libsyntax/parse/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,7 @@ impl<'a> StringReader<'a> {
let start_bpos = self.last_pos;
let mut accum_int = 0;

let mut valid = true;
for _ in 0..n_digits {
if self.is_eof() {
let last_bpos = self.last_pos;
Expand All @@ -750,13 +751,16 @@ impl<'a> StringReader<'a> {
if self.curr_is(delim) {
let last_bpos = self.last_pos;
self.err_span_(start_bpos, last_bpos, "numeric character escape is too short");
valid = false;
break;
}
let c = self.curr.unwrap_or('\x00');
accum_int *= 16;
accum_int += c.to_digit(16).unwrap_or_else(|| {
self.err_span_char(self.last_pos, self.pos,
"illegal character in numeric character escape", c);

valid = false;
0
});
self.bump();
Expand All @@ -767,10 +771,11 @@ impl<'a> StringReader<'a> {
self.last_pos,
"this form of character escape may only be used \
with characters in the range [\\x00-\\x7f]");
valid = false;
}

match char::from_u32(accum_int) {
Some(_) => true,
Some(_) => valid,
None => {
let last_bpos = self.last_pos;
self.err_span_(start_bpos, last_bpos, "illegal numeric character escape");
Expand Down Expand Up @@ -799,7 +804,18 @@ impl<'a> StringReader<'a> {
'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
'x' => self.scan_byte_escape(delim, !ascii_only),
'u' if self.curr_is('{') => {
self.scan_unicode_escape(delim)
let valid = self.scan_unicode_escape(delim);
if valid && ascii_only {
self.err_span_(
escaped_pos,
self.last_pos,
"unicode escape sequences cannot be used as a byte or in \
a byte string"
);
false
} else {
valid
}
}
'\n' if delim == '"' => {
self.consume_whitespace();
Expand Down Expand Up @@ -869,6 +885,7 @@ impl<'a> StringReader<'a> {
let start_bpos = self.last_pos;
let mut count = 0;
let mut accum_int = 0;
let mut valid = true;

while !self.curr_is('}') && count <= 6 {
let c = match self.curr {
Expand All @@ -884,29 +901,30 @@ impl<'a> StringReader<'a> {
self.fatal_span_(self.last_pos, self.pos,
"unterminated unicode escape (needed a `}`)");
} else {
self.fatal_span_char(self.last_pos, self.pos,
self.err_span_char(self.last_pos, self.pos,
"illegal character in unicode escape", c);
}
valid = false;
0
});
self.bump();
count += 1;
}

if count > 6 {
self.fatal_span_(start_bpos, self.last_pos,
self.err_span_(start_bpos, self.last_pos,
"overlong unicode escape (can have at most 6 hex digits)");
valid = false;
}

self.bump(); // past the ending }

let mut valid = count >= 1 && count <= 6;
if char::from_u32(accum_int).is_none() {
valid = false;
if valid && (char::from_u32(accum_int).is_none() || count == 0) {
self.err_span_(start_bpos, self.last_pos, "illegal unicode character escape");
valid= false;
}

if !valid {
self.fatal_span_(start_bpos, self.last_pos, "illegal unicode character escape");
}

valid
}

Expand Down Expand Up @@ -1330,7 +1348,7 @@ impl<'a> StringReader<'a> {
"unterminated byte constant".to_string());
}

let id = if valid { self.name_from(start) } else { token::intern("??") };
let id = if valid { self.name_from(start) } else { token::intern("?") };
self.bump(); // advance curr past token
return token::Byte(id);
}
Expand Down
45 changes: 45 additions & 0 deletions src/test/parse-fail/issue-23620-invalid-escapes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Parse-fail test for issue 23620: every literal below contains an invalid
// escape sequence, and each one is expected to be reported as an ordinary
// error (listed in the `//~^ ERROR` annotations that follow it).
//
// NOTE: the `//~^` annotations are position-sensitive (one `^` per line
// between the annotation and the literal it refers to), so do not insert
// lines between a `let` and its annotations.
fn main() {
// A well-formed unicode escape used inside a byte string literal.
let _ = b"\u{a66e}";
//~^ ERROR unicode escape sequences cannot be used as a byte or in a byte string

// The same unicode escape used inside a byte literal.
let _ = b'\u{a66e}';
//~^ ERROR unicode escape sequences cannot be used as a byte or in a byte string

// `\u` that is not followed by `{`, inside a byte literal.
let _ = b'\u';
//~^ ERROR unknown byte escape: u

// `\x` followed by a single hex digit before the closing quote.
let _ = b'\x5';
//~^ ERROR numeric character escape is too short

// `\x` followed by two non-hex characters; one error is expected per character.
let _ = b'\xxy';
//~^ ERROR illegal character in numeric character escape: x
//~^^ ERROR illegal character in numeric character escape: y

// The same too-short `\x` escape, this time in a char literal (no `b` prefix).
let _ = '\x5';
//~^ ERROR numeric character escape is too short

// The same non-hex `\x` escape, this time in a char literal.
let _ = '\xxy';
//~^ ERROR illegal character in numeric character escape: x
//~^^ ERROR illegal character in numeric character escape: y

// Several invalid escapes in one byte string; each is expected to produce its
// own error. In `\xf `, the second character after `\x` is a space.
let _ = b"\u{a4a4} \xf \u";
//~^ ERROR unicode escape sequences cannot be used as a byte or in a byte string
//~^^ ERROR illegal character in numeric character escape:
//~^^^ ERROR unknown byte escape: u

// Several invalid escapes in one (non-byte) string literal. Compared with the
// byte string above, a range error is additionally expected for the `\x` escape.
let _ = "\u{ffffff} \xf \u";
//~^ ERROR illegal unicode character escape
//~^^ ERROR illegal character in numeric character escape:
//~^^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
//~^^^^ ERROR unknown character escape: u
}
5 changes: 4 additions & 1 deletion src/test/parse-fail/new-unicode-escapes-4.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,8 @@
// except according to those terms.

pub fn main() {
let s = "\u{lol}"; //~ ERROR illegal character in unicode escape
let s = "\u{lol}";
//~^ ERROR illegal character in unicode escape: l
//~^^ ERROR illegal character in unicode escape: o
//~^^^ ERROR illegal character in unicode escape: l
}

0 comments on commit b79fbe0

Please sign in to comment.