rollup merge of rust-lang#23625: fhahn/issue-23620-ice-unicode-bytest…

…ring closes rust-lang#23620 This PR patches the issue mentioned in rust-lang#23620, but there is also an ICE for invalid escape sequences in byte literals. This is due to the fact that the `scan_byte` function returns ` token::intern("??") ` for invalid bytes, resulting in an ICE later on. Is there a reason for this behavior? Shouldn't `scan_byte` fail when it encounters an invalid byte? And I noticed a small inconsistency in the documentation. According to the formal byte literal definition in http://doc.rust-lang.org/reference.html#byte-and-byte-string-literals , a byte string literal contains `string_body *`, but according to the text (and the behavior of the lexer) it should not accept unicode escape sequences. Hence it should be replaced by `byte_body *`. If this is valid, I can add this fix to this PR.
alexcrichton · Mar 27, 2015 · b79fbe0 · b79fbe0
2 parents e42521a + afaa3b6
commit b79fbe0
Show file tree

Hide file tree

Showing 3 changed files with 78 additions and 12 deletions.
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
@@ -742,6 +742,7 @@ impl<'a> StringReader<'a> {
         let start_bpos = self.last_pos;
         let mut accum_int = 0;
 
+        let mut valid = true;
         for _ in 0..n_digits {
             if self.is_eof() {
                 let last_bpos = self.last_pos;
@@ -750,13 +751,16 @@ impl<'a> StringReader<'a> {
             if self.curr_is(delim) {
                 let last_bpos = self.last_pos;
                 self.err_span_(start_bpos, last_bpos, "numeric character escape is too short");
+                valid = false;
                 break;
             }
             let c = self.curr.unwrap_or('\x00');
             accum_int *= 16;
             accum_int += c.to_digit(16).unwrap_or_else(|| {
                 self.err_span_char(self.last_pos, self.pos,
                               "illegal character in numeric character escape", c);
+
+                valid = false;
                 0
             });
             self.bump();
@@ -767,10 +771,11 @@ impl<'a> StringReader<'a> {
                            self.last_pos,
                            "this form of character escape may only be used \
                             with characters in the range [\\x00-\\x7f]");
+            valid = false;
         }
 
         match char::from_u32(accum_int) {
-            Some(_) => true,
+            Some(_) => valid,
             None => {
                 let last_bpos = self.last_pos;
                 self.err_span_(start_bpos, last_bpos, "illegal numeric character escape");
@@ -799,7 +804,18 @@ impl<'a> StringReader<'a> {
                             'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
                             'x' => self.scan_byte_escape(delim, !ascii_only),
                             'u' if self.curr_is('{') => {
-                                self.scan_unicode_escape(delim)
+                            let valid = self.scan_unicode_escape(delim);
+                            if valid && ascii_only {
+                                self.err_span_(
+                                    escaped_pos,
+                                    self.last_pos,
+                                    "unicode escape sequences cannot be used as a byte or in \
+                                    a byte string"
+                                );
+                                false
+                            } else {
+                               valid
+                            }
                             }
                             '\n' if delim == '"' => {
                                 self.consume_whitespace();
@@ -869,6 +885,7 @@ impl<'a> StringReader<'a> {
         let start_bpos = self.last_pos;
         let mut count = 0;
         let mut accum_int = 0;
+        let mut valid = true;
 
         while !self.curr_is('}') && count <= 6 {
             let c = match self.curr {
@@ -884,29 +901,30 @@ impl<'a> StringReader<'a> {
                     self.fatal_span_(self.last_pos, self.pos,
                                      "unterminated unicode escape (needed a `}`)");
                 } else {
-                    self.fatal_span_char(self.last_pos, self.pos,
+                    self.err_span_char(self.last_pos, self.pos,
                                    "illegal character in unicode escape", c);
                 }
+                valid = false;
+                0
             });
             self.bump();
             count += 1;
         }
 
         if count > 6 {
-            self.fatal_span_(start_bpos, self.last_pos,
+            self.err_span_(start_bpos, self.last_pos,
                           "overlong unicode escape (can have at most 6 hex digits)");
+            valid = false;
         }
 
         self.bump(); // past the ending }
 
-        let mut valid = count >= 1 && count <= 6;
-        if char::from_u32(accum_int).is_none() {
-            valid = false;
+        if valid && (char::from_u32(accum_int).is_none() || count == 0) {
+            self.err_span_(start_bpos, self.last_pos, "illegal unicode character escape");
+            valid= false;
         }
 
-        if !valid {
-            self.fatal_span_(start_bpos, self.last_pos, "illegal unicode character escape");
-        }
+
         valid
     }
 
@@ -1330,7 +1348,7 @@ impl<'a> StringReader<'a> {
                 "unterminated byte constant".to_string());
         }
 
-        let id = if valid { self.name_from(start) } else { token::intern("??") };
+        let id = if valid { self.name_from(start) } else { token::intern("?") };
         self.bump(); // advance curr past token
         return token::Byte(id);
     }

diff --git a/src/test/parse-fail/issue-23620-invalid-escapes.rs b/src/test/parse-fail/issue-23620-invalid-escapes.rs
@@ -0,0 +1,45 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+fn main() {
+    let _ = b"\u{a66e}";
+    //~^ ERROR unicode escape sequences cannot be used as a byte or in a byte string
+
+    let _ = b'\u{a66e}';
+    //~^ ERROR unicode escape sequences cannot be used as a byte or in a byte string
+
+    let _ = b'\u';
+    //~^ ERROR unknown byte escape: u
+
+    let _ = b'\x5';
+    //~^ ERROR numeric character escape is too short
+
+    let _ = b'\xxy';
+    //~^ ERROR illegal character in numeric character escape: x
+    //~^^ ERROR illegal character in numeric character escape: y
+
+    let _ = '\x5';
+    //~^ ERROR numeric character escape is too short
+
+    let _ = '\xxy';
+    //~^ ERROR illegal character in numeric character escape: x
+    //~^^ ERROR illegal character in numeric character escape: y
+
+    let _ = b"\u{a4a4} \xf \u";
+    //~^ ERROR unicode escape sequences cannot be used as a byte or in a byte string
+    //~^^ ERROR illegal character in numeric character escape:
+    //~^^^ ERROR unknown byte escape: u
+
+    let _ = "\u{ffffff} \xf \u";
+    //~^ ERROR illegal unicode character escape
+    //~^^ ERROR illegal character in numeric character escape:
+    //~^^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
+    //~^^^^ ERROR unknown character escape: u
+}
diff --git a/src/test/parse-fail/new-unicode-escapes-4.rs b/src/test/parse-fail/new-unicode-escapes-4.rs
@@ -9,5 +9,8 @@
 // except according to those terms.
 
 pub fn main() {
-    let s = "\u{lol}"; //~ ERROR illegal character in unicode escape
+    let s = "\u{lol}";
+     //~^ ERROR illegal character in unicode escape: l
+     //~^^ ERROR illegal character in unicode escape: o
+     //~^^^ ERROR illegal character in unicode escape: l
 }