Auto merge of #34485 - tbu-:pr_unicode_debug_str, r=alexcrichton

Escape fewer Unicode codepoints in `Debug` impl of `str`

Use the same procedure as Python to determine whether a character is printable, described in [PEP 3138]. In particular, this means that the following character classes are escaped:

- Cc (Other, Control)
- Cf (Other, Format)
- Cs (Other, Surrogate), even though they can't appear in Rust strings
- Co (Other, Private Use)
- Cn (Other, Not Assigned)
- Zl (Separator, Line)
- Zp (Separator, Paragraph)
- Zs (Separator, Space), except for the ASCII space `' '` `0x20`

This allows for user-friendly inspection of strings that are not English (e.g. compare `"\u{e9}\u{e8}\u{ea}"` to `"éèê"`).

Fixes #34318.
CC #34422.

[PEP 3138]: https://www.python.org/dev/peps/pep-3138/
rust-lang · Jul 28, 2016 · d1df3fe · d1df3fe
2 parents 748ecb1 + 3d09b4a
commit d1df3fe
Show file tree

Hide file tree

Showing 16 changed files with 1,032 additions and 29 deletions.
diff --git a/src/etc/char_private.py b/src/etc/char_private.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+#
+# Copyright 2011-2016 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+# This script uses the following Unicode tables:
+# - Categories.txt
+
+import os
+import subprocess
+
+def to_ranges(iter):
+    current = None
+    for i in iter:
+        if current is None or i != current[1] or i in (0x10000, 0x20000):
+            if current is not None:
+                yield tuple(current)
+            current = [i, i + 1]
+        else:
+            current[1] += 1
+    if current is not None:
+        yield tuple(current)
+
+def get_escaped(dictionary):
+    for i in range(0x110000):
+        if dictionary.get(i, "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and i != ord(' '):
+            yield i
+
+def get_file(f):
+    try:
+        return open(os.path.basename(f))
+    except FileNotFoundError:
+        subprocess.run(["curl", "-O", f], check=True)
+        return open(os.path.basename(f))
+
+def main():
+    file = get_file("http://www.unicode.org/notes/tn36/Categories.txt")
+
+    dictionary = {int(line.split()[0], 16): line.split()[1] for line in file}
+
+    CUTOFF=0x10000
+    singletons0 = []
+    singletons1 = []
+    normal0 = []
+    normal1 = []
+    extra = []
+
+    for a, b in to_ranges(get_escaped(dictionary)):
+        if a > 2 * CUTOFF:
+            extra.append((a, b - a))
+        elif a == b - 1:
+            if a & CUTOFF:
+                singletons1.append(a & ~CUTOFF)
+            else:
+                singletons0.append(a)
+        elif a == b - 2:
+            if a & CUTOFF:
+                singletons1.append(a & ~CUTOFF)
+                singletons1.append((a + 1) & ~CUTOFF)
+            else:
+                singletons0.append(a)
+                singletons0.append(a + 1)
+        else:
+            if a >= 2 * CUTOFF:
+                extra.append((a, b - a))
+            elif a & CUTOFF:
+                normal1.append((a & ~CUTOFF, b - a))
+            else:
+                normal0.append((a, b - a))
+
+    print("""\
+// Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// NOTE: The following code was generated by "src/etc/char_private.py",
+//       do not edit directly!
+
+use slice::SliceExt;
+
+fn check(x: u16, singletons: &[u16], normal: &[u16]) -> bool {
+    for &s in singletons {
+        if x == s {
+            return false;
+        } else if x < s {
+            break;
+        }
+    }
+    for w in normal.chunks(2) {
+        let start = w[0];
+        let len = w[1];
+        let difference = (x as i32) - (start as i32);
+        if 0 <= difference {
+            if difference < len as i32 {
+                return false;
+            }
+        } else {
+            break;
+        }
+    }
+    true
+}
+
+pub fn is_printable(x: char) -> bool {
+    let x = x as u32;
+    let lower = x as u16;
+    if x < 0x10000 {
+        check(lower, SINGLETONS0, NORMAL0)
+    } else if x < 0x20000 {
+        check(lower, SINGLETONS1, NORMAL1)
+    } else {\
+""")
+    for a, b in extra:
+        print("        if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
+        print("            return false;")
+        print("        }")
+    print("""\
+        true
+    }
+}\
+""")
+    print()
+    print("const SINGLETONS0: &'static [u16] = &[")
+    for s in singletons0:
+        print("    0x{:x},".format(s))
+    print("];")
+    print("const SINGLETONS1: &'static [u16] = &[")
+    for s in singletons1:
+        print("    0x{:x},".format(s))
+    print("];")
+    print("const NORMAL0: &'static [u16] = &[")
+    for a, b in normal0:
+        print("    0x{:x}, 0x{:x},".format(a, b))
+    print("];")
+    print("const NORMAL1: &'static [u16] = &[")
+    for a, b in normal1:
+        print("    0x{:x}, 0x{:x},".format(a, b))
+    print("];")
+
+if __name__ == '__main__':
+    main()
diff --git a/src/libcollections/lib.rs b/src/libcollections/lib.rs
@@ -33,6 +33,7 @@
 #![feature(allow_internal_unstable)]
 #![feature(box_patterns)]
 #![feature(box_syntax)]
+#![cfg_attr(not(test), feature(char_escape_debug))]
 #![feature(core_intrinsics)]
 #![feature(dropck_parametricity)]
 #![feature(fmt_internals)]

diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs
@@ -1697,6 +1697,14 @@ impl str {
         return s;
     }
 
+    /// Escapes each char in `s` with `char::escape_debug`.
+    #[unstable(feature = "str_escape",
+               reason = "return type may change to be an iterator",
+               issue = "27791")]
+    pub fn escape_debug(&self) -> String {
+        self.chars().flat_map(|c| c.escape_debug()).collect()
+    }
+
     /// Escapes each char in `s` with `char::escape_default`.
     #[unstable(feature = "str_escape",
                reason = "return type may change to be an iterator",

diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs
@@ -703,16 +703,32 @@ fn test_escape_unicode() {
     assert_eq!("\u{1d4ea}\r".escape_unicode(), "\\u{1d4ea}\\u{d}");
 }
 
+#[test]
+fn test_escape_debug() {
+    assert_eq!("abc".escape_debug(), "abc");
+    assert_eq!("a c".escape_debug(), "a c");
+    assert_eq!("éèê".escape_debug(), "éèê");
+    assert_eq!("\r\n\t".escape_debug(), "\\r\\n\\t");
+    assert_eq!("'\"\\".escape_debug(), "\\'\\\"\\\\");
+    assert_eq!("\u{7f}\u{ff}".escape_debug(), "\\u{7f}\u{ff}");
+    assert_eq!("\u{100}\u{ffff}".escape_debug(), "\u{100}\\u{ffff}");
+    assert_eq!("\u{10000}\u{10ffff}".escape_debug(), "\u{10000}\\u{10ffff}");
+    assert_eq!("ab\u{200b}".escape_debug(), "ab\\u{200b}");
+    assert_eq!("\u{10d4ea}\r".escape_debug(), "\\u{10d4ea}\\r");
+}
+
 #[test]
 fn test_escape_default() {
     assert_eq!("abc".escape_default(), "abc");
     assert_eq!("a c".escape_default(), "a c");
+    assert_eq!("éèê".escape_default(), "\\u{e9}\\u{e8}\\u{ea}");
     assert_eq!("\r\n\t".escape_default(), "\\r\\n\\t");
     assert_eq!("'\"\\".escape_default(), "\\'\\\"\\\\");
+    assert_eq!("\u{7f}\u{ff}".escape_default(), "\\u{7f}\\u{ff}");
     assert_eq!("\u{100}\u{ffff}".escape_default(), "\\u{100}\\u{ffff}");
     assert_eq!("\u{10000}\u{10ffff}".escape_default(), "\\u{10000}\\u{10ffff}");
-    assert_eq!("ab\u{fb00}".escape_default(), "ab\\u{fb00}");
-    assert_eq!("\u{1d4ea}\r".escape_default(), "\\u{1d4ea}\\r");
+    assert_eq!("ab\u{200b}".escape_default(), "ab\\u{200b}");
+    assert_eq!("\u{10d4ea}\r".escape_default(), "\\u{10d4ea}\\r");
 }
 
 #[test]

diff --git a/src/libcore/char.rs b/src/libcore/char.rs
@@ -17,6 +17,7 @@
 
 use prelude::v1::*;
 
+use char_private::is_printable;
 use mem::transmute;
 
 // UTF-8 ranges and tags for encoding characters
@@ -263,6 +264,8 @@ pub trait CharExt {
     fn escape_unicode(self) -> EscapeUnicode;
     #[stable(feature = "core", since = "1.6.0")]
     fn escape_default(self) -> EscapeDefault;
+    #[unstable(feature = "char_escape_debug", issue = "35068")]
+    fn escape_debug(self) -> EscapeDebug;
     #[stable(feature = "core", since = "1.6.0")]
     fn len_utf8(self) -> usize;
     #[stable(feature = "core", since = "1.6.0")]
@@ -326,6 +329,19 @@ impl CharExt for char {
         EscapeDefault { state: init_state }
     }
 
+    #[inline]
+    fn escape_debug(self) -> EscapeDebug {
+        let init_state = match self {
+            '\t' => EscapeDefaultState::Backslash('t'),
+            '\r' => EscapeDefaultState::Backslash('r'),
+            '\n' => EscapeDefaultState::Backslash('n'),
+            '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self),
+            c if is_printable(c) => EscapeDefaultState::Char(c),
+            c => EscapeDefaultState::Unicode(c.escape_unicode()),
+        };
+        EscapeDebug(EscapeDefault { state: init_state })
+    }
+
     #[inline]
     fn len_utf8(self) -> usize {
         let code = self as u32;
@@ -600,6 +616,27 @@ impl ExactSizeIterator for EscapeDefault {
     }
 }
 
+/// An iterator that yields the literal escape code of a `char`.
+///
+/// This `struct` is created by the [`escape_debug()`] method on [`char`]. See its
+/// documentation for more.
+///
+/// [`escape_debug()`]: ../../std/primitive.char.html#method.escape_debug
+/// [`char`]: ../../std/primitive.char.html
+#[unstable(feature = "char_escape_debug", issue = "35068")]
+#[derive(Clone, Debug)]
+pub struct EscapeDebug(EscapeDefault);
+
+#[unstable(feature = "char_escape_debug", issue = "35068")]
+impl Iterator for EscapeDebug {
+    type Item = char;
+    fn next(&mut self) -> Option<char> { self.0.next() }
+    fn size_hint(&self) -> (usize, Option<usize>) { self.0.size_hint() }
+}
+
+#[unstable(feature = "char_escape_debug", issue = "35068")]
+impl ExactSizeIterator for EscapeDebug { }
+
 /// An iterator over `u8` entries representing the UTF-8 encoding of a `char`
 /// value.
 ///