Skip to content

Commit

Permalink
Auto merge of #34485 - tbu-:pr_unicode_debug_str, r=alexcrichton
Browse files Browse the repository at this point in the history
Escape fewer Unicode codepoints in `Debug` impl of `str`

Use the same procedure as Python to determine whether a character is
printable, described in [PEP 3138]. In particular, this means that the
following character classes are escaped:

- Cc (Other, Control)
- Cf (Other, Format)
- Cs (Other, Surrogate), even though they can't appear in Rust strings
- Co (Other, Private Use)
- Cn (Other, Not Assigned)
- Zl (Separator, Line)
- Zp (Separator, Paragraph)
- Zs (Separator, Space), except for the ASCII space `' '` `0x20`

This allows for user-friendly inspection of strings that are not
English (e.g. compare `"\u{e9}\u{e8}\u{ea}"` to `"éèê"`).

Fixes #34318.
CC #34422.

[PEP 3138]: https://www.python.org/dev/peps/pep-3138/
  • Loading branch information
bors authored Jul 28, 2016
2 parents 748ecb1 + 3d09b4a commit d1df3fe
Show file tree
Hide file tree
Showing 16 changed files with 1,032 additions and 29 deletions.
154 changes: 154 additions & 0 deletions src/etc/char_private.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env python
#
# Copyright 2011-2016 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - Categories.txt

import os
import subprocess

def to_ranges(iter):
current = None
for i in iter:
if current is None or i != current[1] or i in (0x10000, 0x20000):
if current is not None:
yield tuple(current)
current = [i, i + 1]
else:
current[1] += 1
if current is not None:
yield tuple(current)

def get_escaped(dictionary):
for i in range(0x110000):
if dictionary.get(i, "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and i != ord(' '):
yield i

def get_file(f):
try:
return open(os.path.basename(f))
except FileNotFoundError:
subprocess.run(["curl", "-O", f], check=True)
return open(os.path.basename(f))

def main():
file = get_file("http://www.unicode.org/notes/tn36/Categories.txt")

dictionary = {int(line.split()[0], 16): line.split()[1] for line in file}

CUTOFF=0x10000
singletons0 = []
singletons1 = []
normal0 = []
normal1 = []
extra = []

for a, b in to_ranges(get_escaped(dictionary)):
if a > 2 * CUTOFF:
extra.append((a, b - a))
elif a == b - 1:
if a & CUTOFF:
singletons1.append(a & ~CUTOFF)
else:
singletons0.append(a)
elif a == b - 2:
if a & CUTOFF:
singletons1.append(a & ~CUTOFF)
singletons1.append((a + 1) & ~CUTOFF)
else:
singletons0.append(a)
singletons0.append(a + 1)
else:
if a >= 2 * CUTOFF:
extra.append((a, b - a))
elif a & CUTOFF:
normal1.append((a & ~CUTOFF, b - a))
else:
normal0.append((a, b - a))

print("""\
// Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "src/etc/char_private.py",
// do not edit directly!
use slice::SliceExt;
fn check(x: u16, singletons: &[u16], normal: &[u16]) -> bool {
for &s in singletons {
if x == s {
return false;
} else if x < s {
break;
}
}
for w in normal.chunks(2) {
let start = w[0];
let len = w[1];
let difference = (x as i32) - (start as i32);
if 0 <= difference {
if difference < len as i32 {
return false;
}
} else {
break;
}
}
true
}
pub fn is_printable(x: char) -> bool {
let x = x as u32;
let lower = x as u16;
if x < 0x10000 {
check(lower, SINGLETONS0, NORMAL0)
} else if x < 0x20000 {
check(lower, SINGLETONS1, NORMAL1)
} else {\
""")
for a, b in extra:
print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
print(" return false;")
print(" }")
print("""\
true
}
}\
""")
print()
print("const SINGLETONS0: &'static [u16] = &[")
for s in singletons0:
print(" 0x{:x},".format(s))
print("];")
print("const SINGLETONS1: &'static [u16] = &[")
for s in singletons1:
print(" 0x{:x},".format(s))
print("];")
print("const NORMAL0: &'static [u16] = &[")
for a, b in normal0:
print(" 0x{:x}, 0x{:x},".format(a, b))
print("];")
print("const NORMAL1: &'static [u16] = &[")
for a, b in normal1:
print(" 0x{:x}, 0x{:x},".format(a, b))
print("];")

if __name__ == '__main__':
main()
1 change: 1 addition & 0 deletions src/libcollections/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#![feature(allow_internal_unstable)]
#![feature(box_patterns)]
#![feature(box_syntax)]
#![cfg_attr(not(test), feature(char_escape_debug))]
#![feature(core_intrinsics)]
#![feature(dropck_parametricity)]
#![feature(fmt_internals)]
Expand Down
8 changes: 8 additions & 0 deletions src/libcollections/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1697,6 +1697,14 @@ impl str {
return s;
}

/// Escapes each char in `s` with `char::escape_debug`.
#[unstable(feature = "str_escape",
reason = "return type may change to be an iterator",
issue = "27791")]
pub fn escape_debug(&self) -> String {
self.chars().flat_map(|c| c.escape_debug()).collect()
}

/// Escapes each char in `s` with `char::escape_default`.
#[unstable(feature = "str_escape",
reason = "return type may change to be an iterator",
Expand Down
20 changes: 18 additions & 2 deletions src/libcollectionstest/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -703,16 +703,32 @@ fn test_escape_unicode() {
assert_eq!("\u{1d4ea}\r".escape_unicode(), "\\u{1d4ea}\\u{d}");
}

#[test]
fn test_escape_debug() {
assert_eq!("abc".escape_debug(), "abc");
assert_eq!("a c".escape_debug(), "a c");
assert_eq!("éèê".escape_debug(), "éèê");
assert_eq!("\r\n\t".escape_debug(), "\\r\\n\\t");
assert_eq!("'\"\\".escape_debug(), "\\'\\\"\\\\");
assert_eq!("\u{7f}\u{ff}".escape_debug(), "\\u{7f}\u{ff}");
assert_eq!("\u{100}\u{ffff}".escape_debug(), "\u{100}\\u{ffff}");
assert_eq!("\u{10000}\u{10ffff}".escape_debug(), "\u{10000}\\u{10ffff}");
assert_eq!("ab\u{200b}".escape_debug(), "ab\\u{200b}");
assert_eq!("\u{10d4ea}\r".escape_debug(), "\\u{10d4ea}\\r");
}

#[test]
fn test_escape_default() {
assert_eq!("abc".escape_default(), "abc");
assert_eq!("a c".escape_default(), "a c");
assert_eq!("éèê".escape_default(), "\\u{e9}\\u{e8}\\u{ea}");
assert_eq!("\r\n\t".escape_default(), "\\r\\n\\t");
assert_eq!("'\"\\".escape_default(), "\\'\\\"\\\\");
assert_eq!("\u{7f}\u{ff}".escape_default(), "\\u{7f}\\u{ff}");
assert_eq!("\u{100}\u{ffff}".escape_default(), "\\u{100}\\u{ffff}");
assert_eq!("\u{10000}\u{10ffff}".escape_default(), "\\u{10000}\\u{10ffff}");
assert_eq!("ab\u{fb00}".escape_default(), "ab\\u{fb00}");
assert_eq!("\u{1d4ea}\r".escape_default(), "\\u{1d4ea}\\r");
assert_eq!("ab\u{200b}".escape_default(), "ab\\u{200b}");
assert_eq!("\u{10d4ea}\r".escape_default(), "\\u{10d4ea}\\r");
}

#[test]
Expand Down
37 changes: 37 additions & 0 deletions src/libcore/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

use prelude::v1::*;

use char_private::is_printable;
use mem::transmute;

// UTF-8 ranges and tags for encoding characters
Expand Down Expand Up @@ -263,6 +264,8 @@ pub trait CharExt {
fn escape_unicode(self) -> EscapeUnicode;
#[stable(feature = "core", since = "1.6.0")]
fn escape_default(self) -> EscapeDefault;
#[unstable(feature = "char_escape_debug", issue = "35068")]
fn escape_debug(self) -> EscapeDebug;
#[stable(feature = "core", since = "1.6.0")]
fn len_utf8(self) -> usize;
#[stable(feature = "core", since = "1.6.0")]
Expand Down Expand Up @@ -326,6 +329,19 @@ impl CharExt for char {
EscapeDefault { state: init_state }
}

#[inline]
fn escape_debug(self) -> EscapeDebug {
let init_state = match self {
'\t' => EscapeDefaultState::Backslash('t'),
'\r' => EscapeDefaultState::Backslash('r'),
'\n' => EscapeDefaultState::Backslash('n'),
'\\' | '\'' | '"' => EscapeDefaultState::Backslash(self),
c if is_printable(c) => EscapeDefaultState::Char(c),
c => EscapeDefaultState::Unicode(c.escape_unicode()),
};
EscapeDebug(EscapeDefault { state: init_state })
}

#[inline]
fn len_utf8(self) -> usize {
let code = self as u32;
Expand Down Expand Up @@ -600,6 +616,27 @@ impl ExactSizeIterator for EscapeDefault {
}
}

/// An iterator that yields the literal escape code of a `char`.
///
/// This `struct` is created by the [`escape_debug()`] method on [`char`]. See its
/// documentation for more.
///
/// [`escape_debug()`]: ../../std/primitive.char.html#method.escape_debug
/// [`char`]: ../../std/primitive.char.html
#[unstable(feature = "char_escape_debug", issue = "35068")]
#[derive(Clone, Debug)]
pub struct EscapeDebug(EscapeDefault);

#[unstable(feature = "char_escape_debug", issue = "35068")]
impl Iterator for EscapeDebug {
type Item = char;
fn next(&mut self) -> Option<char> { self.0.next() }
fn size_hint(&self) -> (usize, Option<usize>) { self.0.size_hint() }
}

#[unstable(feature = "char_escape_debug", issue = "35068")]
impl ExactSizeIterator for EscapeDebug { }

/// An iterator over `u8` entries represending the UTF-8 encoding of a `char`
/// value.
///
Expand Down
Loading

0 comments on commit d1df3fe

Please sign in to comment.