From 13a2dd96fe824cc5d61e94ed380db0114efdd014 Mon Sep 17 00:00:00 2001 From: tormol Date: Thu, 8 Sep 2016 13:54:39 +0200 Subject: [PATCH] [breaking-change] std: change `encode_utf{8,16}()` to take a buffer and return a slice They panic if the buffer is too small. --- src/libcollections/string.rs | 7 +- src/libcollectionstest/str.rs | 8 +- src/libcore/char.rs | 170 ++++++++++------------------------ src/libcore/fmt/mod.rs | 18 +--- src/libcoretest/char.rs | 21 +++-- src/librustc_unicode/char.rs | 92 +++++++++++++----- src/librustc_unicode/u_str.rs | 10 +- src/libserialize/json.rs | 4 +- src/libstd/sys/common/wtf8.rs | 21 +++-- 9 files changed, 163 insertions(+), 188 deletions(-) diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs index cff0308d4af84..e4930ae357208 100644 --- a/src/libcollections/string.rs +++ b/src/libcollections/string.rs @@ -975,7 +975,7 @@ impl String { pub fn push(&mut self, ch: char) { match ch.len_utf8() { 1 => self.vec.push(ch as u8), - _ => self.vec.extend_from_slice(ch.encode_utf8().as_slice()), + _ => self.vec.extend_from_slice(ch.encode_utf8(&mut [0;4]).as_bytes()), } } @@ -1131,10 +1131,11 @@ impl String { let len = self.len(); assert!(idx <= len); assert!(self.is_char_boundary(idx)); - let bits = ch.encode_utf8(); + let mut bits = [0; 4]; + let bits = ch.encode_utf8(&mut bits).as_bytes(); unsafe { - self.insert_bytes(idx, bits.as_slice()); + self.insert_bytes(idx, bits); } } diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs index 62e164a569aa6..560895f721bbf 100644 --- a/src/libcollectionstest/str.rs +++ b/src/libcollectionstest/str.rs @@ -786,9 +786,9 @@ fn test_rev_iterator() { #[test] fn test_chars_decoding() { + let mut bytes = [0; 4]; for c in (0..0x110000).filter_map(::std::char::from_u32) { - let bytes = c.encode_utf8(); - let s = ::std::str::from_utf8(bytes.as_slice()).unwrap(); + let s = c.encode_utf8(&mut bytes); if Some(c) != s.chars().next() { panic!("character {:x}={} does not 
decode correctly", c as u32, c); } @@ -797,9 +797,9 @@ fn test_chars_decoding() { #[test] fn test_chars_rev_decoding() { + let mut bytes = [0; 4]; for c in (0..0x110000).filter_map(::std::char::from_u32) { - let bytes = c.encode_utf8(); - let s = ::std::str::from_utf8(bytes.as_slice()).unwrap(); + let s = c.encode_utf8(&mut bytes); if Some(c) != s.chars().rev().next() { panic!("character {:x}={} does not decode correctly", c as u32, c); } diff --git a/src/libcore/char.rs b/src/libcore/char.rs index a21d1229d358b..26d28049a474d 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -18,6 +18,7 @@ use char_private::is_printable; use convert::TryFrom; use fmt; +use slice; use iter::FusedIterator; use mem::transmute; @@ -327,9 +328,9 @@ pub trait CharExt { #[stable(feature = "core", since = "1.6.0")] fn len_utf16(self) -> usize; #[unstable(feature = "unicode", issue = "27784")] - fn encode_utf8(self) -> EncodeUtf8; + fn encode_utf8(self, dst: &mut [u8]) -> &mut str; #[unstable(feature = "unicode", issue = "27784")] - fn encode_utf16(self) -> EncodeUtf16; + fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16]; } #[stable(feature = "core", since = "1.6.0")] @@ -419,47 +420,59 @@ impl CharExt for char { } #[inline] - fn encode_utf8(self) -> EncodeUtf8 { + fn encode_utf8(self, dst: &mut [u8]) -> &mut str { let code = self as u32; - let mut buf = [0; 4]; - let pos = if code < MAX_ONE_B { - buf[3] = code as u8; - 3 - } else if code < MAX_TWO_B { - buf[2] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; - buf[3] = (code & 0x3F) as u8 | TAG_CONT; - 2 - } else if code < MAX_THREE_B { - buf[1] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; - buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; - buf[3] = (code & 0x3F) as u8 | TAG_CONT; - 1 - } else { - buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; - buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; - buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; - buf[3] = (code & 0x3F) as u8 | TAG_CONT; - 0 - }; - EncodeUtf8 { buf: buf, pos: pos } + unsafe 
{ + let len = + if code < MAX_ONE_B && !dst.is_empty() { + *dst.get_unchecked_mut(0) = code as u8; + 1 + } else if code < MAX_TWO_B && dst.len() >= 2 { + *dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; + *dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT; + 2 + } else if code < MAX_THREE_B && dst.len() >= 3 { + *dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; + *dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT; + 3 + } else if dst.len() >= 4 { + *dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; + *dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT; + 4 + } else { + panic!("encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", + from_u32_unchecked(code).len_utf8(), + code, + dst.len()) + }; + transmute(slice::from_raw_parts_mut(dst.as_mut_ptr(), len)) + } } #[inline] - fn encode_utf16(self) -> EncodeUtf16 { - let mut buf = [0; 2]; + fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { let mut code = self as u32; - let pos = if (code & 0xFFFF) == code { - // The BMP falls through (assuming non-surrogate, as it should) - buf[1] = code as u16; - 1 - } else { - // Supplementary planes break into surrogates. - code -= 0x1_0000; - buf[0] = 0xD800 | ((code >> 10) as u16); - buf[1] = 0xDC00 | ((code as u16) & 0x3FF); - 0 - }; - EncodeUtf16 { buf: buf, pos: pos } + unsafe { + if (code & 0xFFFF) == code && !dst.is_empty() { + // The BMP falls through (assuming non-surrogate, as it should) + *dst.get_unchecked_mut(0) = code as u16; + slice::from_raw_parts_mut(dst.as_mut_ptr(), 1) + } else if dst.len() >= 2 { + // Supplementary planes break into surrogates. 
+ code -= 0x1_0000; + *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16); + *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF); + slice::from_raw_parts_mut(dst.as_mut_ptr(), 2) + } else { + panic!("encode_utf16: need {} units to encode U+{:X}, but the buffer has {}", + from_u32_unchecked(code).len_utf16(), + code, + dst.len()) + } + } } } @@ -702,88 +715,7 @@ impl ExactSizeIterator for EscapeDebug { } #[unstable(feature = "fused", issue = "35602")] impl FusedIterator for EscapeDebug {} -/// An iterator over `u8` entries represending the UTF-8 encoding of a `char` -/// value. -/// -/// Constructed via the `.encode_utf8()` method on `char`. -#[unstable(feature = "unicode", issue = "27784")] -#[derive(Debug)] -pub struct EncodeUtf8 { - buf: [u8; 4], - pos: usize, -} - -impl EncodeUtf8 { - /// Returns the remaining bytes of this iterator as a slice. - #[unstable(feature = "unicode", issue = "27784")] - pub fn as_slice(&self) -> &[u8] { - &self.buf[self.pos..] - } -} - -#[unstable(feature = "unicode", issue = "27784")] -impl Iterator for EncodeUtf8 { - type Item = u8; - - fn next(&mut self) -> Option<u8> { - if self.pos == self.buf.len() { - None - } else { - let ret = Some(self.buf[self.pos]); - self.pos += 1; - ret - } - } - - fn size_hint(&self) -> (usize, Option<usize>) { - self.as_slice().iter().size_hint() - } -} - -#[unstable(feature = "fused", issue = "35602")] -impl FusedIterator for EncodeUtf8 {} - -/// An iterator over `u16` entries represending the UTF-16 encoding of a `char` -/// value. -/// -/// Constructed via the `.encode_utf16()` method on `char`. -#[unstable(feature = "unicode", issue = "27784")] -#[derive(Debug)] -pub struct EncodeUtf16 { - buf: [u16; 2], - pos: usize, -} - -impl EncodeUtf16 { - /// Returns the remaining bytes of this iterator as a slice. - #[unstable(feature = "unicode", issue = "27784")] - pub fn as_slice(&self) -> &[u16] { - &self.buf[self.pos..] 
- } -} - - -#[unstable(feature = "unicode", issue = "27784")] -impl Iterator for EncodeUtf16 { - type Item = u16; - - fn next(&mut self) -> Option<u16> { - if self.pos == self.buf.len() { - None - } else { - let ret = Some(self.buf[self.pos]); - self.pos += 1; - ret - } - } - - fn size_hint(&self) -> (usize, Option<usize>) { - self.as_slice().iter().size_hint() - } -} -#[unstable(feature = "fused", issue = "35602")] -impl FusedIterator for EncodeUtf16 {} /// An iterator over an iterator of bytes of the characters the bytes represent /// as UTF-8 diff --git a/src/libcore/fmt/mod.rs b/src/libcore/fmt/mod.rs index 8342d663cdc7c..5d7f41556c25f 100644 --- a/src/libcore/fmt/mod.rs +++ b/src/libcore/fmt/mod.rs @@ -97,9 +97,7 @@ pub trait Write { /// This function will return an instance of `Error` on error. #[stable(feature = "fmt_write_char", since = "1.1.0")] fn write_char(&mut self, c: char) -> Result { - self.write_str(unsafe { - str::from_utf8_unchecked(c.encode_utf8().as_slice()) - }) + self.write_str(c.encode_utf8(&mut [0; 4])) } /// Glue for usage of the `write!` macro with implementors of this trait. 
@@ -924,9 +922,7 @@ impl<'a> Formatter<'a> { // Writes the sign if it exists, and then the prefix if it was requested let write_prefix = |f: &mut Formatter| { if let Some(c) = sign { - f.buf.write_str(unsafe { - str::from_utf8_unchecked(c.encode_utf8().as_slice()) - })?; + f.buf.write_str(c.encode_utf8(&mut [0; 4]))?; } if prefixed { f.buf.write_str(prefix) } else { Ok(()) } @@ -1032,10 +1028,8 @@ impl<'a> Formatter<'a> { rt::v1::Alignment::Center => (padding / 2, (padding + 1) / 2), }; - let fill = self.fill.encode_utf8(); - let fill = unsafe { - str::from_utf8_unchecked(fill.as_slice()) - }; + let mut fill = [0; 4]; + let fill = self.fill.encode_utf8(&mut fill); for _ in 0..pre_pad { self.buf.write_str(fill)?; @@ -1435,9 +1429,7 @@ impl Display for char { if f.width.is_none() && f.precision.is_none() { f.write_char(*self) } else { - f.pad(unsafe { - str::from_utf8_unchecked(self.encode_utf8().as_slice()) - }) + f.pad(self.encode_utf8(&mut [0; 4])) } } } diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs index 199437a431eee..7da0b6902f271 100644 --- a/src/libcoretest/char.rs +++ b/src/libcoretest/char.rs @@ -8,7 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-use std::char; +use std::{char,str}; use std::convert::TryFrom; #[test] @@ -248,10 +248,12 @@ fn test_escape_unicode() { #[test] fn test_encode_utf8() { fn check(input: char, expect: &[u8]) { - assert_eq!(input.encode_utf8().as_slice(), expect); - for (a, b) in input.encode_utf8().zip(expect) { - assert_eq!(a, *b); - } + let mut buf = [0; 4]; + let ptr = buf.as_ptr(); + let s = input.encode_utf8(&mut buf); + assert_eq!(s.as_ptr() as usize, ptr as usize); + assert!(str::from_utf8(s.as_bytes()).is_ok()); + assert_eq!(s.as_bytes(), expect); } check('x', &[0x78]); @@ -263,10 +265,11 @@ fn test_encode_utf8() { #[test] fn test_encode_utf16() { fn check(input: char, expect: &[u16]) { - assert_eq!(input.encode_utf16().as_slice(), expect); - for (a, b) in input.encode_utf16().zip(expect) { - assert_eq!(a, *b); - } + let mut buf = [0; 2]; + let ptr = buf.as_mut_ptr(); + let b = input.encode_utf16(&mut buf); + assert_eq!(b.as_mut_ptr() as usize, ptr as usize); + assert_eq!(b, expect); } check('x', &[0x0078]); diff --git a/src/librustc_unicode/char.rs b/src/librustc_unicode/char.rs index 5a0c27d9c609f..702d7d8b4b2ca 100644 --- a/src/librustc_unicode/char.rs +++ b/src/librustc_unicode/char.rs @@ -37,7 +37,7 @@ use tables::{conversions, derived_property, general_category, property}; #[stable(feature = "rust1", since = "1.0.0")] pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked}; #[stable(feature = "rust1", since = "1.0.0")] -pub use core::char::{EncodeUtf16, EncodeUtf8, EscapeDebug, EscapeDefault, EscapeUnicode}; +pub use core::char::{EscapeDebug, EscapeDefault, EscapeUnicode}; // unstable reexports #[unstable(feature = "try_from", issue = "33417")] @@ -435,50 +435,96 @@ impl char { C::len_utf16(self) } - /// Returns an iterator over the bytes of this character as UTF-8. + /// Encodes this character as UTF-8 into the provided byte buffer, + /// and then returns the subslice of the buffer that contains the encoded character. 
/// - /// The returned iterator also has an `as_slice()` method to view the - /// encoded bytes as a byte slice. + /// # Panics + /// + /// Panics if the buffer is not large enough. + /// A buffer of length four is large enough to encode any `char`. /// /// # Examples /// + /// In both of these examples, 'ß' takes two bytes to encode. + /// /// ``` /// #![feature(unicode)] /// - /// let iterator = 'ß'.encode_utf8(); - /// assert_eq!(iterator.as_slice(), [0xc3, 0x9f]); + /// let mut b = [0; 2]; /// - /// for (i, byte) in iterator.enumerate() { - /// println!("byte {}: {:x}", i, byte); - /// } + /// let result = 'ß'.encode_utf8(&mut b); + /// + /// assert_eq!(result, "ß"); + /// + /// assert_eq!(result.len(), 2); + /// ``` + /// + /// A buffer that's too small: + /// + /// ``` + /// #![feature(unicode)] + /// use std::thread; + /// + /// let result = thread::spawn(|| { + /// let mut b = [0; 1]; + /// + /// // this panics + /// 'ß'.encode_utf8(&mut b); + /// }).join(); + /// + /// assert!(result.is_err()); /// ``` - #[unstable(feature = "unicode", issue = "27784")] + #[unstable(feature = "unicode", + reason = "pending decision about Iterator/Writer/Reader", + issue = "27784")] #[inline] - pub fn encode_utf8(self) -> EncodeUtf8 { - C::encode_utf8(self) + pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str { + C::encode_utf8(self, dst) } - /// Returns an iterator over the `u16` entries of this character as UTF-16. + /// Encodes this character as UTF-16 into the provided `u16` buffer, + /// and then returns the subslice of the buffer that contains the encoded character. /// - /// The returned iterator also has an `as_slice()` method to view the - /// encoded form as a slice. + /// # Panics + /// + /// Panics if the buffer is not large enough. + /// A buffer of length 2 is large enough to encode any `char`. /// /// # Examples /// + /// In both of these examples, '𝕊' takes two `u16`s to encode. 
+ /// /// ``` /// #![feature(unicode)] /// - /// let iterator = '𝕊'.encode_utf16(); - /// assert_eq!(iterator.as_slice(), [0xd835, 0xdd4a]); + /// let mut b = [0; 2]; /// - /// for (i, val) in iterator.enumerate() { - /// println!("entry {}: {:x}", i, val); - /// } + /// let result = '𝕊'.encode_utf16(&mut b); + /// + /// assert_eq!(result.len(), 2); /// ``` - #[unstable(feature = "unicode", issue = "27784")] + /// + /// A buffer that's too small: + /// + /// ``` + /// #![feature(unicode)] + /// use std::thread; + /// + /// let result = thread::spawn(|| { + /// let mut b = [0; 1]; + /// + /// // this panics + /// '𝕊'.encode_utf16(&mut b); + /// }).join(); + /// + /// assert!(result.is_err()); + /// ``` + #[unstable(feature = "unicode", + reason = "pending decision about Iterator/Writer/Reader", + issue = "27784")] #[inline] - pub fn encode_utf16(self) -> EncodeUtf16 { - C::encode_utf16(self) + pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { + C::encode_utf16(self, dst) } /// Returns true if this `char` is an alphabetic code point, and false if not. 
diff --git a/src/librustc_unicode/u_str.rs b/src/librustc_unicode/u_str.rs index eb5b6feeb7ec4..19e419e37a09c 100644 --- a/src/librustc_unicode/u_str.rs +++ b/src/librustc_unicode/u_str.rs @@ -157,13 +157,13 @@ impl Iterator for Utf16Encoder return Some(tmp); } + let mut buf = [0; 2]; self.chars.next().map(|ch| { - let n = CharExt::encode_utf16(ch); - let n = n.as_slice(); - if n.len() == 2 { - self.extra = n[1]; + let n = CharExt::encode_utf16(ch, &mut buf).len(); + if n == 2 { + self.extra = buf[1]; } - n[0] + buf[0] }) } diff --git a/src/libserialize/json.rs b/src/libserialize/json.rs index 6ccc0be41bc0f..5e25c61bae995 100644 --- a/src/libserialize/json.rs +++ b/src/libserialize/json.rs @@ -433,9 +433,7 @@ fn escape_str(wr: &mut fmt::Write, v: &str) -> EncodeResult { } fn escape_char(writer: &mut fmt::Write, v: char) -> EncodeResult { - escape_str(writer, unsafe { - str::from_utf8_unchecked(v.encode_utf8().as_slice()) - }) + escape_str(writer, v.encode_utf8(&mut [0; 4])) } fn spaces(wr: &mut fmt::Write, mut n: usize) -> EncodeResult { diff --git a/src/libstd/sys/common/wtf8.rs b/src/libstd/sys/common/wtf8.rs index 8d357aa78c9e9..0a94ff1e95823 100644 --- a/src/libstd/sys/common/wtf8.rs +++ b/src/libstd/sys/common/wtf8.rs @@ -206,10 +206,12 @@ impl Wtf8Buf { /// Copied from String::push /// This does **not** include the WTF-8 concatenation check. 
fn push_code_point_unchecked(&mut self, code_point: CodePoint) { - let bytes = unsafe { - char::from_u32_unchecked(code_point.value).encode_utf8() + let c = unsafe { + char::from_u32_unchecked(code_point.value) }; - self.bytes.extend_from_slice(bytes.as_slice()); + let mut bytes = [0; 4]; + let bytes = c.encode_utf8(&mut bytes).as_bytes(); + self.bytes.extend_from_slice(bytes) } #[inline] @@ -738,15 +740,16 @@ impl<'a> Iterator for EncodeWide<'a> { return Some(tmp); } + let mut buf = [0; 2]; self.code_points.next().map(|code_point| { - let n = unsafe { - char::from_u32_unchecked(code_point.value).encode_utf16() + let c = unsafe { + char::from_u32_unchecked(code_point.value) }; - let n = n.as_slice(); - if n.len() == 2 { - self.extra = n[1]; + let n = c.encode_utf16(&mut buf).len(); + if n == 2 { + self.extra = buf[1]; } - n[0] + buf[0] }) }