Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make slice->str conversion and related functions const #90607

Merged
merged 2 commits into from
Nov 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions library/alloc/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#![feature(const_btree_new)]
#![feature(const_default_impls)]
#![feature(const_trait_impl)]
#![feature(const_str_from_utf8)]

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
Expand Down
64 changes: 61 additions & 3 deletions library/alloc/tests/str.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::assert_matches::assert_matches;
use std::borrow::Cow;
use std::cmp::Ordering::{Equal, Greater, Less};
use std::str::{from_utf8, from_utf8_unchecked};
Expand Down Expand Up @@ -883,6 +884,33 @@ fn test_is_utf8() {
assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
}

#[test]
fn test_const_is_utf8() {
const _: () = {
// deny overlong encodings
assert!(from_utf8(&[0xc0, 0x80]).is_err());
assert!(from_utf8(&[0xc0, 0xae]).is_err());
assert!(from_utf8(&[0xe0, 0x80, 0x80]).is_err());
assert!(from_utf8(&[0xe0, 0x80, 0xaf]).is_err());
assert!(from_utf8(&[0xe0, 0x81, 0x81]).is_err());
assert!(from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err());
assert!(from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err());

// deny surrogates
assert!(from_utf8(&[0xED, 0xA0, 0x80]).is_err());
assert!(from_utf8(&[0xED, 0xBF, 0xBF]).is_err());

assert!(from_utf8(&[0xC2, 0x80]).is_ok());
assert!(from_utf8(&[0xDF, 0xBF]).is_ok());
assert!(from_utf8(&[0xE0, 0xA0, 0x80]).is_ok());
assert!(from_utf8(&[0xED, 0x9F, 0xBF]).is_ok());
assert!(from_utf8(&[0xEE, 0x80, 0x80]).is_ok());
assert!(from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok());
assert!(from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok());
assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
};
}

#[test]
fn from_utf8_mostly_ascii() {
// deny invalid bytes embedded in long stretches of ascii
Expand All @@ -895,13 +923,43 @@ fn from_utf8_mostly_ascii() {
}
}

#[test]
fn const_from_utf8_mostly_ascii() {
const _: () = {
// deny invalid bytes embedded in long stretches of ascii
let mut i = 32;
while i < 64 {
let mut data = [0; 128];
data[i] = 0xC0;
assert!(from_utf8(&data).is_err());
data[i] = 0xC2;
assert!(from_utf8(&data).is_err());

i = i + 1;
}
};
}

#[test]
fn from_utf8_error() {
macro_rules! test {
($input: expr, $expected_valid_up_to: expr, $expected_error_len: expr) => {
($input: expr, $expected_valid_up_to:pat, $expected_error_len:pat) => {
let error = from_utf8($input).unwrap_err();
assert_eq!(error.valid_up_to(), $expected_valid_up_to);
assert_eq!(error.error_len(), $expected_error_len);
assert_matches!(error.valid_up_to(), $expected_valid_up_to);
assert_matches!(error.error_len(), $expected_error_len);

const _: () = {
match from_utf8($input) {
Err(error) => {
let valid_up_to = error.valid_up_to();
let error_len = error.error_len();

assert!(matches!(valid_up_to, $expected_valid_up_to));
assert!(matches!(error_len, $expected_error_len));
}
Ok(_) => unreachable!(),
}
};
};
}
test!(b"A\xC3\xA9 \xFF ", 4, Some(1));
Expand Down
3 changes: 3 additions & 0 deletions library/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
#![allow(explicit_outlives_requirements)]
//
// Library features for const fns:
#![feature(const_align_offset)]
#![feature(const_align_of_val)]
#![feature(const_alloc_layout)]
#![feature(const_arguments_as_str)]
Expand Down Expand Up @@ -130,6 +131,7 @@
#![feature(const_size_of_val)]
#![feature(const_slice_from_raw_parts)]
#![feature(const_slice_ptr_len)]
#![feature(const_str_from_utf8_unchecked_mut)]
#![feature(const_swap)]
#![feature(const_trait_impl)]
#![feature(const_type_id)]
Expand All @@ -138,6 +140,7 @@
#![feature(duration_consts_2)]
#![feature(ptr_metadata)]
#![feature(slice_ptr_get)]
#![feature(str_internals)]
#![feature(variant_count)]
#![feature(const_array_from_ref)]
#![feature(const_slice_from_ref)]
Expand Down
31 changes: 22 additions & 9 deletions library/core/src/str/converts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,16 @@ use super::Utf8Error;
/// assert_eq!("💖", sparkle_heart);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
run_utf8_validation(v)?;
// SAFETY: Just ran validation.
Ok(unsafe { from_utf8_unchecked(v) })
#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
pub const fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
// This should use `?` again, once it's `const`
match run_utf8_validation(v) {
WaffleLapkin marked this conversation as resolved.
Show resolved Hide resolved
Ok(_) => {
// SAFETY: validation succeeded.
Ok(unsafe { from_utf8_unchecked(v) })
}
Err(err) => Err(err),
}
}

/// Converts a mutable slice of bytes to a mutable string slice.
Expand Down Expand Up @@ -119,10 +125,16 @@ pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
/// See the docs for [`Utf8Error`] for more details on the kinds of
/// errors that can be returned.
#[stable(feature = "str_mut_extras", since = "1.20.0")]
pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
run_utf8_validation(v)?;
// SAFETY: Just ran validation.
Ok(unsafe { from_utf8_unchecked_mut(v) })
#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
// This should use `?` again, once it's `const`
match run_utf8_validation(v) {
Ok(_) => {
// SAFETY: validation succeeded.
Ok(unsafe { from_utf8_unchecked_mut(v) })
}
Err(err) => Err(err),
}
}

/// Converts a slice of bytes to a string slice without checking
Expand Down Expand Up @@ -184,7 +196,8 @@ pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
#[inline]
#[must_use]
#[stable(feature = "str_mut_extras", since = "1.20.0")]
pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked_mut", issue = "91005")]
pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
// SAFETY: the caller must guarantee that the bytes `v`
// are valid UTF-8, thus the cast to `*mut str` is safe.
// Also, the pointer dereference is safe because that pointer
Expand Down
12 changes: 9 additions & 3 deletions library/core/src/str/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,10 @@ impl Utf8Error {
/// assert_eq!(1, error.valid_up_to());
/// ```
#[stable(feature = "utf8_error", since = "1.5.0")]
#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
#[must_use]
#[inline]
pub fn valid_up_to(&self) -> usize {
pub const fn valid_up_to(&self) -> usize {
self.valid_up_to
}

Expand All @@ -94,10 +95,15 @@ impl Utf8Error {
///
/// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html
#[stable(feature = "utf8_error_error_len", since = "1.20.0")]
#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
#[must_use]
#[inline]
pub fn error_len(&self) -> Option<usize> {
self.error_len.map(|len| len as usize)
pub const fn error_len(&self) -> Option<usize> {
// This should become `map` again, once it's `const`
match self.error_len {
WaffleLapkin marked this conversation as resolved.
Show resolved Hide resolved
Some(len) => Some(len as usize),
None => None,
}
}
}

Expand Down
19 changes: 10 additions & 9 deletions library/core/src/str/validations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,25 @@ use super::Utf8Error;
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
/// for width 3, and 3 bits for width 4.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
(byte & (0x7F >> width)) as u32
}

/// Returns the value of `ch` updated with continuation byte `byte`.
#[inline]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
(ch << 6) | (byte & CONT_MASK) as u32
}

/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
/// bits `10`).
#[inline]
pub(super) fn utf8_is_cont_byte(byte: u8) -> bool {
pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
(byte as i8) < -64
}

#[inline]
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
const fn unwrap_or_0(opt: Option<&u8>) -> u8 {
match opt {
Some(&byte) => byte,
None => 0,
Expand Down Expand Up @@ -105,14 +105,15 @@ const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;

/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
#[inline]
fn contains_nonascii(x: usize) -> bool {
const fn contains_nonascii(x: usize) -> bool {
(x & NONASCII_MASK) != 0
}

/// Walks through `v` checking that it's a valid UTF-8 sequence,
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
#[inline(always)]
pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
#[rustc_const_unstable(feature = "str_internals", issue = "none")]
pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
let mut index = 0;
let len = v.len();

Expand Down Expand Up @@ -142,7 +143,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {

let first = v[index];
if first >= 128 {
let w = UTF8_CHAR_WIDTH[first as usize];
let w = utf8_char_width(first);
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
// first C2 80 last DF BF
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
Expand Down Expand Up @@ -230,7 +231,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
}

// https://tools.ietf.org/html/rfc3629
WaffleLapkin marked this conversation as resolved.
Show resolved Hide resolved
static UTF8_CHAR_WIDTH: [u8; 256] = [
const UTF8_CHAR_WIDTH: &[u8; 256] = &[
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, // 0x1F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Expand All @@ -253,7 +254,7 @@ static UTF8_CHAR_WIDTH: [u8; 256] = [
#[unstable(feature = "str_internals", issue = "none")]
#[must_use]
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
pub const fn utf8_char_width(b: u8) -> usize {
UTF8_CHAR_WIDTH[b as usize] as usize
}

Expand Down