Skip to content

Commit

Permalink
Rollup merge of #86497 - clarfonthey:nearest_char_boundary, r=scottmcm
Browse files Browse the repository at this point in the history
Add {floor,ceil}_char_boundary methods to str

This is technically already used internally by the standard library in the form of `truncate_to_char_boundary`.

Essentially these are two building blocks to allow for approximate string truncation, where you want to cut off the string at "approximately" a given length in bytes but don't know exactly where the character boundaries lie. It's also a good candidate for the standard library as it can easily be done naively, but would be difficult to properly optimise. Although the existing code that's done in error messages is done naively, this code will explicitly only check a window of 4 bytes since we know that a boundary must lie in that range, and because it will make it possible to vectorise.

Although this method doesn't take into account graphemes or other properties, this would still be a required building block for splitting that takes those into account. For example, if you wanted to split at a grapheme boundary, you could take your approximate splitting point and then determine the graphemes immediately following and preceding the split. If you then notice that these two graphemes could be merged, you can decide to either include the whole grapheme or exclude it depending on whether you decide splitting should shrink or expand the string.

This takes the most conservative approach and just offers the raw indices to the user, and they can decide how to use them. That way, the methods are as useful as possible despite having as few methods as possible.

(Note: I'll add some tests and a tracking issue if it's decided that this is worth including.)
  • Loading branch information
matthiaskrgr authored Feb 8, 2022
2 parents e7cc3bd + edd318c commit 1f841fc
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 23 deletions.
1 change: 1 addition & 0 deletions library/alloc/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#![feature(binary_heap_as_slice)]
#![feature(inplace_iteration)]
#![feature(iter_advance_by)]
#![feature(round_char_boundary)]
#![feature(slice_group_by)]
#![feature(slice_partition_dedup)]
#![feature(string_remove_matches)]
Expand Down
92 changes: 92 additions & 0 deletions library/alloc/tests/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2272,3 +2272,95 @@ fn utf8_char_counts() {
}
}
}

// Checks `str::floor_char_boundary` against empty, ASCII and 2/3/4-byte-per-char strings,
// including indices at and beyond the end of the string.
#[test]
fn floor_char_boundary() {
    // Asserts that every index produced by `indices` is floored to `expected` within `text`.
    fn check_many(text: &str, indices: impl IntoIterator<Item = usize>, expected: usize) {
        indices.into_iter().for_each(|idx| {
            let actual = text.floor_char_boundary(idx);
            assert_eq!(
                actual, expected,
                "{:?}.floor_char_boundary({:?}) != {:?}",
                text, idx, expected
            );
        });
    }

    // edge case
    check_many("", [0, 1, isize::MAX as usize, usize::MAX], 0);

    // basic check
    check_many("x", [0], 0);
    check_many("x", [1, isize::MAX as usize, usize::MAX], 1);

    // 1-byte chars
    check_many("jp", [0], 0);
    check_many("jp", [1], 1);
    check_many("jp", 2..4, 2);

    // 2-byte chars
    for (indices, expected) in [(0..2, 0), (2..4, 2), (4..6, 4)] {
        check_many("ĵƥ", indices, expected);
    }

    // 3-byte chars
    for (indices, expected) in [(0..3, 0), (3..6, 3), (6..8, 6)] {
        check_many("日本", indices, expected);
    }

    // 4-byte chars
    for (indices, expected) in [(0..4, 0), (4..8, 4), (8..10, 8)] {
        check_many("🇯🇵", indices, expected);
    }
}

// Checks `str::ceil_char_boundary` against empty, ASCII and 2/3/4-byte-per-char strings
// for every in-bounds index.
#[test]
fn ceil_char_boundary() {
    // Asserts that every index produced by `indices` is rounded up to `expected` within `text`.
    fn check_many(text: &str, indices: impl IntoIterator<Item = usize>, expected: usize) {
        indices.into_iter().for_each(|idx| {
            let actual = text.ceil_char_boundary(idx);
            assert_eq!(
                actual, expected,
                "{:?}.ceil_char_boundary({:?}) != {:?}",
                text, idx, expected
            );
        });
    }

    // edge case
    check_many("", [0], 0);

    // basic check
    check_many("x", [0], 0);
    check_many("x", [1], 1);

    // 1-byte chars
    check_many("jp", [0], 0);
    check_many("jp", [1], 1);
    check_many("jp", [2], 2);

    // 2-byte chars
    for (indices, expected) in [(0..=0, 0), (1..=2, 2), (3..=4, 4)] {
        check_many("ĵƥ", indices, expected);
    }

    // 3-byte chars
    for (indices, expected) in [(0..=0, 0), (1..=3, 3), (4..=6, 6)] {
        check_many("日本", indices, expected);
    }

    // 4-byte chars
    for (indices, expected) in [(0..=0, 0), (1..=4, 4), (5..=8, 8)] {
        check_many("🇯🇵", indices, expected);
    }
}

// An index one past the end of the string is out of range for `ceil_char_boundary`
// and is expected to panic.
#[test]
#[should_panic]
fn ceil_char_boundary_above_len_panic() {
    let text = "x";
    let past_the_end = 2;
    let _ = text.ceil_char_boundary(past_the_end);
}
5 changes: 5 additions & 0 deletions library/core/src/num/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,11 @@ impl u8 {
pub fn escape_ascii(&self) -> ascii::EscapeDefault {
ascii::escape_default(*self)
}

/// Returns `true` if this byte is not a UTF-8 continuation byte, i.e. it is either
/// below 128 or at least 192, and can therefore sit at a character boundary.
///
/// Marked `#[inline]` because it is called from `#[inline]` string methods
/// (`is_char_boundary`, `floor_char_boundary`, `ceil_char_boundary`); without the hint,
/// callers inlined into other crates could be left with an out-of-line call for what is
/// a single comparison.
#[inline]
pub(crate) fn is_utf8_char_boundary(self) -> bool {
    // This is bit magic equivalent to: b < 128 || b >= 192
    (self as i8) >= -0x40
}
}

#[lang = "u16"]
Expand Down
88 changes: 78 additions & 10 deletions library/core/src/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,14 @@ use iter::MatchIndicesInternal;
use iter::SplitInternal;
use iter::{MatchesInternal, SplitNInternal};

use validations::truncate_to_char_boundary;

#[inline(never)]
#[cold]
#[track_caller]
fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
const MAX_DISPLAY_LENGTH: usize = 256;
let (truncated, s_trunc) = truncate_to_char_boundary(s, MAX_DISPLAY_LENGTH);
let ellipsis = if truncated { "[...]" } else { "" };
let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH);
let s_trunc = &s[..trunc_len];
let ellipsis = if trunc_len < s.len() { "[...]" } else { "" };

// 1. out of bounds
if begin > s.len() || end > s.len() {
Expand All @@ -105,10 +104,7 @@ fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
// 3. character boundary
let index = if !s.is_char_boundary(begin) { begin } else { end };
// find the character
let mut char_start = index;
while !s.is_char_boundary(char_start) {
char_start -= 1;
}
let char_start = s.floor_char_boundary(index);
// `char_start` must be less than len and a char boundary
let ch = s[char_start..].chars().next().unwrap();
let char_range = char_start..char_start + ch.len_utf8();
Expand Down Expand Up @@ -215,8 +211,80 @@ impl str {
// code on higher opt-levels. See PR #84751 for more details.
None => index == self.len(),

// This is bit magic equivalent to: b < 128 || b >= 192
Some(&b) => (b as i8) >= -0x40,
Some(&b) => b.is_utf8_char_boundary(),
}
}

/// Returns the largest `x <= index` for which `is_char_boundary(x)` is `true`.
///
/// If `index` is at or past the end of the string, the length of the string is returned.
///
/// This is useful for truncating a string to at most a given number of bytes while
/// keeping it valid UTF-8. The rounding happens purely at the character level, so a
/// grapheme may still be visually split even though no character is. For example, the
/// emoji 🧑‍🔬 (scientist) could be split so that the string only includes 🧑 (person)
/// instead.
///
/// # Examples
///
/// ```
/// #![feature(round_char_boundary)]
/// let s = "❤️🧡💛💚💙💜";
/// assert_eq!(s.len(), 26);
/// assert!(!s.is_char_boundary(13));
///
/// let closest = s.floor_char_boundary(13);
/// assert_eq!(closest, 10);
/// assert_eq!(&s[..closest], "❤️🧡");
/// ```
#[unstable(feature = "round_char_boundary", issue = "93743")]
#[inline]
pub fn floor_char_boundary(&self, index: usize) -> usize {
    if index >= self.len() {
        return self.len();
    }

    // Only the byte at `index` and up to three bytes before it need to be inspected.
    let window_start = index.saturating_sub(3);
    let window = &self.as_bytes()[window_start..=index];
    let offset = window.iter().rposition(|byte| byte.is_utf8_char_boundary());

    // SAFETY: we know that the character boundary will be within four bytes
    window_start + unsafe { offset.unwrap_unchecked() }
}

/// Returns the smallest `x >= index` for which `is_char_boundary(x)` is `true`.
///
/// This is the rounding-up counterpart of [`floor_char_boundary`]; see that method
/// for more details.
///
/// [`floor_char_boundary`]: str::floor_char_boundary
///
/// # Panics
///
/// Panics if `index > self.len()`.
///
/// # Examples
///
/// ```
/// #![feature(round_char_boundary)]
/// let s = "❤️🧡💛💚💙💜";
/// assert_eq!(s.len(), 26);
/// assert!(!s.is_char_boundary(13));
///
/// let closest = s.ceil_char_boundary(13);
/// assert_eq!(closest, 14);
/// assert_eq!(&s[..closest], "❤️🧡💛");
/// ```
#[unstable(feature = "round_char_boundary", issue = "93743")]
#[inline]
pub fn ceil_char_boundary(&self, index: usize) -> usize {
    if index > self.len() {
        // Out-of-range indices are reported through the shared slicing failure path.
        slice_error_fail(self, index, index);
    }

    // Look at no more than four bytes starting at `index`, clamped to the end of the string.
    let window_end = Ord::min(index + 4, self.len());
    let window = &self.as_bytes()[index..window_end];

    match window.iter().position(|byte| byte.is_utf8_char_boundary()) {
        Some(offset) => index + offset,
        // No boundary byte inside the window: fall back to the window's end.
        None => window_end,
    }
}

Expand Down
13 changes: 0 additions & 13 deletions library/core/src/str/validations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,16 +273,3 @@ pub const fn utf8_char_width(b: u8) -> usize {

/// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;

// Shortens `s` to at most `max` bytes, backing up to the closest char boundary
// at or below `max`.
// Returns whether anything was cut off, together with the resulting str.
pub(super) fn truncate_to_char_boundary(s: &str, max: usize) -> (bool, &str) {
    if max >= s.len() {
        return (false, s);
    }

    // Walk downwards from `max`; index 0 is always a char boundary, so the search
    // cannot come up empty.
    let cut = (0..=max).rev().find(|&i| s.is_char_boundary(i)).unwrap_or(0);
    (true, &s[..cut])
}

0 comments on commit 1f841fc

Please sign in to comment.