diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 555225a067d..5c8d97afdc7 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -134,3 +134,33 @@ pub(crate) fn count_bytes_and_lines_fast( } } } + +/// Returns a WordCount that counts the number of Unicode characters encoded in UTF-8 read via a Reader. +/// +/// This corresponds to the `-m` command line flag to wc. +/// +/// # Arguments +/// +/// * `R` - A Reader from which the UTF-8 stream will be read. +pub(crate) fn count_chars_fast(handle: &mut R) -> (WordCount, Option) { + /// Mask of the value bits of a continuation byte + const CONT_MASK: u8 = 0b0011_1111u8; + /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte + const TAG_CONT_U8: u8 = 0b1000_0000u8; + + let mut total = WordCount::default(); + let mut buf = [0; BUF_SIZE]; + loop { + match handle.read(&mut buf) { + Ok(0) => return (total, None), + Ok(n) => { + total.chars += buf[..n] + .iter() + .filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8) + .count(); + } + Err(ref e) if e.kind() == ErrorKind::Interrupted => continue, + Err(e) => return (total, Some(e)), + } + } +} diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index e938c1bf0af..91d1feeed9f 100644 --- a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -13,7 +13,7 @@ extern crate uucore; mod count_fast; mod countable; mod word_count; -use count_fast::{count_bytes_and_lines_fast, count_bytes_fast}; +use count_fast::{count_bytes_and_lines_fast, count_bytes_fast, count_chars_fast}; use countable::WordCountable; use unicode_width::UnicodeWidthChar; use utf8::{BufReadDecoder, BufReadDecoderError}; @@ -315,6 +315,7 @@ fn word_count_from_reader( ) { // Specialize scanning loop to improve the performance. (false, false, false, false, false) => unreachable!(), + (false, true, false, false, false) => count_chars_fast(&mut reader), (true, false, false, false, false) => { // Fast path when only show_bytes is true. let (bytes, error) = count_bytes_fast(&mut reader);