From 13762cae05bd36370bb23baf516ce7e98425187f Mon Sep 17 00:00:00 2001
From: Owen Anderson
Date: Wed, 20 Jul 2022 22:33:03 -0700
Subject: [PATCH 1/2] Implement a fast path for character counting in wc.

When wc is invoked with only the -m flag, we only need to count the
number of Unicode characters in the input. In order to do so, we don't
actually need to decode the input bytes into characters. Rather, we can
simply count the number of non-continuation bytes in the UTF-8 stream,
since every character will contain exactly one non-continuation byte.

On my laptop, this speeds up `wc -m odyssey1024.txt` from 745ms to 109ms.
---
 src/uu/wc/src/count_fast.rs | 23 +++++++++++++++++++++++
 src/uu/wc/src/wc.rs         |  3 ++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs
index 555225a067d..62b1a3ae5f8 100644
--- a/src/uu/wc/src/count_fast.rs
+++ b/src/uu/wc/src/count_fast.rs
@@ -134,3 +134,26 @@ pub(crate) fn count_bytes_and_lines_fast(
         }
     }
 }
+
+pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io::Error>) {
+    /// Mask of the value bits of a continuation byte
+    const CONT_MASK: u8 = 0b0011_1111u8;
+    /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
+    const TAG_CONT_U8: u8 = 0b1000_0000u8;
+
+    let mut total = WordCount::default();
+    let mut buf = [0; BUF_SIZE];
+    loop {
+        match handle.read(&mut buf) {
+            Ok(0) => return (total, None),
+            Ok(n) => {
+                total.chars += buf[..n]
+                    .iter()
+                    .filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
+                    .count();
+            }
+            Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
+            Err(e) => return (total, Some(e)),
+        }
+    }
+}
diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs
index e938c1bf0af..91d1feeed9f 100644
--- a/src/uu/wc/src/wc.rs
+++ b/src/uu/wc/src/wc.rs
@@ -13,7 +13,7 @@ extern crate uucore;
 mod count_fast;
 mod countable;
 mod word_count;
-use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
+use count_fast::{count_bytes_and_lines_fast, count_bytes_fast, count_chars_fast};
 use countable::WordCountable;
 use unicode_width::UnicodeWidthChar;
 use utf8::{BufReadDecoder, BufReadDecoderError};
@@ -315,6 +315,7 @@ fn word_count_from_reader(
     ) {
         // Specialize scanning loop to improve the performance.
         (false, false, false, false, false) => unreachable!(),
+        (false, true, false, false, false) => count_chars_fast(&mut reader),
         (true, false, false, false, false) => {
             // Fast path when only show_bytes is true.
             let (bytes, error) = count_bytes_fast(&mut reader);

From 417ad0e3844449551e1aed34c147cd101ece1263 Mon Sep 17 00:00:00 2001
From: Owen Anderson
Date: Wed, 20 Jul 2022 23:32:50 -0700
Subject: [PATCH 2/2] Add rustdoc comment.

---
 src/uu/wc/src/count_fast.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs
index 62b1a3ae5f8..5c8d97afdc7 100644
--- a/src/uu/wc/src/count_fast.rs
+++ b/src/uu/wc/src/count_fast.rs
@@ -135,6 +135,13 @@ pub(crate) fn count_bytes_and_lines_fast(
     }
 }
 
+/// Returns a WordCount that counts the number of Unicode characters encoded in UTF-8 read via a Reader.
+///
+/// This corresponds to the `-m` command line flag to wc.
+///
+/// # Arguments
+///
+/// * `R` - A Reader from which the UTF-8 stream will be read.
 pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io::Error>) {
     /// Mask of the value bits of a continuation byte
     const CONT_MASK: u8 = 0b0011_1111u8;