Skip to content

Commit

Permalink
Merge pull request #3735 from resistor/main
Browse files Browse the repository at this point in the history
Implement a fast path for character counting in wc.
  • Loading branch information
sylvestre authored Jul 22, 2022
2 parents f82ada6 + 417ad0e commit ec9130a
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 1 deletion.
30 changes: 30 additions & 0 deletions src/uu/wc/src/count_fast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,33 @@ pub(crate) fn count_bytes_and_lines_fast<R: Read>(
}
}
}

/// Counts the Unicode characters of a UTF-8 byte stream pulled from a reader.
///
/// This is the fast path behind the `-m` command line flag of wc.
///
/// No decoding is performed: every byte that is not a UTF-8 continuation
/// byte (bit pattern `10xx_xxxx`) is counted as one character.
///
/// # Arguments
///
/// * `handle` - The reader from which the UTF-8 stream will be read.
///
/// # Returns
///
/// The accumulated `WordCount` (only its `chars` field is updated here),
/// paired with the I/O error that stopped reading, if any. Reads failing
/// with `ErrorKind::Interrupted` are retried rather than reported.
pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io::Error>) {
    /// The two high bits that tag a byte's role within a UTF-8 sequence.
    const TAG_MASK: u8 = 0b1100_0000;
    /// Value of the tag bits carried by a continuation byte.
    const CONT_TAG: u8 = 0b1000_0000;

    let mut counts = WordCount::default();
    let mut chunk = [0; BUF_SIZE];
    loop {
        // Resolve the read first; every non-data outcome leaves the loop body early.
        let filled = match handle.read(&mut chunk) {
            // A zero-length read marks the end of the stream.
            Ok(0) => return (counts, None),
            Ok(len) => len,
            // An interrupted read is transient, so simply try again.
            Err(ref err) if err.kind() == ErrorKind::Interrupted => continue,
            // Any other failure is handed back together with the partial count.
            Err(err) => return (counts, Some(err)),
        };

        // Each character contributes exactly one non-continuation byte.
        let char_starts = chunk[..filled]
            .iter()
            .filter(|&&byte| (byte & TAG_MASK) != CONT_TAG)
            .count();
        counts.chars += char_starts;
    }
}
3 changes: 2 additions & 1 deletion src/uu/wc/src/wc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ extern crate uucore;
mod count_fast;
mod countable;
mod word_count;
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast, count_chars_fast};
use countable::WordCountable;
use unicode_width::UnicodeWidthChar;
use utf8::{BufReadDecoder, BufReadDecoderError};
Expand Down Expand Up @@ -315,6 +315,7 @@ fn word_count_from_reader<T: WordCountable>(
) {
// Specialize scanning loop to improve the performance.
(false, false, false, false, false) => unreachable!(),
(false, true, false, false, false) => count_chars_fast(&mut reader),
(true, false, false, false, false) => {
// Fast path when only show_bytes is true.
let (bytes, error) = count_bytes_fast(&mut reader);
Expand Down

0 comments on commit ec9130a

Please sign in to comment.