From ab29171bdf91c7f135bdd7e9b0409916f010ea46 Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Tue, 9 Aug 2022 22:14:21 -0700 Subject: [PATCH] Fix a bug in split where chunking would be skipped when the chunk size happened to be an exact divisor of the buffer size used to read the input stream. The issue here was that the file was being split byte-wise in chunks of 1G. The input stream was being read in chunks of 8KB, which evenly divides the chunk size. Because the check to allocate the next output chunk was previously done at the bottom of the loop, the allocation would never happen, since the current input chunk was already fully consumed at that point. By moving the check to the top of the loop (but still late enough that we know we have bytes to write) we resolve this issue. This scenario is unfortunately hard to write a test for, since we don't explicitly control the input chunk size. Fixes https://github.com/uutils/coreutils/issues/3790 --- src/uu/split/src/split.rs | 37 ++++++++++++++++--------------------- tests/by-util/test_split.rs | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index ad93fd8435e..37e535b6b8f 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -618,6 +618,21 @@ impl<'a> Write for ByteChunkWriter<'a> { return Ok(carryover_bytes_written); } + if self.num_bytes_remaining_in_current_chunk == 0 { + // Increment the chunk number, reset the number of bytes remaining, and instantiate the new underlying writer. + self.num_chunks_written += 1; + self.num_bytes_remaining_in_current_chunk = self.chunk_size; + + // Allocate the new file, since at this point we know there are bytes to be written to it. 
+ let filename = self.filename_iterator.next().ok_or_else(|| { + std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted") + })?; + if self.settings.verbose { + println!("creating file {}", filename.quote()); + } + self.inner = self.settings.instantiate_current_writer(&filename)?; + } + // If the capacity of this chunk is greater than the number of // bytes in `buf`, then write all the bytes in `buf`. Otherwise, // write enough bytes to fill the current chunk, then increment @@ -635,38 +650,18 @@ impl<'a> Write for ByteChunkWriter<'a> { // n, which is already usize. let i = self.num_bytes_remaining_in_current_chunk as usize; let num_bytes_written = self.inner.write(&buf[..i])?; + self.num_bytes_remaining_in_current_chunk -= num_bytes_written as u64; // It's possible that the underlying writer did not // write all the bytes. if num_bytes_written < i { - self.num_bytes_remaining_in_current_chunk -= num_bytes_written as u64; return Ok(carryover_bytes_written + num_bytes_written); } else { // Move the window to look at only the remaining bytes. buf = &buf[i..]; - // Increment the chunk number, reset the number of - // bytes remaining, and instantiate the new - // underlying writer. - self.num_chunks_written += 1; - self.num_bytes_remaining_in_current_chunk = self.chunk_size; - // Remember for the next iteration that we wrote these bytes. carryover_bytes_written += num_bytes_written; - - // Only create the writer for the next chunk if - // there are any remaining bytes to write. This - // check prevents us from creating a new empty - // file. 
- if !buf.is_empty() { - let filename = self.filename_iterator.next().ok_or_else(|| { - std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted") - })?; - if self.settings.verbose { - println!("creating file {}", filename.quote()); - } - self.inner = self.settings.instantiate_current_writer(&filename)?; - } } } } diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index f9cfcdbf9b4..355fea2c77b 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -683,3 +683,18 @@ fn test_guard_input() { .stderr_only("split: 'xaa' would overwrite input; aborting"); assert_eq!(at.read("xaa"), "1\n2\n3\n"); } + +#[test] +fn test_multiple_of_input_chunk() { + let (at, mut ucmd) = at_and_ucmd!(); + let name = "multiple_of_input_chunk"; + RandomFile::new(&at, name).add_bytes(16 * 1024); + ucmd.args(&["-b", "8K", name, "b"]).succeeds(); + + let glob = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 2); + for filename in glob.collect() { + assert_eq!(glob.directory.metadata(&filename).len(), 8 * 1024); + } + assert_eq!(glob.collate(), at.read_bytes(name)) +}