From b2079e33f176bd62ac368a236f2f9e0ca44ed5b0 Mon Sep 17 00:00:00 2001
From: Jonathan Giddy <jgiddy@cloudflare.com>
Date: Sun, 30 Jul 2023 12:12:41 +0100
Subject: [PATCH] Document that `read::GzDecoder` consumes bytes after end of gzip

Add tests showing that the `GzDecoder`s in `bufread` and `write` support
reading immediately after end of gzip data.

Co-authored-by: Sebastian Thiel <sebastian.thiel@icloud.com>
---
 src/gz/bufread.rs | 48 +++++++++++++++++++++++++++++++++++++++++++++++
 src/gz/read.rs    | 14 ++++++++++++--
 src/gz/write.rs   | 28 ++++++++++++++++++++++++++++
 src/lib.rs        |  7 +++++++
 4 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs
index e59ebc0f..6fc48bcd 100644
--- a/src/gz/bufread.rs
+++ b/src/gz/bufread.rs
@@ -432,3 +432,51 @@ impl<R: BufRead> Read for MultiGzDecoder<R> {
         self.0.read(into)
     }
 }
+
+#[cfg(test)]
+mod test {
+    use crate::bufread::GzDecoder;
+    use crate::gz::write;
+    use crate::Compression;
+    use std::io::{Read, Write};
+
+    // GzDecoder consumes one gzip member and then returns 0 for subsequent reads, allowing any
+    // additional data to be consumed by the caller.
+    #[test]
+    fn decode_extra_data() {
+        let expected = "Hello World";
+
+        let compressed = {
+            let mut e = write::GzEncoder::new(Vec::new(), Compression::default());
+            e.write_all(expected.as_ref()).unwrap();
+            let mut b = e.finish().unwrap();
+            b.push(b'x');
+            b
+        };
+
+        let mut output = Vec::new();
+        let mut decoder = GzDecoder::new(compressed.as_slice());
+        let decoded_bytes = decoder.read_to_end(&mut output).unwrap();
+        assert_eq!(decoded_bytes, output.len());
+        let actual = std::str::from_utf8(&output).expect("String parsing error");
+        assert_eq!(
+            actual, expected,
+            "after decompression we obtain the original input"
+        );
+
+        output.clear();
+        // Use a non-empty buffer; reading into an empty one returns 0 unconditionally.
+        assert_eq!(
+            decoder.read(&mut [0u8; 1]).unwrap(),
+            0,
+            "subsequent read of decoder returns 0, but inner reader can return additional data"
+        );
+        let mut reader = decoder.into_inner();
+        assert_eq!(
+            reader.read_to_end(&mut output).unwrap(),
+            1,
+            "extra data is accessible in underlying buf-read"
+        );
+        assert_eq!(output, b"x");
+    }
+}
diff --git a/src/gz/read.rs b/src/gz/read.rs
index 8732fdc2..5a65526c 100644
--- a/src/gz/read.rs
+++ b/src/gz/read.rs
@@ -90,7 +90,7 @@ impl<R: Read + Write> Write for GzEncoder<R> {
     }
 }
 
-/// A decoder for the first member of a [gzip file].
+/// A decoder for a single member of a [gzip file].
 ///
 /// This structure exposes a [`Read`] interface that will consume compressed
 /// data from the underlying reader and emit uncompressed data.
@@ -155,6 +155,9 @@ impl<R> GzDecoder<R> {
     }
 
     /// Acquires a reference to the underlying reader.
+    ///
+    /// Note that the decoder may have read past the end of the gzip data.
+    /// To prevent this, use [`bufread::GzDecoder`] instead.
     pub fn get_ref(&self) -> &R {
         self.inner.get_ref().get_ref()
     }
@@ -162,12 +165,19 @@ impl<R> GzDecoder<R> {
     /// Acquires a mutable reference to the underlying stream.
     ///
     /// Note that mutation of the stream may result in surprising results if
-    /// this decoder is continued to be used.
+    /// this decoder continues to be used.
+    ///
+    /// Note that the decoder may have read past the end of the gzip data.
+    /// To prevent this, use [`bufread::GzDecoder`] instead.
     pub fn get_mut(&mut self) -> &mut R {
         self.inner.get_mut().get_mut()
     }
 
     /// Consumes this decoder, returning the underlying reader.
+    ///
+    /// Note that the decoder may have read past the end of the gzip data.
+    /// Subsequent reads will skip those bytes. To prevent this, use
+    /// [`bufread::GzDecoder`] instead.
     pub fn into_inner(self) -> R {
         self.inner.into_inner().into_inner()
     }
diff --git a/src/gz/write.rs b/src/gz/write.rs
index d5e8b8e5..74d6c5ac 100644
--- a/src/gz/write.rs
+++ b/src/gz/write.rs
@@ -610,4 +610,32 @@ mod tests {
         let expected = STR.repeat(2);
         assert_eq!(return_string, expected);
     }
+
+    // GzDecoder consumes one gzip member and then returns 0 for subsequent writes, allowing any
+    // additional data to be consumed by the caller.
+    #[test]
+    fn decode_extra_data() {
+        let compressed = {
+            let mut e = GzEncoder::new(Vec::new(), Compression::default());
+            e.write_all(STR.as_ref()).unwrap();
+            let mut b = e.finish().unwrap();
+            b.push(b'x');
+            b
+        };
+
+        let mut writer = Vec::new();
+        let mut decoder = GzDecoder::new(writer);
+        let mut consumed_bytes = 0;
+        loop {
+            let n = decoder.write(&compressed[consumed_bytes..]).unwrap();
+            if n == 0 {
+                break;
+            }
+            consumed_bytes += n;
+        }
+        writer = decoder.finish().unwrap();
+        let actual = String::from_utf8(writer).expect("String parsing error");
+        assert_eq!(actual, STR);
+        assert_eq!(&compressed[consumed_bytes..], b"x");
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 127e2354..8c000b03 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -117,7 +117,14 @@ mod zlib;
 /// Types which operate over [`Read`] streams, both encoders and decoders for
 /// various formats.
 ///
+/// Note that the `read` decoder types may read past the end of the compressed
+/// data while decoding. If the caller requires subsequent reads to start
+/// immediately following the compressed data, wrap the `Read` type in a
+/// [`BufReader`] and use the `BufReader` with the equivalent decoder from the
+/// `bufread` module and also for the subsequent reads.
+///
 /// [`Read`]: https://doc.rust-lang.org/std/io/trait.Read.html
+/// [`BufReader`]: https://doc.rust-lang.org/std/io/struct.BufReader.html
 pub mod read {
     pub use crate::deflate::read::DeflateDecoder;
     pub use crate::deflate::read::DeflateEncoder;