From b2079e33f176bd62ac368a236f2f9e0ca44ed5b0 Mon Sep 17 00:00:00 2001
From: Jonathan Giddy <jgiddy@cloudflare.com>
Date: Sun, 30 Jul 2023 12:12:41 +0100
Subject: [PATCH] Document that `read::GzDecoder` consumes bytes after end of
 gzip

Add tests showing that the `GzDecoder`s in `bufread` and `write`
support reading immediately after end of gzip data.

Co-authored-by: Sebastian Thiel <sebastian.thiel@icloud.com>
---
 src/gz/bufread.rs | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 src/gz/read.rs    | 14 ++++++++++++--
 src/gz/write.rs   | 28 ++++++++++++++++++++++++++++
 src/lib.rs        |  7 +++++++
 4 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs
index e59ebc0f..6fc48bcd 100644
--- a/src/gz/bufread.rs
+++ b/src/gz/bufread.rs
@@ -432,3 +432,50 @@ impl<R: BufRead> Read for MultiGzDecoder<R> {
         self.0.read(into)
     }
 }
+
+#[cfg(test)]
+mod test {
+    use crate::bufread::GzDecoder;
+    use crate::gz::write;
+    use crate::Compression;
+    use std::io::{Read, Write};
+
+    // GzDecoder consumes one gzip member and then returns 0 for subsequent reads, allowing any
+    // additional data to be consumed by the caller through the underlying reader.
+    #[test]
+    fn decode_extra_data() {
+        let expected = "Hello World";
+
+        let compressed = {
+            let mut e = write::GzEncoder::new(Vec::new(), Compression::default());
+            e.write_all(expected.as_bytes()).unwrap();
+            let mut b = e.finish().unwrap();
+            b.push(b'x');
+            b
+        };
+
+        let mut output = Vec::new();
+        let mut decoder = GzDecoder::new(compressed.as_slice());
+        let decoded_bytes = decoder.read_to_end(&mut output).unwrap();
+        assert_eq!(decoded_bytes, output.len());
+        let actual = std::str::from_utf8(&output).expect("String parsing error");
+        assert_eq!(
+            actual, expected,
+            "after decompression we obtain the original input"
+        );
+        // Probe with a non-empty buffer: an empty one would make the 0 result vacuous.
+        output.clear();
+        assert_eq!(
+            decoder.read(&mut [0u8; 1]).unwrap(),
+            0,
+            "subsequent read of decoder returns 0, but inner reader can return additional data"
+        );
+        let mut reader = decoder.into_inner();
+        assert_eq!(
+            reader.read_to_end(&mut output).unwrap(),
+            1,
+            "extra data is accessible in underlying buf-read"
+        );
+        assert_eq!(output, b"x");
+    }
+}
diff --git a/src/gz/read.rs b/src/gz/read.rs
index 8732fdc2..5a65526c 100644
--- a/src/gz/read.rs
+++ b/src/gz/read.rs
@@ -90,7 +90,7 @@ impl<R: Read + Write> Write for GzEncoder<R> {
     }
 }
 
-/// A decoder for the first member of a [gzip file].
+/// A decoder for a single member of a [gzip file].
 ///
 /// This structure exposes a [`Read`] interface that will consume compressed
 /// data from the underlying reader and emit uncompressed data.
@@ -155,6 +155,9 @@ impl<R> GzDecoder<R> {
     }
 
     /// Acquires a reference to the underlying reader.
+    ///
+    /// Note that the decoder may have read past the end of the gzip data.
+    /// To prevent this, use [`bufread::GzDecoder`] instead.
     pub fn get_ref(&self) -> &R {
         self.inner.get_ref().get_ref()
     }
@@ -162,12 +165,19 @@ impl<R> GzDecoder<R> {
     /// Acquires a mutable reference to the underlying stream.
     ///
     /// Note that mutation of the stream may result in surprising results if
-    /// this decoder is continued to be used.
+    /// this decoder continues to be used.
+    ///
+    /// Note that the decoder may have read past the end of the gzip data.
+    /// To prevent this, use [`bufread::GzDecoder`] instead.
     pub fn get_mut(&mut self) -> &mut R {
         self.inner.get_mut().get_mut()
     }
 
     /// Consumes this decoder, returning the underlying reader.
+    ///
+    /// Note that the decoder may have read past the end of the gzip data.
+    /// Subsequent reads from the returned reader will skip those bytes.
+    /// To prevent this, use [`bufread::GzDecoder`] instead.
     pub fn into_inner(self) -> R {
         self.inner.into_inner().into_inner()
     }
diff --git a/src/gz/write.rs b/src/gz/write.rs
index d5e8b8e5..74d6c5ac 100644
--- a/src/gz/write.rs
+++ b/src/gz/write.rs
@@ -610,4 +610,32 @@ mod tests {
         let expected = STR.repeat(2);
         assert_eq!(return_string, expected);
     }
+
+    // GzDecoder consumes one gzip member and then returns 0 for subsequent writes, allowing any
+    // additional data to be consumed by the caller.
+    #[test]
+    fn decode_extra_data() {
+        let compressed = {
+            let mut e = GzEncoder::new(Vec::new(), Compression::default());
+            e.write_all(STR.as_ref()).unwrap();
+            let mut b = e.finish().unwrap();
+            b.push(b'x');
+            b
+        };
+
+        let mut decoder = GzDecoder::new(Vec::new());
+        let mut consumed_bytes = 0;
+        // Plain `write` on purpose: its return value counts the bytes the decoder accepted.
+        loop {
+            let n = decoder.write(&compressed[consumed_bytes..]).unwrap();
+            if n == 0 {
+                break;
+            }
+            consumed_bytes += n;
+        }
+        let writer = decoder.finish().unwrap();
+        let actual = String::from_utf8(writer).expect("String parsing error");
+        assert_eq!(actual, STR);
+        assert_eq!(&compressed[consumed_bytes..], b"x");
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 127e2354..8c000b03 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -117,7 +117,14 @@ mod zlib;
 /// Types which operate over [`Read`] streams, both encoders and decoders for
 /// various formats.
 ///
+/// Note that the `read` decoder types may read past the end of the compressed
+/// data while decoding. If the caller requires subsequent reads to start
+/// immediately following the compressed data, wrap the `Read` type in a
+/// [`BufReader`], pass that `BufReader` to the equivalent decoder from the
+/// `bufread` module, and keep using the same `BufReader` for subsequent reads.
+///
 /// [`Read`]: https://doc.rust-lang.org/std/io/trait.Read.html
+/// [`BufReader`]: https://doc.rust-lang.org/std/io/struct.BufReader.html
 pub mod read {
     pub use crate::deflate::read::DeflateDecoder;
     pub use crate::deflate::read::DeflateEncoder;