diff --git a/src/decoder/zlib.rs b/src/decoder/zlib.rs
index 2e183bee..d9eba517 100644
--- a/src/decoder/zlib.rs
+++ b/src/decoder/zlib.rs
@@ -12,8 +12,12 @@ pub(super) struct ZlibStream {
     /// The decoder sometimes wants inspect some already finished bytes for further decoding. So we
     /// keep a total of 32KB of decoded data available as long as more data may be appended.
     out_buffer: Vec<u8>,
-    /// The cursor position in the output stream as a buffer index.
+    /// The first index of `out_buffer` where new data can be written.
     out_pos: usize,
+    /// The first index of `out_buffer` that hasn't yet been passed to our client
+    /// (i.e. not yet appended to the `image_data` parameter of `fn decompress` or `fn
+    /// finish_compressed_chunks`).
+    read_pos: usize,
     /// Limit on how many bytes can be decompressed in total.  This field is mostly used for
     /// performance optimizations (e.g. to avoid allocating and zeroing out large buffers when only
     /// a small image is being decoded).
@@ -33,6 +37,7 @@ impl ZlibStream {
             started: false,
             out_buffer: Vec::new(),
             out_pos: 0,
+            read_pos: 0,
             max_total_output: usize::MAX,
             ignore_adler32: true,
         }
@@ -42,6 +47,7 @@ impl ZlibStream {
         self.started = false;
         self.out_buffer.clear();
         self.out_pos = 0;
+        self.read_pos = 0;
         self.max_total_output = usize::MAX;
         *self.state = Decompressor::new();
     }
@@ -94,6 +100,7 @@ impl ZlibStream {
         self.started = true;
         self.out_pos += out_consumed;
         self.transfer_finished_data(image_data);
+        self.compact_out_buffer_if_needed();
 
         Ok(in_consumed)
     }
@@ -128,11 +135,12 @@ impl ZlibStream {
                     transferred > 0 || out_consumed > 0,
                     "No more forward progress made in stream decoding."
                 );
+                self.compact_out_buffer_if_needed();
             }
         }
 
-        self.out_buffer.truncate(self.out_pos);
-        image_data.append(&mut self.out_buffer);
+        self.transfer_finished_data(image_data);
+        self.out_buffer.clear();
         Ok(())
     }
 
@@ -179,10 +187,37 @@ impl ZlibStream {
     }
 
     fn transfer_finished_data(&mut self, image_data: &mut Vec<u8>) -> usize {
-        let safe = self.out_pos.saturating_sub(CHUNCK_BUFFER_SIZE);
-        // TODO: allocation limits.
-        image_data.extend(self.out_buffer.drain(..safe));
-        self.out_pos -= safe;
-        safe
+        let transferred = &self.out_buffer[self.read_pos..self.out_pos];
+        image_data.extend_from_slice(transferred);
+        self.read_pos = self.out_pos;
+        transferred.len()
+    }
+
+    fn compact_out_buffer_if_needed(&mut self) {
+        // [PNG spec](https://www.w3.org/TR/2003/REC-PNG-20031110/#10Compression) says that
+        // "deflate/inflate compression with a sliding window (which is an upper bound on the
+        // distances appearing in the deflate stream) of at most 32768 bytes".
+        //
+        // `fdeflate` requires that we keep this many most recently decompressed bytes in the
+        // `out_buffer` - this allows referring back to them when handling "length and distance
+        // codes" in the deflate stream).
+        const LOOKBACK_SIZE: usize = 32768;
+
+        // Compact `self.out_buffer` when "needed".  Doing this conditionally helps to put an upper
+        // bound on the amortized cost of copying the data within `self.out_buffer`.
+        //
+        // TODO: The factor of 4 is an ad-hoc heuristic.  Consider measuring and using a different
+        // factor.  (Early experiments seem to indicate that factor of 4 is faster than a factor of
+        // 2 and 4 * `LOOKBACK_SIZE` seems like an acceptable memory trade-off.  Higher factors
+        // result in higher memory usage, but the compaction cost is lower - factor of 4 means
+        // that 1 byte gets copied during compaction for 3 decompressed bytes.)
+        if self.out_pos > LOOKBACK_SIZE * 4 {
+            // Only preserve the `lookback_buffer` and "throw away" the earlier prefix.
+            let lookback_buffer = self.out_pos.saturating_sub(LOOKBACK_SIZE)..self.out_pos;
+            let preserved_len = lookback_buffer.len();
+            self.out_buffer.copy_within(lookback_buffer, 0);
+            self.read_pos = preserved_len;
+            self.out_pos = preserved_len;
+        }
     }
 }