diff --git a/README.md b/README.md
index 98312930c7..9bb3def74e 100644
--- a/README.md
+++ b/README.md
@@ -272,6 +272,10 @@ The packages contains the same as the standard library, so you can use the godoc
 
 Currently there is only minor speedup on decompression (mostly CRC32 calculation).
 
+Memory usage is typically 1MB for a Writer. The standard library is in the same range.
+If you expect to have a lot of concurrently allocated Writers, consider using
+the stateless compression described below.
+
 # Stateless compression
 
 This package offers stateless compression as a special option for gzip/deflate. 
diff --git a/flate/fast_encoder.go b/flate/fast_encoder.go
index 678f081052..347ac2c902 100644
--- a/flate/fast_encoder.go
+++ b/flate/fast_encoder.go
@@ -45,7 +45,7 @@ const (
 
 	bTableBits   = 17                                               // Bits used in the big tables
 	bTableSize   = 1 << bTableBits                                  // Size of the table
-	allocHistory = maxStoreBlockSize * 10                           // Size to preallocate for history.
+	allocHistory = maxStoreBlockSize * 5                            // Size to preallocate for history.
 	bufferReset  = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this.
 )
 
diff --git a/flate/huffman_code.go b/flate/huffman_code.go
index 50fb2718f3..67b2b38728 100644
--- a/flate/huffman_code.go
+++ b/flate/huffman_code.go
@@ -21,9 +21,13 @@ type hcode struct {
 }
 
 type huffmanEncoder struct {
-	codes     []hcode
-	freqcache []literalNode
-	bitCount  [17]int32
+	codes    []hcode
+	bitCount [17]int32
+
+	// freqcache is a reusable scratch buffer; generate slices it to len(freq)+1.
+	// Possible lengths are codegenCodeCount, offsetCodeCount and literalCount.
+	// The largest of these is literalCount, so the array is sized for that case.
+	freqcache [literalCount + 1]literalNode
 }
 
 type literalNode struct {
@@ -306,12 +310,6 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 // freq  An array of frequencies, in which frequency[i] gives the frequency of literal i.
 // maxBits  The maximum number of bits to use for any literal.
 func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
-	if h.freqcache == nil {
-		// Allocate a reusable buffer with the longest possible frequency table.
-		// Possible lengths are codegenCodeCount, offsetCodeCount and literalCount.
-		// The largest of these is literalCount, so we allocate for that case.
-		h.freqcache = make([]literalNode, literalCount+1)
-	}
 	list := h.freqcache[:len(freq)+1]
 	// Number of non-zero literals
 	count := 0
diff --git a/flate/reader_test.go b/flate/reader_test.go
index 55439646d7..f851c3227e 100644
--- a/flate/reader_test.go
+++ b/flate/reader_test.go
@@ -6,6 +6,8 @@ package flate
 
 import (
 	"bytes"
+	"compress/flate"
+	"fmt"
 	"io"
 	"io/ioutil"
 	"runtime"
@@ -13,6 +15,52 @@ import (
 	"testing"
 )
 
+// TestMemUsage logs heap growth and live-object deltas for stateless compression
+// and for a Writer at each level, for both this package and compress/flate.
+func TestMemUsage(t *testing.T) {
+	testMem := func(t *testing.T, fn func()) {
+		var before, after runtime.MemStats
+		runtime.GC()
+		runtime.ReadMemStats(&before)
+		fn()
+		runtime.GC()
+		runtime.ReadMemStats(&after)
+		// Signed deltas: the heap may shrink across fn, which would wrap a uint64.
+		usedKB := (int64(after.HeapInuse) - int64(before.HeapInuse)) / 1024
+		t.Logf("%s: Memory Used: %dKB, %d allocs", t.Name(), usedKB, int64(after.HeapObjects)-int64(before.HeapObjects))
+	}
+	data := make([]byte, 100000)
+	t.Run("stateless", func(t *testing.T) {
+		testMem(t, func() { StatelessDeflate(ioutil.Discard, data, false, nil) })
+	})
+	for level := HuffmanOnly; level <= BestCompression; level++ {
+		t.Run(fmt.Sprint("level-", level), func(t *testing.T) {
+			var zw *Writer
+			testMem(t, func() {
+				var err error
+				if zw, err = NewWriter(ioutil.Discard, level); err != nil {
+					t.Fatal(err)
+				}
+				zw.Write(data)
+			})
+			zw.Close()
+		})
+	}
+	for level := HuffmanOnly; level <= BestCompression; level++ {
+		t.Run(fmt.Sprint("stdlib-", level), func(t *testing.T) {
+			var zw *flate.Writer
+			testMem(t, func() {
+				var err error
+				if zw, err = flate.NewWriter(ioutil.Discard, level); err != nil {
+					t.Fatal(err)
+				}
+				zw.Write(data)
+			})
+			zw.Close()
+		})
+	}
+}
+
 func TestNlitOutOfRange(t *testing.T) {
 	// Trying to decode this bogus flate data, which has a Huffman table
 	// with nlit=288, should not panic.