Commit

Add memtable flush pacing mechanism
Ryanfsdf committed Jul 9, 2019
1 parent a3902b0 commit 621dd5a
Showing 26 changed files with 661 additions and 68 deletions.
124 changes: 119 additions & 5 deletions batch.go
@@ -892,8 +892,10 @@ func newFlushableBatch(batch *Batch, comparer *Comparer) *flushableBatch {
index: uint32(index),
}
if keySize := uint32(len(key)); keySize == 0 {
entry.keyStart = uint32(offset)
entry.keyEnd = uint32(offset)
// Must add 2 to the offset. One byte encodes `kind` and the next
// byte encodes `0`, which is the length of the key.
entry.keyStart = uint32(offset) + 2
entry.keyEnd = entry.keyStart
} else {
entry.keyStart = uint32(uintptr(unsafe.Pointer(&key[0])) -
uintptr(unsafe.Pointer(&b.data[0])))
@@ -949,6 +951,19 @@ func (b *flushableBatch) newIter(o *IterOptions) internalIterator {
}
}

func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
return &flushFlushableBatchIter{
flushableBatchIter: flushableBatchIter{
batch: b,
data: b.data,
offsets: b.offsets,
cmp: b.cmp,
index: -1,
},
bytesIterated: bytesFlushed,
}
}

func (b *flushableBatch) newRangeDelIter(o *IterOptions) internalIterator {
if len(b.rangeDelOffsets) == 0 {
return nil
@@ -983,6 +998,10 @@ func (b *flushableBatch) newRangeDelIter(o *IterOptions) internalIterator {
return rangedel.NewIter(b.cmp, b.tombstones)
}

func (b *flushableBatch) totalBytes() uint64 {
return uint64(len(b.data) - batchHeaderLen)
}

func (b *flushableBatch) flushed() chan struct{} {
return b.flushedCh
}
@@ -1076,6 +1095,8 @@ func (i *flushableBatchIter) Last() (*InternalKey, []byte) {
return &i.key, i.Value()
}

// Note: flushFlushableBatchIter.Next mirrors the implementation of
// flushableBatchIter.Next for performance reasons. Keep the two in sync.
func (i *flushableBatchIter) Next() (*InternalKey, []byte) {
if i.index == len(i.offsets) {
return nil, nil
@@ -1120,10 +1141,26 @@ func (i *flushableBatchIter) Key() *InternalKey {
}

func (i *flushableBatchIter) Value() []byte {
offset := i.offsets[i.index].offset
_, _, value, ok := batchDecode(i.batch.data, offset)
if !ok {
p := i.data[i.offsets[i.index].offset:]
if len(p) == 0 {
i.err = fmt.Errorf("corrupted batch")
return nil
}
kind := InternalKeyKind(p[0])
if kind > InternalKeyKindMax {
i.err = fmt.Errorf("corrupted batch")
return nil
}
var value []byte
var ok bool
switch kind {
case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
keyEnd := i.offsets[i.index].keyEnd
_, value, ok = batchDecodeStr(i.data[keyEnd:])
if !ok {
i.err = fmt.Errorf("corrupted batch")
return nil
}
}
return value
}
@@ -1144,3 +1181,80 @@ func (i *flushableBatchIter) SetBounds(lower, upper []byte) {
i.lower = lower
i.upper = upper
}

// flushFlushableBatchIter is similar to flushableBatchIter but it keeps track
// of the number of bytes iterated.
type flushFlushableBatchIter struct {
flushableBatchIter
bytesIterated *uint64
}

// flushFlushableBatchIter implements the internalIterator interface.
var _ internalIterator = (*flushFlushableBatchIter)(nil)

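For orientation, a minimal usage sketch of the flush iterator (it mirrors TestFlushableBatchBytesIterated in batch_test.go below; fb is assumed to be a *flushableBatch produced by newFlushableBatch, and the caller owns the counter that the iterator bumps as a side effect of iteration):

    var bytesIterated uint64
    it := fb.newFlushIter(nil /* iter options */, &bytesIterated)
    for key, _ := it.First(); key != nil; key, _ = it.Next() {
        // bytesIterated now reflects the encoded size of every entry returned
        // so far, which is what the flush loop uses to pace itself.
    }
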
func (i *flushFlushableBatchIter) SeekGE(key []byte) (*InternalKey, []byte) {
panic("pebble: SeekGE unimplemented")
}

func (i *flushFlushableBatchIter) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
panic("pebble: SeekPrefixGE unimplemented")
}

func (i *flushFlushableBatchIter) SeekLT(key []byte) (*InternalKey, []byte) {
panic("pebble: SeekLT unimplemented")
}

func (i *flushFlushableBatchIter) First() (*InternalKey, []byte) {
key, val := i.flushableBatchIter.First()
if key == nil {
return nil, nil
}
entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
*i.bytesIterated += uint64(entryBytes) + i.valueSize()
return key, val
}

// Note: flushFlushableBatchIter.Next mirrors the implementation of
// flushableBatchIter.Next for performance reasons. Keep the two in sync.
func (i *flushFlushableBatchIter) Next() (*InternalKey, []byte) {
if i.index == len(i.offsets) {
return nil, nil
}
i.index++
if i.index == len(i.offsets) {
return nil, nil
}
i.key = i.getKey(i.index)
entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
*i.bytesIterated += uint64(entryBytes) + i.valueSize()
return &i.key, i.Value()
}

func (i flushFlushableBatchIter) Prev() (*InternalKey, []byte) {
panic("pebble: Prev unimplemented")
}

func (i flushFlushableBatchIter) valueSize() uint64 {
p := i.data[i.offsets[i.index].offset:]
if len(p) == 0 {
i.err = fmt.Errorf("corrupted batch")
return 0
}
kind := InternalKeyKind(p[0])
if kind > InternalKeyKindMax {
i.err = fmt.Errorf("corrupted batch")
return 0
}
var length uint64
switch kind {
case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
keyEnd := i.offsets[i.index].keyEnd
v, n := binary.Uvarint(i.data[keyEnd:])
if n <= 0 {
i.err = fmt.Errorf("corrupted batch")
return 0
}
length = v + uint64(n)
}
return length
}
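Taken together, the two halves of the accounting cover one complete batch record. A record is encoded as a one-byte kind, a varint key length, the key, and, for kinds that carry a value, a varint value length followed by the value (this is the layout the Value and valueSize code above decodes). keyEnd - offset accounts for the kind byte, the key-length varint, and the key, while valueSize() accounts for the value-length varint and the value, so for a batch of point operations the two sums add up to totalBytes() = len(data) - batchHeaderLen. A small illustrative sketch of the arithmetic for a single hypothetical entry, using standard varint sizes:

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func main() {
        var buf [binary.MaxVarintLen64]byte
        keyLen, valLen := uint64(8), uint64(300)
        keyLenVarint := binary.PutUvarint(buf[:], keyLen) // 1 byte for 8
        valLenVarint := binary.PutUvarint(buf[:], valLen) // 2 bytes for 300

        entryBytes := 1 + uint64(keyLenVarint) + keyLen // kind + key-length varint + key
        valueBytes := uint64(valLenVarint) + valLen     // value-length varint + value

        // Prints 10 302 312: the full encoded size of this record.
        fmt.Println(entryBytes, valueBytes, entryBytes+valueBytes)
    }

TestFlushableBatchBytesIterated below checks exactly this identity over batches with varying key and value sizes.
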
21 changes: 21 additions & 0 deletions batch_test.go
@@ -412,6 +412,27 @@ func TestFlushableBatchDeleteRange(t *testing.T) {
})
}

func TestFlushableBatchBytesIterated(t *testing.T) {
batch := newBatch(nil)
for j := 0; j < 1000; j++ {
key := make([]byte, 8 + j%3)
value := make([]byte, 7 + j%5)
batch.Set(key, value, nil)

fb := newFlushableBatch(batch, DefaultComparer)

var bytesIterated uint64
it := fb.newFlushIter(nil, &bytesIterated)

for it.First(); it.Valid(); it.Next() {}

expected := fb.totalBytes()
if bytesIterated != expected {
t.Fatalf("bytesIterated: got %d, want %d", bytesIterated, expected)
}
}
}

func BenchmarkBatchSet(b *testing.B) {
value := make([]byte, 10)
for i := range value {
1 change: 1 addition & 0 deletions cmd/pebble/db.go
@@ -54,6 +54,7 @@ func newPebbleDB(dir string) DB {
DisableWAL: disableWAL,
MemTableSize: 64 << 20,
MemTableStopWritesThreshold: 4,
MinFlushRate: 4 << 20,
L0CompactionThreshold: 2,
L0SlowdownWritesThreshold: 20,
L0StopWritesThreshold: 32,
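For context on the new knob, a minimal sketch of configuring flush pacing from application code. The MinFlushRate field name is taken from this diff; the Open call, import path, and values are illustrative and follow the repository layout at the time of this commit:

    package main

    import "github.com/petermattis/pebble"

    func main() {
        db, err := pebble.Open("demo-db", &pebble.Options{
            MemTableSize:                64 << 20, // 64 MB memtables
            MemTableStopWritesThreshold: 4,
            // MinFlushRate is the slowest rate, in bytes per second, at which
            // the pacing mechanism will let a memtable flush proceed.
            MinFlushRate: 4 << 20, // 4 MB/s
        })
        if err != nil {
            panic(err)
        }
        defer db.Close()
    }

The 4 MB/s value simply mirrors the setting added to cmd/pebble/db.go above.
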
84 changes: 75 additions & 9 deletions compaction.go
@@ -6,12 +6,14 @@ package pebble

import (
"bytes"
"context"
"errors"
"fmt"
"math"
"os"
"sort"
"sync/atomic"
"time"
"unsafe"

"github.com/petermattis/pebble/internal/base"
@@ -66,6 +68,8 @@ type compaction struct {

// flushing contains the flushables (aka memtables) that are being flushed.
flushing []flushable
// bytesIterated contains the number of bytes that have been flushed/compacted.
bytesIterated uint64
// inputs are the tables to be compacted.
inputs [2][]fileMetadata

@@ -461,7 +465,7 @@ func (c *compaction) newInputIter(
if len(c.flushing) != 0 {
if len(c.flushing) == 1 {
f := c.flushing[0]
iter := f.newIter(nil)
iter := f.newFlushIter(nil, &c.bytesIterated)
if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil {
return newMergingIter(c.cmp, iter, rangeDelIter), nil
}
@@ -470,7 +474,7 @@
iters := make([]internalIterator, 0, 2*len(c.flushing))
for i := range c.flushing {
f := c.flushing[i]
iters = append(iters, f.newIter(nil))
iters = append(iters, f.newFlushIter(nil, &c.bytesIterated))
rangeDelIter := f.newRangeDelIter(nil)
if rangeDelIter != nil {
iters = append(iters, rangeDelIter)
@@ -498,9 +502,9 @@ func (c *compaction) newInputIter(
// one which iterates over the range deletions. These two iterators are
// combined with a mergingIter.
newRangeDelIter := func(
f *fileMetadata, _ *IterOptions,
f *fileMetadata, _ *IterOptions, bytesIterated *uint64,
) (internalIterator, internalIterator, error) {
iter, rangeDelIter, err := newIters(f, nil /* iter options */)
iter, rangeDelIter, err := newIters(f, nil /* iter options */, &c.bytesIterated)
if err == nil {
// TODO(peter): It is mildly wasteful to open the point iterator only to
// immediately close it. One way to solve this would be to add new
@@ -524,12 +528,12 @@
}

if c.startLevel != 0 {
iters = append(iters, newLevelIter(nil, c.cmp, newIters, c.inputs[0]))
iters = append(iters, newLevelIter(nil, c.cmp, newRangeDelIter, c.inputs[0]))
iters = append(iters, newLevelIter(nil, c.cmp, newIters, c.inputs[0], &c.bytesIterated))
iters = append(iters, newLevelIter(nil, c.cmp, newRangeDelIter, c.inputs[0], &c.bytesIterated))
} else {
for i := range c.inputs[0] {
f := &c.inputs[0][i]
iter, rangeDelIter, err := newIters(f, nil /* iter options */)
iter, rangeDelIter, err := newIters(f, nil /* iter options */, &c.bytesIterated)
if err != nil {
return nil, fmt.Errorf("pebble: could not open table %d: %v", f.fileNum, err)
}
@@ -540,8 +544,8 @@
}
}

iters = append(iters, newLevelIter(nil, c.cmp, newIters, c.inputs[1]))
iters = append(iters, newLevelIter(nil, c.cmp, newRangeDelIter, c.inputs[1]))
iters = append(iters, newLevelIter(nil, c.cmp, newIters, c.inputs[1], &c.bytesIterated))
iters = append(iters, newLevelIter(nil, c.cmp, newRangeDelIter, c.inputs[1], &c.bytesIterated))
return newMergingIter(c.cmp, iters...), nil
}

@@ -990,7 +994,58 @@ func (d *DB) runCompaction(c *compaction) (
return nil
}

var prevBytesIterated uint64
var iterCount int
totalBytes := d.memTableTotalBytes()
refreshDirtyBytesThreshold := uint64(d.opts.MemTableSize * 5 / 100)

for key, val := iter.First(); key != nil; key, val = iter.Next() {
// Slow down memtable flushing to match fill rate.
if c.flushing != nil {
// Recalculate the total memtable bytes only once every 1000 iterations or
// when the refresh threshold is hit, since getting the total memtable
// byte count requires grabbing DB.mu, which is expensive.
if iterCount >= 1000 || c.bytesIterated > refreshDirtyBytesThreshold {
totalBytes = d.memTableTotalBytes()
refreshDirtyBytesThreshold = c.bytesIterated + uint64(d.opts.MemTableSize * 5 / 100)
iterCount = 0
}
iterCount++

// dirtyBytes is the total number of bytes in the memtables minus the number of
// bytes flushed. It represents unflushed bytes in all the memtables, even the
// ones which aren't being flushed such as the mutable memtable.
dirtyBytes := totalBytes - c.bytesIterated
flushAmount := c.bytesIterated - prevBytesIterated
prevBytesIterated = c.bytesIterated

// We slow down memtable flushing when the dirty bytes indicator falls
// below the low watermark, which is 105% of the memtable size. This will
// only occur if memtable flushing can keep up with the pace of incoming
// writes. If writes come in faster than the memtable can flush, flushing
// proceeds at maximum (unthrottled) speed.
if dirtyBytes <= uint64(d.opts.MemTableSize * 105 / 100) {
burst := d.flushLimiter.Burst()
for flushAmount > uint64(burst) {
err := d.flushLimiter.WaitN(context.Background(), burst)
if err != nil {
return nil, pendingOutputs, err
}
flushAmount -= uint64(burst)
}
err := d.flushLimiter.WaitN(context.Background(), int(flushAmount))
if err != nil {
return nil, pendingOutputs, err
}
} else {
burst := d.flushLimiter.Burst()
for flushAmount > uint64(burst) {
d.flushLimiter.AllowN(time.Now(), burst)
flushAmount -= uint64(burst)
}
d.flushLimiter.AllowN(time.Now(), int(flushAmount))
}
}
// TODO(peter,rangedel): Need to incorporate the range tombstones in the
// shouldStopBefore decision.
if tw != nil && (tw.EstimatedSize() >= c.maxOutputFileSize || c.shouldStopBefore(*key)) {
@@ -1033,6 +1088,17 @@
return ve, pendingOutputs, nil
}

// memTableTotalBytes returns the total number of bytes in the memtables. Note
// that this includes the mutable memtable as well.
func (d *DB) memTableTotalBytes() (totalBytes uint64) {
d.mu.Lock()
for _, m := range d.mu.mem.queue {
totalBytes += m.totalBytes()
}
d.mu.Unlock()
return totalBytes
}
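The pacing decision in runCompaction reads more clearly in isolation. Below is a condensed, hypothetical sketch of the same logic, assuming d.flushLimiter is a token-bucket limiter with the Burst/WaitN/AllowN methods of golang.org/x/time/rate (which the calls above match); the function name and packaging are illustrative, not part of this diff:

    package pacing

    import (
        "context"
        "time"

        "golang.org/x/time/rate"
    )

    // pace throttles a memtable flush so it roughly tracks the rate of incoming
    // writes. dirtyBytes is the unflushed data across all memtables (the
    // memTableTotalBytes value minus bytes already flushed) and flushed is the
    // number of bytes written by the flush since pace was last called.
    func pace(limiter *rate.Limiter, memTableSize, dirtyBytes, flushed uint64) error {
        if dirtyBytes <= memTableSize*105/100 {
            // Below the low watermark: flushing is keeping up with writes, so
            // block on the limiter and proceed at the configured minimum flush
            // rate. WaitN cannot request more than Burst() tokens at once, so
            // larger amounts are consumed in burst-sized chunks.
            burst := limiter.Burst()
            for flushed > uint64(burst) {
                if err := limiter.WaitN(context.Background(), burst); err != nil {
                    return err
                }
                flushed -= uint64(burst)
            }
            return limiter.WaitN(context.Background(), int(flushed))
        }
        // Above the watermark: writes are outpacing the flush. Deduct tokens
        // without blocking (AllowN) so the limiter's bookkeeping stays current,
        // but let the flush run at full speed.
        burst := limiter.Burst()
        for flushed > uint64(burst) {
            limiter.AllowN(time.Now(), burst)
            flushed -= uint64(burst)
        }
        limiter.AllowN(time.Now(), int(flushed))
        return nil
    }

The asymmetry is the point of the mechanism: the limiter only slows a flush down toward the minimum flush rate when flushing is already ahead of the write load; it never blocks a flush that is falling behind.
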

// scanObsoleteFiles scans the filesystem for files that are no longer needed
// and adds those to the internal lists of obsolete files. Note that the files
// are not actually deleted by this method. A subsequent call to
