From 8e5423b1ffd4cb57600c7209cf038277278dace6 Mon Sep 17 00:00:00 2001 From: Andrew Werner Date: Wed, 8 Jul 2020 22:19:34 +0000 Subject: [PATCH] storage: optimize MVCCGarbageCollect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prior to this change, MVCCGarbageCollect performed a linear scan of all versions of a key, not just the versions being garbage collected. Given the pagination of deleting versions above this call, the linear behavior can result in quadratic runtime of GC when the number of versions vastly exceeds the page size. The benchmark results demonstrate the change's effectiveness. It's worth noting that for a single key with a single version, the change has a negative performance impact. I suspect this is due to the allocation of a key in order to construct the iterator. In cases involving more keys, I theorize the positive change is due to the fact that now the iterator never seeks backwards due to the sorting of the keys. It's worth noting that since 20.1, the GC queue has been sending keys in the GC request in reverse order. I anticipate that this sorting is likely a good thing in that case too. The stepping optimization seemed important in the microbenchmarks for cases where most of the data was garbage. Without it, the change had a small negative impact on performance. 
``` name old time/op new time/op delta MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1/numVersions=2/deleteVersions=1-24 3.39µs ± 1% 3.96µs ± 0% +16.99% (p=0.004 n=6+5) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=1-24 319µs ± 3% 10µs ±12% -96.88% (p=0.002 n=6+6) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=16-24 319µs ± 2% 16µs ±10% -94.95% (p=0.002 n=6+6) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=32-24 319µs ± 3% 21µs ± 5% -93.52% (p=0.002 n=6+6) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=512-24 337µs ± 1% 182µs ± 3% -46.00% (p=0.002 n=6+6) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=1015-24 361µs ± 0% 353µs ± 2% -2.32% (p=0.010 n=4+6) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=1023-24 361µs ± 3% 350µs ± 2% -3.14% (p=0.009 n=6+6) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1024/numVersions=2/deleteVersions=1-24 2.00ms ± 3% 2.25ms ± 2% +12.53% (p=0.004 n=6+5) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=1-24 388ms ± 3% 16ms ± 5% -95.76% (p=0.002 n=6+6) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=16-24 387ms ± 1% 27ms ± 3% -93.14% (p=0.002 n=6+6) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=32-24 393ms ± 5% 35ms ± 4% -91.09% (p=0.002 n=6+6) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=512-24 463ms ± 4% 276ms ± 3% -40.43% (p=0.004 n=5+6) MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=1015-24 539ms ± 5% 514ms ± 3% -4.64% (p=0.016 n=5+5) 
MVCCGarbageCollect/rocksdb/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=1023-24 533ms ± 4% 514ms ± 1% ~ (p=0.093 n=6+6) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1/numVersions=2/deleteVersions=1-24 1.97µs ± 3% 2.29µs ± 2% +16.58% (p=0.002 n=6+6) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=1-24 139µs ± 1% 5µs ± 6% -96.40% (p=0.004 n=5+6) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=16-24 140µs ± 1% 8µs ± 1% -94.13% (p=0.004 n=6+5) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=32-24 143µs ± 4% 11µs ± 2% -92.03% (p=0.002 n=6+6) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=512-24 178µs ± 9% 109µs ± 1% -38.75% (p=0.004 n=6+5) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=1015-24 201µs ± 1% 213µs ± 1% +5.80% (p=0.008 n=5+5) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1/numVersions=1024/deleteVersions=1023-24 205µs ±11% 215µs ± 6% ~ (p=0.126 n=5+6) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1024/numVersions=2/deleteVersions=1-24 1.43ms ± 1% 1.34ms ± 1% -5.82% (p=0.004 n=6+5) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=1-24 218ms ± 9% 9ms ± 2% -96.00% (p=0.002 n=6+6) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=16-24 216ms ± 3% 15ms ± 2% -93.19% (p=0.004 n=5+6) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=32-24 219ms ± 4% 20ms ± 5% -90.77% (p=0.004 n=5+6) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=512-24 303ms ± 4% 199ms ± 4% -34.47% (p=0.004 n=5+6) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=1015-24 382ms ±16% 363ms ± 8% ~ 
(p=0.485 n=6+6) MVCCGarbageCollect/pebble/keySize=128/valSize=128/numKeys=1024/numVersions=1024/deleteVersions=1023-24 363ms ± 4% 354ms ± 4% ~ (p=0.222 n=5+5) ``` Release note (performance improvement): Improved the efficiency of garbage collection when there are a large number of versions of a single key, commonly found when utilizing sequences. --- pkg/storage/mvcc.go | 126 +++++++++++++++++++++++++++++++++----------- 1 file changed, 94 insertions(+), 32 deletions(-) diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index e328cdd0f9c7..6d555f860a7f 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -17,6 +17,7 @@ import ( "math" "os" "path/filepath" + "sort" "sync" "time" @@ -3151,6 +3152,8 @@ func MVCCResolveWriteIntentRangeUsingIter( // keys slice. The iterator is seeked in turn to each listed // key, clearing all values with timestamps <= to expiration. The // timestamp parameter is used to compute the intent age on GC. +// +// Note that this method will be sorting the keys. func MVCCGarbageCollect( ctx context.Context, rw ReadWriter, @@ -3158,10 +3161,6 @@ func MVCCGarbageCollect( keys []roachpb.GCRequest_GCKey, timestamp hlc.Timestamp, ) error { - // We're allowed to use a prefix iterator because we always Seek() the - // iterator when handling a new user key. - iter := rw.NewIterator(IterOptions{Prefix: true}) - defer iter.Close() var count int64 defer func(begin time.Time) { @@ -3169,6 +3168,27 @@ func MVCCGarbageCollect( len(keys), float64(len(keys))*1e9/float64(timeutil.Since(begin)), count) }(timeutil.Now()) + // If there are no keys then there is no work. + if len(keys) == 0 { + return nil + } + + // Sort the slice to both determine the bounds and ensure that we're seeking + // in increasing order. 
+ sort.Slice(keys, func(i, j int) bool { + iKey := MVCCKey{Key: keys[i].Key, Timestamp: keys[i].Timestamp} + jKey := MVCCKey{Key: keys[j].Key, Timestamp: keys[j].Timestamp} + return iKey.Less(jKey) + }) + + // Bound the iterator appropriately for the set of keys we'll be garbage + // collecting. + iter := rw.NewIterator(IterOptions{ + LowerBound: keys[0].Key, + UpperBound: keys[len(keys)-1].Key.Next(), + }) + defer iter.Close() + // Iterate through specified GC keys. meta := &enginepb.MVCCMetadata{} for _, gcKey := range keys { @@ -3222,19 +3242,63 @@ func MVCCGarbageCollect( iter.Next() } - // TODO(tschottdorf): Can't we just Seek() to a key with timestamp - // gcKey.Timestamp to avoid potentially cycling through a large prefix - // of versions we can't GC? The batching mechanism in the GC queue sends - // requests susceptible to that happening when there are lots of versions. - // A minor complication there will be that we need to know the first non- - // deletable value's timestamp (for prevNanos). - - // Now, iterate through all values, GC'ing ones which have expired. // For GCBytesAge, this requires keeping track of the previous key's // timestamp (prevNanos). See ComputeStatsGo for a more easily digested // and better commented version of this logic. - prevNanos := timestamp.WallTime + { + // If there are a large number of versions which are not garbage, + // iterating through all of them is very inefficient. However, if there + // are few, SeekLT is inefficient. Try to step the iterator a few times + // to find the predecessor of gcKey before resorting to seeking. + // + // In a synthetic benchmark where there is one version of garbage and one + // not, this optimization showed a 50% improvement. More importantly, + // this optimization mitigated the overhead of the Seek approach when + // almost all of the versions are garbage. 
+ var foundPrevNanos bool + { + const nextsBeforeSeek = 4 + for i := 0; i < nextsBeforeSeek; i++ { + if ok, err := iter.Valid(); err != nil { + return err + } else if !ok { + break + } + unsafeIterKey := iter.UnsafeKey() + if !unsafeIterKey.Key.Equal(encKey.Key) { + break + } + if unsafeIterKey.Timestamp.LessEq(gcKey.Timestamp) { + foundPrevNanos = true + break + } + prevNanos = unsafeIterKey.Timestamp.WallTime + iter.Next() + } + } + + // Stepping with the iterator did not get us to our target garbage key or + // its predecessor. Seek to the predecessor to find the right value for + // prevNanos and position the iterator on the gcKey. + if !foundPrevNanos { + gcKeyMVCC := MVCCKey{Key: gcKey.Key, Timestamp: gcKey.Timestamp} + iter.SeekLT(gcKeyMVCC) + if ok, err := iter.Valid(); err != nil { + return err + } else if ok { + // Use the previous version's timestamp if it's for this key. + if iter.UnsafeKey().Key.Equal(gcKey.Key) { + prevNanos = iter.UnsafeKey().Timestamp.WallTime + } + // Seek to the first version for deletion. + iter.Next() + } + } + } + + // Iterate through the garbage versions, accumulating their stats and + // issuing clear operations. for ; ; iter.Next() { if ok, err := iter.Valid(); err != nil { return err @@ -3248,26 +3312,24 @@ func MVCCGarbageCollect( if !unsafeIterKey.IsValue() { break } - if unsafeIterKey.Timestamp.LessEq(gcKey.Timestamp) { - if ms != nil { - // FIXME: use prevNanos instead of unsafeIterKey.Timestamp, except - // when it's a deletion. - valSize := int64(len(iter.UnsafeValue())) - - // A non-deletion becomes non-live when its newer neighbor shows up. - // A deletion tombstone becomes non-live right when it is created. 
- fromNS := prevNanos - if valSize == 0 { - fromNS = unsafeIterKey.Timestamp.WallTime - } - - ms.Add(updateStatsOnGC(gcKey.Key, MVCCVersionTimestampSize, - valSize, nil, fromNS)) - } - count++ - if err := rw.Clear(unsafeIterKey); err != nil { - return err + if ms != nil { + // FIXME: use prevNanos instead of unsafeIterKey.Timestamp, except + // when it's a deletion. + valSize := int64(len(iter.UnsafeValue())) + + // A non-deletion becomes non-live when its newer neighbor shows up. + // A deletion tombstone becomes non-live right when it is created. + fromNS := prevNanos + if valSize == 0 { + fromNS = unsafeIterKey.Timestamp.WallTime } + + ms.Add(updateStatsOnGC(gcKey.Key, MVCCVersionTimestampSize, + valSize, nil, fromNS)) + } + count++ + if err := rw.Clear(unsafeIterKey); err != nil { + return err } prevNanos = unsafeIterKey.Timestamp.WallTime }