From 3472b33347d57f115dfa5a4f9deb05ae06081605 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 21 Dec 2017 12:44:21 -0500 Subject: [PATCH] engine: fix divergent MVCCStats updates Found while investigating #20554 (though this does not at all fix the issue discovered there; that will be a separate PR). We were incorrectly updating GCBytesAge when moving an intent. The old code was pretty broken (and, with hindsight, it is still after this commit, as is exposed by the various child commits). Its main problem was that it failed to account for the `GCBytesAge` difference that would result from moving the intent due to the incorrect assumption that the size of the intent would be the same. The code was also somewhat intransparent and an attempt has been made to improve its legibility. This change raises the question of what to do about the divergent stats that exist in real-world clusters. As part of addressing #20554, we'll need a mechanism to correct the stats anyway, and so I will defer its introduction. You'll want to view this diff with `?w=1` (insensitive to whitespace changes). Release note: None. --- pkg/storage/engine/mvcc.go | 38 ++- pkg/storage/engine/mvcc_stats_test.go | 330 +++++++++++++++----------- pkg/storage/engine/mvcc_test.go | 2 +- 3 files changed, 217 insertions(+), 153 deletions(-) diff --git a/pkg/storage/engine/mvcc.go b/pkg/storage/engine/mvcc.go index 13357cdc79ef..251bfb8234ab 100644 --- a/pkg/storage/engine/mvcc.go +++ b/pkg/storage/engine/mvcc.go @@ -302,32 +302,48 @@ func updateStatsOnResolve( commit bool, ) enginepb.MVCCStats { var ms enginepb.MVCCStats + + // NB: this is logging for ongoing work on #20554. + if false { + defer func() { + log.Infof(context.TODO(), "onResolve\n"+ + "orig: ts=%d metaKeySize=%d metaValSize=%d KeyBytes=%d ValBytes=%d\n"+ + "meta: ts=%d metaKeySize=%d metaValSize=%d KeyBytes=%d ValBytes=%d\n"+ + "%+v", + orig.Timestamp.WallTime, origMetaKeySize, origMetaValSize, orig.KeyBytes, orig.ValBytes, + meta.Timestamp.WallTime, metaKeySize, metaValSize, meta.KeyBytes, meta.ValBytes, + &ms) + }() + } + // In this case, we're only removing the contribution from having the // meta key around from orig.Timestamp to meta.Timestamp. ms.AgeTo(orig.Timestamp.WallTime) sys := isSysLocal(key) - // Always zero. - keyDiff := metaKeySize - origMetaKeySize - // This is going to be nonpositive: the old meta key was - // real, the new one is implicit. - valDiff := metaValSize - origMetaValSize - if sys { - ms.SysBytes += keyDiff + valDiff + ms.SysBytes += metaKeySize + metaValSize - origMetaValSize - origMetaKeySize + ms.AgeTo(meta.Timestamp.WallTime) } else { if !meta.Deleted { - ms.LiveBytes += keyDiff + valDiff + ms.LiveBytes += metaKeySize + metaValSize - origMetaValSize - origMetaKeySize } - ms.KeyBytes += keyDiff - ms.ValBytes += valDiff + // At orig.Timestamp, the original meta key disappears. + ms.KeyBytes -= origMetaKeySize + orig.KeyBytes + ms.ValBytes -= origMetaValSize + orig.ValBytes + // If committing, subtract out intent counts. if commit { ms.IntentBytes -= (meta.KeyBytes + meta.ValBytes) ms.IntentCount-- } + + ms.AgeTo(meta.Timestamp.WallTime) + + // At meta.Timestamp, the new meta key appears. + ms.KeyBytes += metaKeySize + meta.KeyBytes + ms.ValBytes += metaValSize + meta.ValBytes } - ms.AgeTo(meta.Timestamp.WallTime) return ms } diff --git a/pkg/storage/engine/mvcc_stats_test.go b/pkg/storage/engine/mvcc_stats_test.go index 60be9b95f169..4cc843cb0263 100644 --- a/pkg/storage/engine/mvcc_stats_test.go +++ b/pkg/storage/engine/mvcc_stats_test.go @@ -34,51 +34,117 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/uuid" ) -func verifyStats(debug string, ms, expMS *enginepb.MVCCStats, t *testing.T) { +// assertEq compares the given ms and expMS and errors when they don't match. It +// also recomputes the stats over the whole engine with all known +// implementations and errors on mismatch with any of them. +func assertEq(t *testing.T, engine Engine, debug string, ms, expMS *enginepb.MVCCStats) { t.Helper() + if !ms.Equal(expMS) { pretty.Ldiff(t, ms, expMS) t.Errorf("%s: mismatch detected", debug) } + + it := engine.NewIterator(false) + defer it.Close() + from, to := MVCCKey{}, MVCCKey{Key: roachpb.KeyMax} + + for _, mvccStatsTest := range mvccStatsTests { + compMS, err := mvccStatsTest.fn(it, from, to, ms.LastUpdateNanos) + if err != nil { + t.Fatal(err) + } + if !compMS.Equal(*ms) { + t.Errorf("%s: diff(ms, %s) = %s", debug, mvccStatsTest.name, pretty.Diff(*ms, compMS)) + } + } + + if t.Failed() { + t.FailNow() + } } -// TestMVCCStatsBasic writes a value, then deletes it as an intent via -// a transaction, then resolves the intent, manually verifying the -// mvcc stats at each step. -func TestMVCCStatsBasic(t *testing.T) { +// TestMVCCStatsResolveMovesTimestamp exercises the case in which an intent has its timestamp changed +// and commits. +func TestMVCCStatsResolveMovesTimestamp(t *testing.T) { defer leaktest.AfterTest(t)() engine := createTestEngine() defer engine.Close() - ms := &enginepb.MVCCStats{} + ctx := context.Background() + aggMS := &enginepb.MVCCStats{} - assertEq := func(debug string, ms, expMS *enginepb.MVCCStats) { - t.Helper() + assertEq(t, engine, "initially", aggMS, &enginepb.MVCCStats{}) - verifyStats(debug, ms, expMS, t) + key := roachpb.Key("a") + ts1 := hlc.Timestamp{WallTime: 1E9} + // Put a value. + value := roachpb.MakeValueFromString("value") + if err := MVCCPut(context.Background(), engine, aggMS, key, ts1, value, nil); err != nil { + t.Fatal(err) + } - it := engine.NewIterator(false) - defer it.Close() - from, to := MVCCKey{}, MVCCKey{Key: roachpb.KeyMax} + mKeySize := int64(mvccKey(key).EncodedSize()) // 2 + vKeySize := mvccVersionTimestampSize // 12 + vValSize := int64(len(value.RawBytes)) // 10 - for _, mvccStatsTest := range mvccStatsTests { - compMS, err := mvccStatsTest.fn(it, from, to, ms.LastUpdateNanos) - if err != nil { - t.Fatal(err) - } - if !compMS.Equal(*ms) { - // TODO(tschottdorf): This unfortunately fails. Upgrade this to - // t.Errorf when the computation has been fixed. - t.Logf("%s: diff(ms, %s) = %s", debug, mvccStatsTest.name, pretty.Diff(*ms, compMS)) - } - } + log.Infof(ctx, "mKeySize=%d vKeySize=%d vValSize=%d", mKeySize, vKeySize, vValSize) - if t.Failed() { - t.FailNow() - } + expMS := enginepb.MVCCStats{ + LiveBytes: mKeySize + vKeySize + vValSize, // 24 + LiveCount: 1, + KeyBytes: mKeySize + vKeySize, // 14 + KeyCount: 1, + ValBytes: vValSize, // 10 + ValCount: 1, + LastUpdateNanos: 1E9, } + assertEq(t, engine, "after put", aggMS, &expMS) + + // Delete the value at ts=3. We'll commit this at ts=4 later. + ts3 := hlc.Timestamp{WallTime: 3 * 1E9} + txn := &roachpb.Transaction{TxnMeta: enginepb.TxnMeta{ID: uuid.MakeV4(), Timestamp: ts3}} + txn.Timestamp.Forward(ts3) + if err := MVCCDelete(context.Background(), engine, aggMS, key, ts3, txn); err != nil { + t.Fatal(err) + } + + // Now commit the value, but with a timestamp gap (i.e. this is a + // push-commit as it would happen for a SNAPSHOT txn) + ts4 := hlc.Timestamp{WallTime: 4 * 1E9} + txn.Status = roachpb.COMMITTED + txn.Timestamp.Forward(ts4) + if err := MVCCResolveWriteIntent(context.Background(), engine, aggMS, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { + t.Fatal(err) + } + + expAggMS := enginepb.MVCCStats{ + LastUpdateNanos: 4E9, + LiveBytes: 0, + LiveCount: 0, + KeyCount: 1, + ValCount: 2, + // The implicit meta record (deletion tombstone) counts for len("a")+1=2. + // Two versioned keys count for 2*vKeySize. + KeyBytes: mKeySize + 2*vKeySize, + ValBytes: vValSize, // the initial write (10)... + GCBytesAge: (vValSize + vKeySize) * 3, // ... along with the value (12) aged over 3s, a total of 66 + } + + assertEq(t, engine, "after committing", aggMS, &expAggMS) +} - assertEq("initially", ms, &enginepb.MVCCStats{}) +// TestMVCCStatsBasic writes a value, then deletes it as an intent via +// a transaction, then resolves the intent, manually verifying the +// mvcc stats at each step. +func TestMVCCStatsBasic(t *testing.T) { + defer leaktest.AfterTest(t)() + engine := createTestEngine() + defer engine.Close() + + ms := &enginepb.MVCCStats{} + + assertEq(t, engine, "initially", ms, &enginepb.MVCCStats{}) // Verify size of mvccVersionTimestampSize. ts := hlc.Timestamp{WallTime: 1 * 1E9} @@ -106,7 +172,7 @@ func TestMVCCStatsBasic(t *testing.T) { ValCount: 1, LastUpdateNanos: 1E9, } - assertEq("after put", ms, &expMS) + assertEq(t, engine, "after put", ms, &expMS) if e, a := int64(0), ms.GCBytes(); e != a { t.Fatalf("GCBytes: expected %d, got %d", e, a) } @@ -138,7 +204,7 @@ func TestMVCCStatsBasic(t *testing.T) { GCBytesAge: vValSize + vKeySize, // immediately recognizes GC'able bytes from old value at 1E9 LastUpdateNanos: 2E9, } - assertEq("after delete", ms, &expMS2) + assertEq(t, engine, "after delete", ms, &expMS2) // This is expMS2.KeyBytes + expMS2.ValBytes - expMS2.LiveBytes expGC2 := mKeySize + vKeySize + v2KeySize + m2ValSize + vValSize + v2ValSize if a := ms.GCBytes(); expGC2 != a { @@ -153,7 +219,7 @@ func TestMVCCStatsBasic(t *testing.T) { } // Stats should equal same as before the deletion after aborting the intent. expMS.LastUpdateNanos = 2E9 - assertEq("after abort", ms, &expMS) + assertEq(t, engine, "after abort", ms, &expMS) // Re-delete, but this time, we're going to commit it. txn.Status = roachpb.PENDING @@ -165,7 +231,7 @@ func TestMVCCStatsBasic(t *testing.T) { // GCBytesAge will now count the deleted value from ts=1E9 to ts=3E9. expMS2.GCBytesAge = (vValSize + vKeySize) * 2 expMS2.LastUpdateNanos = 3E9 - assertEq("after 2nd delete", ms, &expMS2) // should be same as before. + assertEq(t, engine, "after 2nd delete", ms, &expMS2) // should be same as before. if a := ms.GCBytes(); expGC2 != a { t.Fatalf("GCBytes: expected %d, got %d", expGC2, a) } @@ -204,7 +270,7 @@ func TestMVCCStatsBasic(t *testing.T) { } expGC3 := expGC2 // no change, didn't delete anything - assertEq("after 2nd put", ms, &expMS3) + assertEq(t, engine, "after 2nd put", ms, &expMS3) if a := ms.GCBytes(); expGC3 != a { t.Fatalf("GCBytes: expected %d, got %d", expGC3, a) } @@ -224,21 +290,14 @@ func TestMVCCStatsBasic(t *testing.T) { IntentBytes: vKey2Size + vVal2Size, IntentCount: 1, // The commit turned the explicit deletion intent meta back into an - // implicit one; so we see the originally written value which is now 3s - // old, plus the implicit meta key contribution (basically the key - // prefix) along with the deletion tombstone kv pair. - // You could also write this as - // (vValSize+vKeySize)*2 + (expGC3 - m2ValSize) - // as we do in the second commit. - // - // TODO(tschottdorf): the computation here is wrong; the result should be: - // GCBytesAge: (vValSize + vKeySize) * 3, - // The new deletion tombstone entry is at t=4 and so it contributes nothing to - // GCBytesAge just yet. - GCBytesAge: (vValSize+vKeySize)*3 + (mKeySize + v2ValSize + v2KeySize), + // implicit one, so we see the originally written value which is now 3s + // old. There is no contribution from the deletion tombstone yet as it + // moved from 3s (inside the meta) to 4s (implicit meta), the current + // time. + GCBytesAge: (vValSize + vKeySize) * 3, LastUpdateNanos: 4E9, } - assertEq("after first commit", ms, &expMS4) + assertEq(t, engine, "after first commit", ms, &expMS4) // With commit of the deletion intent, what really happens is that the // explicit meta (carrying the intent) becomes implicit (so its key @@ -259,10 +318,10 @@ func TestMVCCStatsBasic(t *testing.T) { LiveBytes: mKey2Size + vKey2Size + vVal2Size, LiveCount: 1, IntentAge: 0, - GCBytesAge: (vValSize+vKeySize)*2 + (expGC3 - m2ValSize), + GCBytesAge: (vValSize + vKeySize) * 3, // unchanged; still at 4s LastUpdateNanos: 4E9, } - assertEq("after second commit", ms, &expMS4) + assertEq(t, engine, "after second commit", ms, &expMS4) if a := ms.GCBytes(); expGC4 != a { // no change here t.Fatalf("GCBytes: expected %d, got %d", expGC4, a) } @@ -279,7 +338,7 @@ func TestMVCCStatsBasic(t *testing.T) { // The age increases: 6 seconds for each key2 and key. expMS5.GCBytesAge += (vKey2Size+vVal2Size)*6 + expGC4*6 expMS5.LastUpdateNanos = 10E9 - assertEq("after overwrite", ms, &expMS5) + assertEq(t, engine, "after overwrite", ms, &expMS5) // Write a transaction record which is a system-local key. txnKey := keys.TransactionKey(txn.Key, txn.ID) @@ -292,7 +351,7 @@ func TestMVCCStatsBasic(t *testing.T) { expMS6 := expMS5 expMS6.SysBytes += txnKeySize + txnValSize expMS6.SysCount++ - assertEq("after sys-local key", ms, &expMS6) + assertEq(t, engine, "after sys-local key", ms, &expMS6) } var mvccStatsTests = []struct { @@ -322,112 +381,101 @@ func TestMVCCStatsWithRandomRuns(t *testing.T) { seed := randutil.NewPseudoSeed() log.Infof(context.Background(), "using pseudo random number generator with seed %d", seed) - for _, mvccStatsTest := range mvccStatsTests { - t.Run(mvccStatsTest.name, func(t *testing.T) { - rng := rand.New(rand.NewSource(seed)) + rng := rand.New(rand.NewSource(seed)) - engine := createTestEngine() - defer engine.Close() - - ms := &enginepb.MVCCStats{} + engine := createTestEngine() + defer engine.Close() - // Now, generate a random sequence of puts, deletes and resolves. - // Each put and delete may or may not involve a txn. Resolves may - // either commit or abort. - keys := map[int32][]byte{} - var lastWT int64 - for i := int32(0); i < int32(1000); i++ { - // Create random future timestamp, up to a few seconds ahead. - ts := hlc.Timestamp{WallTime: lastWT + int64(rng.Float32()*4E9), Logical: int32(rng.Int())} - lastWT = ts.WallTime + ms := &enginepb.MVCCStats{} - if log.V(1) { - log.Infof(context.Background(), "*** cycle %d @ %s", i, ts) - } - // Manually advance aggregate intent age based on one extra second of simulation. - // Same for aggregate gc'able bytes age. - key := []byte(fmt.Sprintf("%s-%d", randutil.RandBytes(rng, int(rng.Int31n(32))), i)) - keys[i] = key - - var txn *roachpb.Transaction - if rng.Int31n(2) == 0 { // create a txn with 50% prob - txn = &roachpb.Transaction{TxnMeta: enginepb.TxnMeta{ID: uuid.MakeV4(), Timestamp: ts}} - } - // With 25% probability, put a new value; otherwise, delete an earlier - // key. Because an earlier step in this process may have itself been - // a delete, we could end up deleting a non-existent key, which is good; - // we don't mind testing that case as well. - isDelete := rng.Int31n(4) == 0 - if i > 0 && isDelete { - idx := rng.Int31n(i) - if log.V(1) { - log.Infof(context.Background(), "*** DELETE index %d", idx) - } - if err := MVCCDelete(context.Background(), engine, ms, keys[idx], ts, txn); err != nil { - // Abort any write intent on an earlier, unresolved txn. - if wiErr, ok := err.(*roachpb.WriteIntentError); ok { - wiErr.Intents[0].Status = roachpb.ABORTED - if log.V(1) { - log.Infof(context.Background(), "*** ABORT index %d", idx) - } - // Note that this already incorporates committing an intent - // at a later time (since we use a potentially later ts here - // for the resolution). - if err := MVCCResolveWriteIntent(context.Background(), engine, ms, wiErr.Intents[0]); err != nil { - t.Fatal(err) - } - // Now, re-delete. - if log.V(1) { - log.Infof(context.Background(), "*** RE-DELETE index %d", idx) - } - if err := MVCCDelete(context.Background(), engine, ms, keys[idx], ts, txn); err != nil { - t.Fatal(err) - } - } else { - t.Fatal(err) - } - } - } else { - rngVal := roachpb.MakeValueFromBytes(randutil.RandBytes(rng, int(rng.Int31n(128)))) + // Now, generate a random sequence of puts, deletes and resolves. + // Each put and delete may or may not involve a txn. Resolves may + // either commit or abort. + keys := map[int32][]byte{} + var lastWT int64 + for i := int32(0); i < int32(1000); i++ { + // Create random future timestamp, up to a few seconds ahead. + ts := hlc.Timestamp{WallTime: lastWT + int64(rng.Float32()*4E9), Logical: int32(rng.Int())} + lastWT = ts.WallTime + + if log.V(1) { + log.Infof(context.Background(), "*** cycle %d @ %s", i, ts) + } + // Manually advance aggregate intent age based on one extra second of simulation. + // Same for aggregate gc'able bytes age. + key := []byte(fmt.Sprintf("%s-%d", randutil.RandBytes(rng, int(rng.Int31n(32))), i)) + keys[i] = key + + var txn *roachpb.Transaction + if rng.Int31n(2) == 0 { // create a txn with 50% prob + txn = &roachpb.Transaction{TxnMeta: enginepb.TxnMeta{ID: uuid.MakeV4(), Timestamp: ts}} + } + // With 25% probability, put a new value; otherwise, delete an earlier + // key. Because an earlier step in this process may have itself been + // a delete, we could end up deleting a non-existent key, which is good; + // we don't mind testing that case as well. + isDelete := rng.Int31n(4) == 0 + if i > 0 && isDelete { + idx := rng.Int31n(i) + if log.V(1) { + log.Infof(context.Background(), "*** DELETE index %d", idx) + } + if err := MVCCDelete(context.Background(), engine, ms, keys[idx], ts, txn); err != nil { + // Abort any write intent on an earlier, unresolved txn. + if wiErr, ok := err.(*roachpb.WriteIntentError); ok { + wiErr.Intents[0].Status = roachpb.ABORTED if log.V(1) { - log.Infof(context.Background(), "*** PUT index %d; TXN=%t", i, txn != nil) + log.Infof(context.Background(), "*** ABORT index %d", idx) } - if err := MVCCPut(context.Background(), engine, ms, key, ts, rngVal, txn); err != nil { + // Note that this already incorporates committing an intent + // at a later time (since we use a potentially later ts here + // for the resolution). + if err := MVCCResolveWriteIntent(context.Background(), engine, ms, wiErr.Intents[0]); err != nil { t.Fatal(err) } - } - if !isDelete && txn != nil && rng.Int31n(2) == 0 { // resolve txn with 50% prob - // TODO(tschottdorf): need to simulate resolving at a pushed timestamp. - txn.Status = roachpb.COMMITTED - if rng.Int31n(10) == 0 { // abort txn with 10% prob - txn.Status = roachpb.ABORTED - } + // Now, re-delete. if log.V(1) { - log.Infof(context.Background(), "*** RESOLVE index %d; COMMIT=%t", i, txn.Status == roachpb.COMMITTED) + log.Infof(context.Background(), "*** RE-DELETE index %d", idx) } - if err := MVCCResolveWriteIntent(context.Background(), engine, ms, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { + if err := MVCCDelete(context.Background(), engine, ms, keys[idx], ts, txn); err != nil { t.Fatal(err) } + } else { + t.Fatal(err) } + } + } else { + rngVal := roachpb.MakeValueFromBytes(randutil.RandBytes(rng, int(rng.Int31n(128)))) + if log.V(1) { + log.Infof(context.Background(), "*** PUT index %d; TXN=%t", i, txn != nil) + } + if err := MVCCPut(context.Background(), engine, ms, key, ts, rngVal, txn); err != nil { + t.Fatal(err) + } + } + if !isDelete && txn != nil && rng.Int31n(2) == 0 { // resolve txn with 50% prob + // TODO(tschottdorf): need to simulate resolving at a pushed timestamp. + txn.Status = roachpb.COMMITTED + if rng.Int31n(10) == 0 { // abort txn with 10% prob + txn.Status = roachpb.ABORTED + } + if log.V(1) { + log.Infof(context.Background(), "*** RESOLVE index %d; COMMIT=%t", i, txn.Status == roachpb.COMMITTED) + } + if err := MVCCResolveWriteIntent(context.Background(), engine, ms, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { + t.Fatal(err) + } + } - ms.AgeTo(ts.WallTime) // a noop may not have updated the stats - // Every 10th step, verify the stats via manual engine scan. - if i%10 == 0 { - // Compute the stats manually. - iter := engine.NewIterator(false) - expMS, err := mvccStatsTest.fn(iter, mvccKey(roachpb.KeyMin), - mvccKey(roachpb.KeyMax), ts.WallTime) - iter.Close() - if err != nil { - t.Fatal(err) - } - verifyStats(fmt.Sprintf("cycle %d", i), ms, &expMS, t) - if t.Failed() { - t.Fatal("giving up") - } - } + ms.AgeTo(ts.WallTime) // a noop may not have updated the stats + // Every 10th step, verify the stats via manual engine scan. + if i%10 == 0 { + // Recompute the stats and compare. + assertEq(t, engine, fmt.Sprintf("cycle %d", i), ms, ms) + if t.Failed() { + t.Fatal("giving up") } - }) + } } } diff --git a/pkg/storage/engine/mvcc_test.go b/pkg/storage/engine/mvcc_test.go index dd9ee54a5a1b..1db31cbbba3a 100644 --- a/pkg/storage/engine/mvcc_test.go +++ b/pkg/storage/engine/mvcc_test.go @@ -3514,7 +3514,7 @@ func TestMVCCGarbageCollect(t *testing.T) { if err != nil { t.Fatal(err) } - verifyStats("verification", ms, &expMS, t) + assertEq(t, engine, "verification", ms, &expMS) }) } }