diff --git a/c-deps/libroach/db.cc b/c-deps/libroach/db.cc index 08958ca4ce4f..f67275cdf768 100644 --- a/c-deps/libroach/db.cc +++ b/c-deps/libroach/db.cc @@ -2356,6 +2356,11 @@ MVCCStatsResult MVCCComputeStatsInternal(::rocksdb::Iterator* const iter_rep, DB cockroach::storage::engine::enginepb::MVCCMetadata meta; std::string prev_key; bool first = false; + // NB: making this uninitialized triggers compiler warnings + // with `-Werror=maybe-uninitialized`. This warning seems like + // a false positive (changing the above line to `first=false` + // which results in equivalent code does not remove it either). + int64_t accrue_gc_age_nanos = 0; for (; iter_rep->Valid() && kComparator.Compare(iter_rep->key(), end_key) < 0; iter_rep->Next()) { const rocksdb::Slice key = iter_rep->key(); @@ -2366,7 +2371,7 @@ MVCCStatsResult MVCCComputeStatsInternal(::rocksdb::Iterator* const iter_rep, DB int32_t logical = 0; if (!DecodeKey(key, &decoded_key, &wall_time, &logical)) { stats.status = FmtStatus("unable to decode key"); - break; + return stats; } const bool isSys = (rocksdb::Slice(decoded_key).compare(kLocalMax) < 0); @@ -2391,7 +2396,7 @@ MVCCStatsResult MVCCComputeStatsInternal(::rocksdb::Iterator* const iter_rep, DB if (!implicitMeta && !meta.ParseFromArray(value.data(), value.size())) { stats.status = FmtStatus("unable to decode MVCCMetadata"); - break; + return stats; } if (isSys) { @@ -2435,15 +2440,22 @@ MVCCStatsResult MVCCComputeStatsInternal(::rocksdb::Iterator* const iter_rep, DB if (meta.key_bytes() != kMVCCVersionTimestampSize) { stats.status = FmtStatus("expected mvcc metadata key bytes to equal %d; got %d", kMVCCVersionTimestampSize, int(meta.key_bytes())); - break; + return stats; } if (meta.val_bytes() != value.size()) { stats.status = FmtStatus("expected mvcc metadata val bytes to equal %d; got %d", int(value.size()), int(meta.val_bytes())); - break; + return stats; } + accrue_gc_age_nanos = meta.timestamp().wall_time(); } else { - stats.gc_bytes_age += total_bytes * age_factor(wall_time, now_nanos); + bool is_tombstone = value.size() == 0; + if (is_tombstone) { + stats.gc_bytes_age += total_bytes * age_factor(wall_time, now_nanos); + } else { + stats.gc_bytes_age += total_bytes * age_factor(accrue_gc_age_nanos, now_nanos); + } + accrue_gc_age_nanos = wall_time; } stats.key_bytes += kMVCCVersionTimestampSize; stats.val_bytes += value.size(); diff --git a/pkg/storage/engine/mvcc.go b/pkg/storage/engine/mvcc.go index ff3c43fd9b93..9c05af4e6c3a 100644 --- a/pkg/storage/engine/mvcc.go +++ b/pkg/storage/engine/mvcc.go @@ -224,16 +224,19 @@ func updateStatsOnMerge(key roachpb.Key, valSize, nowNanos int64) enginepb.MVCCS // If this value is an intent, updates the intent counters. func updateStatsOnPut( key roachpb.Key, + prevValSize int64, origMetaKeySize, origMetaValSize, metaKeySize, metaValSize int64, orig, meta *enginepb.MVCCMetadata, ) enginepb.MVCCStats { var ms enginepb.MVCCStats - sys := isSysLocal(key) - // Remove current live counts for this key. - if orig != nil { - if sys { + if isSysLocal(key) { + // Handling system-local keys is straightforward because + // we don't track ageable quantities for them (we + // could, but don't). Remove the contributions from the + // original, if any, and add in the new contributions. + if orig != nil { ms.SysBytes -= origMetaKeySize + origMetaValSize if orig.Txn != nil { // If the original value was an intent, we're replacing the @@ -242,67 +245,137 @@ func updateStatsOnPut( ms.SysBytes -= orig.KeyBytes + orig.ValBytes } ms.SysCount-- - } else { - // Move the (so far empty) stats to the timestamp at which the - // previous entry was created, which is where we wish to reclassify - // its contributions. - ms.AgeTo(orig.Timestamp.WallTime) - // If original version value for this key wasn't deleted, subtract - // its contribution from live bytes in anticipation of adding in - // contribution from new version below. - if !orig.Deleted { - ms.LiveBytes -= orig.KeyBytes + orig.ValBytes + origMetaKeySize + origMetaValSize - ms.LiveCount-- - // Also, add the bytes from overwritten value to the GC'able bytes age stat. - } + } + ms.SysBytes += meta.KeyBytes + meta.ValBytes + metaKeySize + metaValSize + ms.SysCount++ + return ms + } + + // Handle non-sys keys. This follows the same scheme: if there was a previous + // value, perhaps even an intent, subtract its contributions, and then add the + // new contributions. The complexity here is that we need to properly update + // GCBytesAge and IntentAge, which don't follow the same semantics. The difference + // between them is that an intent accrues IntentAge from its own timestamp on, + // while GCBytesAge is accrued by versions according to the following rules: + // 1. a (non-tombstone) value that is shadowed by a newer write accrues age at + // the point in time at which it is shadowed (i.e. the newer write's timestamp). + // 2. a tombstone value accrues age at its own timestamp (note that this means + // the tombstone's own contribution only -- the actual write that was deleted + // is then shadowed by this tombstone, and will thus also accrue age from + // the tombstone's value on, as per 1). + // + // This seems relatively straightforward, but only because it omits pesky + // details, which have been relegated to the comments below. + + // Remove current live counts for this key. + if orig != nil { + ms.KeyCount-- + + // Move the (so far empty) stats to the timestamp at which the + // previous entry was created, which is where we wish to reclassify + // its contributions. + ms.AgeTo(orig.Timestamp.WallTime) + + // If the original metadata for this key was an intent, subtract + // its contribution from stat counters as it's being replaced. + if orig.Txn != nil { + // Subtract counts attributable to intent we're replacing. + ms.ValCount-- + ms.IntentBytes -= (orig.KeyBytes + orig.ValBytes) + ms.IntentCount-- + } + // If the original intent is a deletion, we're removing the intent. This + // means removing its contribution at the *old* timestamp because it has + // accrued GCBytesAge that we need to offset (rule 2). + // + // Note that there is a corresponding block for the case of a non-deletion + // (rule 1) below, at meta.Timestamp. + if orig.Deleted { ms.KeyBytes -= origMetaKeySize ms.ValBytes -= origMetaValSize - ms.KeyCount-- - // If the original metadata for this key was an intent, subtract - // its contribution from stat counters as it's being replaced. + if orig.Txn != nil { - // Subtract counts attributable to intent we're replacing. ms.KeyBytes -= orig.KeyBytes ms.ValBytes -= orig.ValBytes - ms.ValCount-- - ms.IntentBytes -= (orig.KeyBytes + orig.ValBytes) - ms.IntentCount-- } } - } - // Move the stats to the new meta's timestamp. If we had an orig meta, this - // ages those original stats by the time which the previous version was - // live. - // - // Note that there is an interesting special case here: it's possible that - // meta.Timestamp.WallTime < orig.Timestamp.WallTime. This wouldn't happen - // outside of tests (due to our semantics of txn.OrigTimestamp, which never - // decreases) but it sure does happen in randomized testing. An earlier - // version of the code used `Forward` here, which is incorrect as it would be - // a no-op and fail to subtract out the intent bytes/GC age incurred due to - // removing the meta entry at `orig.Timestamp` (when `orig != nil`). - ms.AgeTo(meta.Timestamp.WallTime) + // Rule 1 implies that sometimes it's not only the old meta and the new meta + // that matter, but also the version below both of them. For example, take + // a version at t=1 and an intent over it at t=2 that is now being replaced + // (t=3). Then orig.Timestamp will be 2, and meta.Timestamp will be 3, but + // rule 1 tells us that for the interval [2,3) we have already accrued + // GCBytesAge for the version at t=1 that is now moot, because the intent + // at t=2 is moving to t=3; we have to emit a GCBytesAge offset to that effect. + // + // The code below achieves this by making the old version live again at + // orig.Timestamp, and then marking it as shadowed at meta.Timestamp below. + // This only happens when that version wasn't a tombstone, in which case it + // contributes from its own timestamp on anyway, and doesn't need adjustment. + // + // Note that when meta.Timestamp equals orig.Timestamp, the computation is + // moot, which is something our callers may exploit (since retrieving the + // previous version is not for free). + prevIsValue := prevValSize > 0 + + if prevIsValue { + // If the previous value (exists and) was not a deletion tombstone, make it + // live at orig.Timestamp. We don't have to do anything if there is a + // previous value that is a tombstone: according to rule two its age + // contributions are anchored to its own timestamp, so moving some values + // higher up doesn't affect the contributions tied to that key. + ms.LiveBytes += mvccVersionTimestampSize + prevValSize + } + + // Note that there is an interesting special case here: it's possible that + // meta.Timestamp.WallTime < orig.Timestamp.WallTime. This wouldn't happen + // outside of tests (due to our semantics of txn.OrigTimestamp, which never + // decreases) but it sure does happen in randomized testing. An earlier + // version of the code used `Forward` here, which is incorrect as it would be + // a no-op and fail to subtract out the intent bytes/GC age incurred due to + // removing the meta entry at `orig.Timestamp` (when `orig != nil`). + ms.AgeTo(meta.Timestamp.WallTime) - if sys { - ms.SysBytes += meta.KeyBytes + meta.ValBytes + metaKeySize + metaValSize - ms.SysCount++ - } else { - // If new version isn't a deletion tombstone, add it to live counters. - if !meta.Deleted { - ms.LiveBytes += meta.KeyBytes + meta.ValBytes + metaKeySize + metaValSize - ms.LiveCount++ + if prevIsValue { + // Make the previous non-deletion value non-live again, as explained in the + // sibling block above. + ms.LiveBytes -= mvccVersionTimestampSize + prevValSize } - ms.KeyBytes += meta.KeyBytes + metaKeySize - ms.ValBytes += meta.ValBytes + metaValSize - ms.KeyCount++ - ms.ValCount++ - if meta.Txn != nil { - ms.IntentBytes += meta.KeyBytes + meta.ValBytes - ms.IntentCount++ + + // If the original version wasn't a deletion, it becomes non-live at meta.Timestamp + // as this is where it is shadowed. + if !orig.Deleted { + ms.LiveBytes -= orig.KeyBytes + orig.ValBytes + ms.LiveBytes -= origMetaKeySize + origMetaValSize + ms.LiveCount-- + + ms.KeyBytes -= origMetaKeySize + ms.ValBytes -= origMetaValSize + + if orig.Txn != nil { + ms.KeyBytes -= orig.KeyBytes + ms.ValBytes -= orig.ValBytes + } } + } else { + ms.AgeTo(meta.Timestamp.WallTime) + } + + // If the new version isn't a deletion tombstone, add it to live counters. + if !meta.Deleted { + ms.LiveBytes += meta.KeyBytes + meta.ValBytes + metaKeySize + metaValSize + ms.LiveCount++ + } + ms.KeyBytes += meta.KeyBytes + metaKeySize + ms.ValBytes += meta.ValBytes + metaValSize + ms.KeyCount++ + ms.ValCount++ + if meta.Txn != nil { + ms.IntentBytes += meta.KeyBytes + meta.ValBytes + ms.IntentCount++ } + return ms } @@ -312,6 +385,7 @@ func updateStatsOnPut( // counters if commit=true. func updateStatsOnResolve( key roachpb.Key, + prevValSize int64, origMetaKeySize, origMetaValSize, metaKeySize, metaValSize int64, orig, meta enginepb.MVCCMetadata, @@ -319,68 +393,93 @@ func updateStatsOnResolve( ) enginepb.MVCCStats { var ms enginepb.MVCCStats - // NB: this is logging for ongoing work on #20554. - if false { - defer func() { - log.Infof(context.TODO(), "onResolve\n"+ - "orig: ts=%d metaKeySize=%d metaValSize=%d KeyBytes=%d ValBytes=%d\n"+ - "meta: ts=%d metaKeySize=%d metaValSize=%d KeyBytes=%d ValBytes=%d\n"+ - "%+v", - orig.Timestamp.WallTime, origMetaKeySize, origMetaValSize, orig.KeyBytes, orig.ValBytes, - meta.Timestamp.WallTime, metaKeySize, metaValSize, meta.KeyBytes, meta.ValBytes, - &ms) - }() - } - - // In this case, we're only removing the contribution from having the - // meta key around from orig.Timestamp to meta.Timestamp. - ms.AgeTo(orig.Timestamp.WallTime) - sys := isSysLocal(key) + if isSysLocal(key) { + // Straightforward: old contribution goes, new contribution comes, and we're done. + ms.SysBytes += (metaKeySize + metaValSize) - (origMetaValSize + origMetaKeySize) + return ms + } + // An intent can't turn from deleted to non-deleted and vice versa while being + // resolved. if orig.Deleted != meta.Deleted { log.Fatalf(context.TODO(), "on resolve, original meta was deleted=%t, but new one is deleted=%t", orig.Deleted, meta.Deleted) } - if sys { - ms.SysBytes += (metaKeySize + metaValSize) - (origMetaValSize + origMetaKeySize) - } else { - // At orig.Timestamp, the original meta key disappears. - ms.KeyBytes -= origMetaKeySize + orig.KeyBytes - ms.ValBytes -= origMetaValSize + orig.ValBytes + // In the main case, we had an old intent at orig.Timestamp, and a new intent + // or value at meta.Timestamp. We'll walk through the contributions below, + // taking special care for IntentAge and GCBytesAge. + // + // Jump into the method below for extensive commentary on their semantics + // and "rules one and two". + _ = updateStatsOnPut - ms.IntentBytes -= orig.KeyBytes + orig.ValBytes - ms.IntentCount-- + ms.AgeTo(orig.Timestamp.WallTime) - // If the old intent is a deletion, then the key already isn't tracked - // in LiveBytes any more (and the new intent/value is also a deletion). - // If we're looking at a non-deletion intent/value, update the live - // bytes to account for the difference between the previous intent and - // the new intent/value. - if !meta.Deleted { - ms.LiveBytes -= (origMetaKeySize + origMetaValSize) + (orig.KeyBytes + meta.ValBytes) - } + // At orig.Timestamp, the original meta key disappears. Fortunately, the + // GCBytesAge computations are fairly transparent because the intent is either + // not a deletion in which case it is always live (it's the most recent value, + // so it isn't shadowed -- see rule 1), or it *is* a deletion, in which case + // its own timestamp is where it starts accruing GCBytesAge (rule 2). + ms.KeyBytes -= origMetaKeySize + orig.KeyBytes + ms.ValBytes -= origMetaValSize + orig.ValBytes + + // If the old intent is a deletion, then the key already isn't tracked + // in LiveBytes any more (and the new intent/value is also a deletion). + // If we're looking at a non-deletion intent/value, update the live + // bytes to account for the difference between the previous intent and + // the new intent/value. + if !meta.Deleted { + ms.LiveBytes -= origMetaKeySize + origMetaValSize + ms.LiveBytes -= orig.KeyBytes + meta.ValBytes + } + + // IntentAge is always accrued from the intent's own timestamp on. + ms.IntentBytes -= orig.KeyBytes + orig.ValBytes + ms.IntentCount-- + + // If there was a previous value (before orig.Timestamp), and it was not a + // deletion tombstone, then we have to adjust its GCBytesAge contribution + // which was previously anchored at orig.Timestamp and now has to move to + // meta.Timestamp. Paralleling very similar code in the method below, this + // is achieved by making the previous key live between orig.Timestamp and + // meta.Timestamp. When the two are equal, this will be a zero adjustment, + // and so in that case the caller may simply pass prevValSize=0 and can + // skip computing that quantity in the first place. + _ = updateStatsOnPut + prevIsValue := prevValSize > 0 + + if prevIsValue { + ms.LiveBytes += mvccVersionTimestampSize + prevValSize + } - ms.AgeTo(meta.Timestamp.WallTime) + ms.AgeTo(meta.Timestamp.WallTime) - // At meta.Timestamp, the new meta key appears. - ms.KeyBytes += metaKeySize + meta.KeyBytes - ms.ValBytes += metaValSize + meta.ValBytes + if prevIsValue { + // The previous non-deletion value becomes non-live at meta.Timestamp. + // See the sibling code above. + ms.LiveBytes -= mvccVersionTimestampSize + prevValSize + } - if !commit { - // If not committing, the intent reappears (but at meta.Timestamp). - // - // This is the case in which an intent is pushed (a similar case - // happens when an intent is overwritten, but that's handled in - // updateStatsOnPut, not this method). - ms.IntentBytes += meta.KeyBytes + meta.ValBytes - ms.IntentCount++ - } + // At meta.Timestamp, the new meta key appears. + ms.KeyBytes += metaKeySize + meta.KeyBytes + ms.ValBytes += metaValSize + meta.ValBytes - if !meta.Deleted { - ms.LiveBytes += (metaKeySize + metaValSize) + (meta.KeyBytes + meta.ValBytes) - } + // The new meta key appears. + if !meta.Deleted { + ms.LiveBytes += (metaKeySize + metaValSize) + (meta.KeyBytes + meta.ValBytes) + } + + if !commit { + // If not committing, the intent reappears (but at meta.Timestamp). + // + // This is the case in which an intent is pushed (a similar case + // happens when an intent is overwritten, but that's handled in + // updateStatsOnPut, not this method). + ms.IntentBytes += meta.KeyBytes + meta.ValBytes + ms.IntentCount++ } + return ms } @@ -393,56 +492,74 @@ func updateStatsOnAbort( origMetaKeySize, origMetaValSize, restoredMetaKeySize, restoredMetaValSize int64, orig, restored *enginepb.MVCCMetadata, - restoredNanos, txnNanos int64, + restoredNanos int64, ) enginepb.MVCCStats { - sys := isSysLocal(key) - var ms enginepb.MVCCStats - // Three epochs of time here: - // 1) creation of previous value (or 0) to creation of intent: - // [restoredNanos, orig.Timestamp.WallTime) - // 2) creation of the intent (which we're now aborting) to the timestamp - // at which we're aborting: - // [orig.Timestamp.WallTime, txnNanos) - if restored != nil { - ms.AgeTo(restoredNanos) - if sys { + if isSysLocal(key) { + if restored != nil { ms.SysBytes += restoredMetaKeySize + restoredMetaValSize ms.SysCount++ - } else { - if !restored.Deleted { - ms.LiveBytes += restored.KeyBytes + restored.ValBytes + restoredMetaKeySize + restoredMetaValSize - ms.LiveCount++ - } + } + + ms.SysBytes -= (orig.KeyBytes + orig.ValBytes) + (origMetaKeySize + origMetaValSize) + ms.SysCount-- + return ms + } + + // If we're restoring a previous value (which is thus not an intent), there are + // two main cases: + // + // 1. the previous value is a tombstone, so according to rule 2 it accrues + // GCBytesAge from its own timestamp on (we need to adjust only for the + // implicit meta key that "pops up" at that timestamp), -- or -- + // 2. it is not, and it has been shadowed by the intent we're now aborting, + // in which case we need to offset its GCBytesAge contribution from + // restoredNanos to orig.Timestamp (rule 1). + if restored != nil { + if restored.Txn != nil { + panic("restored version should never be an intent") + } + + ms.AgeTo(restoredNanos) + + if restored.Deleted { + // The new meta key will be implicit and at restoredNanos. It needs to + // catch up on the GCBytesAge from that point on until orig.Timestamp + // (rule 2). ms.KeyBytes += restoredMetaKeySize ms.ValBytes += restoredMetaValSize - ms.KeyCount++ - if restored.Txn != nil { - panic("restored version should never be an intent") - } } - } - ms.AgeTo(orig.Timestamp.WallTime) + ms.AgeTo(orig.Timestamp.WallTime) - origTotalBytes := orig.KeyBytes + orig.ValBytes + origMetaKeySize + origMetaValSize - if sys { - ms.SysBytes -= origTotalBytes - ms.SysCount-- - } else { - if !orig.Deleted { - ms.LiveBytes -= origTotalBytes - ms.LiveCount-- + ms.KeyCount++ + + if !restored.Deleted { + // At orig.Timestamp, make the non-deletion version live again. + // Note that there's no need to explicitly age to the "present time" + // after. + ms.KeyBytes += restoredMetaKeySize + ms.ValBytes += restoredMetaValSize + + ms.LiveBytes += restored.KeyBytes + restored.ValBytes + ms.LiveCount++ + ms.LiveBytes += restoredMetaKeySize + restoredMetaValSize } - ms.KeyBytes -= (orig.KeyBytes + origMetaKeySize) - ms.ValBytes -= (orig.ValBytes + origMetaValSize) - ms.KeyCount-- - ms.ValCount-- - ms.IntentBytes -= (orig.KeyBytes + orig.ValBytes) - ms.IntentCount-- + } else { + ms.AgeTo(orig.Timestamp.WallTime) + } + + if !orig.Deleted { + ms.LiveBytes -= (orig.KeyBytes + orig.ValBytes) + (origMetaKeySize + origMetaValSize) + ms.LiveCount-- } - ms.AgeTo(txnNanos) + ms.KeyBytes -= (orig.KeyBytes + origMetaKeySize) + ms.ValBytes -= (orig.ValBytes + origMetaValSize) + ms.KeyCount-- + ms.ValCount-- + ms.IntentBytes -= (orig.KeyBytes + orig.ValBytes) + ms.IntentCount-- return ms } @@ -452,27 +569,32 @@ func updateStatsOnAbort( // value counts, and updating the GC'able bytes age. If meta is // not nil, then the value being GC'd is the mvcc metadata and we // decrement the key count. +// +// nonLiveMS is the timestamp at which the value became non-live. +// For a deletion tombstone this will be its own timestamp (rule two +// in updateStatsOnPut) and for a regular version it will be the closest +// newer version's (rule one). func updateStatsOnGC( - key roachpb.Key, keySize, valSize int64, meta *enginepb.MVCCMetadata, fromNS, toNS int64, + key roachpb.Key, keySize, valSize int64, meta *enginepb.MVCCMetadata, nonLiveMS int64, ) enginepb.MVCCStats { var ms enginepb.MVCCStats - ms.AgeTo(fromNS) - sys := isSysLocal(key) - if sys { + + if isSysLocal(key) { ms.SysBytes -= (keySize + valSize) if meta != nil { ms.SysCount-- } + return ms + } + + ms.AgeTo(nonLiveMS) + ms.KeyBytes -= keySize + ms.ValBytes -= valSize + if meta != nil { + ms.KeyCount-- } else { - ms.KeyBytes -= keySize - ms.ValBytes -= valSize - if meta != nil { - ms.KeyCount-- - } else { - ms.ValCount-- - } + ms.ValCount-- } - ms.AgeTo(toNS) return ms } @@ -1117,6 +1239,7 @@ func mvccPutInternal( var meta *enginepb.MVCCMetadata var maybeTooOldErr error + var prevValSize int64 if ok { // There is existing metadata for this key; ensure our write is permitted. meta = &buf.meta @@ -1149,9 +1272,24 @@ func mvccPutInternal( // writing at the same timestamp we can simply overwrite it; // otherwise we must explicitly delete the obsolete intent. if timestamp != metaTimestamp { + { + // If the older write intent has a version underneath it, we need to + // read its size because its GCBytesAge contribution may change as we + // move the intent above it. A similar phenomenon occurs in + // MVCCResolveWriteIntent. + latestKey := MVCCKey{Key: key, Timestamp: metaTimestamp} + _, prevVal, haveNextVersion, err := unsafeNextVersion(iter, latestKey) + if err != nil { + return err + } + if haveNextVersion { + prevValSize = int64(len(prevVal)) + } + } + versionKey := metaKey versionKey.Timestamp = metaTimestamp - if err = engine.Clear(versionKey); err != nil { + if err := engine.Clear(versionKey); err != nil { return err } } @@ -1237,7 +1375,7 @@ func mvccPutInternal( // Update MVCC stats. if ms != nil { - ms.Add(updateStatsOnPut(key, origMetaKeySize, origMetaValSize, + ms.Add(updateStatsOnPut(key, prevValSize, origMetaKeySize, origMetaValSize, metaKeySize, metaValSize, meta, newMeta)) } @@ -2166,19 +2304,38 @@ func mvccResolveWriteIntent( hlc.Timestamp(meta.Timestamp).Less(intent.Txn.Timestamp) && meta.Txn.Epoch >= intent.Txn.Epoch - // If we're committing, or if the commit timestamp of the intent has - // been moved forward, and if the proposed epoch matches the existing - // epoch: update the meta.Txn. For commit, it's set to nil; - // otherwise, we update its value. We may have to update the actual - // version value (remove old and create new with proper - // timestamp-encoded key) if timestamp changed. + // If we're committing, or if the commit timestamp of the intent has been moved forward, and if + // the proposed epoch matches the existing epoch: update the meta.Txn. For commit, it's set to + // nil; otherwise, we update its value. We may have to update the actual version value (remove old + // and create new with proper timestamp-encoded key) if timestamp changed. if commit || pushed { buf.newMeta = *meta // Set the timestamp for upcoming write (or at least the stats update). buf.newMeta.Timestamp = hlc.LegacyTimestamp(intent.Txn.Timestamp) + // If the new intent/value is at a different timestamp than the old intent, + // and there is a value under both, then that value may need an adjustment + // of its GCBytesAge. This is because it became non-live at orig.Timestamp + // originally, and now only becomes non-live at newMeta.Timestamp. For that + // reason, we have to read that version's size. + // + // However, if we're not actually moving the intent, no stats adjustment is + // necessary and we avoid reading the old version in that case. + var prevValSize int64 + if buf.newMeta.Timestamp != meta.Timestamp { + // Look for the first real versioned key, i.e. the key just below the (old) meta's + // timestamp. + latestKey := MVCCKey{Key: intent.Key, Timestamp: hlc.Timestamp(meta.Timestamp)} + _, unsafeNextValue, haveNextVersion, err := unsafeNextVersion(iter, latestKey) + if err != nil { + return false, err + } + if haveNextVersion { + prevValSize = int64(len(unsafeNextValue)) + } + } + var metaKeySize, metaValSize int64 - var err error if pushed { // Keep intent if we're pushing timestamp. buf.newTxn = intent.Txn @@ -2194,7 +2351,7 @@ func mvccResolveWriteIntent( // Update stat counters related to resolving the intent. if ms != nil { - ms.Add(updateStatsOnResolve(intent.Key, origMetaKeySize, origMetaValSize, + ms.Add(updateStatsOnResolve(intent.Key, prevValSize, origMetaKeySize, origMetaValSize, metaKeySize, metaValSize, *meta, buf.newMeta, commit)) } @@ -2256,7 +2413,7 @@ func mvccResolveWriteIntent( } // Clear stat counters attributable to the intent we're aborting. if ms != nil { - ms.Add(updateStatsOnAbort(intent.Key, origMetaKeySize, origMetaValSize, 0, 0, meta, nil, 0, intent.Txn.Timestamp.WallTime)) + ms.Add(updateStatsOnAbort(intent.Key, origMetaKeySize, origMetaValSize, 0, 0, meta, nil, 0)) } return true, nil } @@ -2278,8 +2435,7 @@ func mvccResolveWriteIntent( // Update stat counters with older version. if ms != nil { ms.Add(updateStatsOnAbort(intent.Key, origMetaKeySize, origMetaValSize, - metaKeySize, metaValSize, meta, &buf.newMeta, unsafeNextKey.Timestamp.WallTime, - intent.Txn.Timestamp.WallTime)) + metaKeySize, metaValSize, meta, &buf.newMeta, unsafeNextKey.Timestamp.WallTime)) } return true, nil @@ -2450,8 +2606,7 @@ func MVCCGarbageCollect( updateStatsForInline(ms, gcKey.Key, metaKeySize, metaValSize, 0, 0) ms.AgeTo(timestamp.WallTime) } else { - ms.Add(updateStatsOnGC(gcKey.Key, metaKeySize, metaValSize, - meta, meta.Timestamp.WallTime, timestamp.WallTime)) + ms.Add(updateStatsOnGC(gcKey.Key, metaKeySize, metaValSize, meta, meta.Timestamp.WallTime)) } } if !implicitMeta { @@ -2467,7 +2622,19 @@ func MVCCGarbageCollect( iter.Next() } + // TODO(tschottdorf): Can't we just Seek() to a key with timestamp + // gcKey.Timestamp to avoid potentially cycling through a large prefix + // of versions we can't GC? The batching mechanism in the GC queue sends + // requests susceptible to that happening when there are lots of versions. + // A minor complication there will be that we need to know the first non- + // deletable value's timestamp (for prevNanos). + // Now, iterate through all values, GC'ing ones which have expired. + // For GCBytesAge, this requires keeping track of the previous key's + // timestamp (prevNanos). See ComputeStatsGo for a more easily digested + // and better commented version of this logic. + + prevNanos := timestamp.WallTime for ; ; iter.Next() { if ok, err := iter.Valid(); err != nil { return err @@ -2483,15 +2650,26 @@ func MVCCGarbageCollect( } if !gcKey.Timestamp.Less(unsafeIterKey.Timestamp) { if ms != nil { + // FIXME: use prevNanos instead of unsafeIterKey.Timestamp, except + // when it's a deletion. + valSize := int64(len(iter.UnsafeValue())) + + // A non-deletion becomes non-live when its newer neighbor shows up. + // A deletion tombstone becomes non-live right when it is created. + fromNS := prevNanos + if valSize == 0 { + fromNS = unsafeIterKey.Timestamp.WallTime + } + ms.Add(updateStatsOnGC(gcKey.Key, mvccVersionTimestampSize, - int64(len(iter.UnsafeValue())), nil, unsafeIterKey.Timestamp.WallTime, - timestamp.WallTime)) + valSize, nil, fromNS)) } count++ if err := engine.Clear(unsafeIterKey); err != nil { return err } } + prevNanos = unsafeIterKey.Timestamp.WallTime } } @@ -2586,6 +2764,13 @@ func ComputeStatsGo( var prevKey []byte first := false + // Values start accruing GCBytesAge at the timestamp at which they + // are shadowed (i.e. overwritten) whereas deletion tombstones + // use their own timestamp. We're iterating through versions in + // reverse chronological order and use this variable to keep track + // of the point in time at which the current key begins to age. + var accrueGCAgeNanos int64 + iter.Seek(start) for ; ; iter.Next() { ok, err := iter.Valid() @@ -2680,9 +2865,20 @@ func ComputeStatsGo( if meta.ValBytes != int64(len(unsafeValue)) { return ms, errors.Errorf("expected mvcc metadata val bytes to equal %d; got %d", len(unsafeValue), meta.ValBytes) } + accrueGCAgeNanos = meta.Timestamp.WallTime } else { - // Overwritten value; add value bytes to the GC'able bytes age stat. - ms.GCBytesAge += totalBytes * (nowNanos/1E9 - unsafeKey.Timestamp.WallTime/1E9) + // Overwritten value. Is it a deletion tombstone? + isTombstone := len(unsafeValue) == 0 + if isTombstone { + // The contribution of the tombstone picks up GCByteAge from its own timestamp on. + ms.GCBytesAge += totalBytes * (nowNanos/1E9 - unsafeKey.Timestamp.WallTime/1E9) + } else { + // The kv pair is an overwritten value, so it became non-live when the closest more + // recent value was written. + ms.GCBytesAge += totalBytes * (nowNanos/1E9 - accrueGCAgeNanos/1E9) + } + // Update for the next version we may end up looking at. + accrueGCAgeNanos = unsafeKey.Timestamp.WallTime } ms.KeyBytes += mvccVersionTimestampSize ms.ValBytes += int64(len(unsafeValue)) diff --git a/pkg/storage/engine/mvcc_stats_test.go b/pkg/storage/engine/mvcc_stats_test.go index 4ca731e4b105..c9f18bafe0f4 100644 --- a/pkg/storage/engine/mvcc_stats_test.go +++ b/pkg/storage/engine/mvcc_stats_test.go @@ -39,7 +39,7 @@ import ( // assertEq compares the given ms and expMS and errors when they don't match. It // also recomputes the stats over the whole engine with all known // implementations and errors on mismatch with any of them. -func assertEq(t *testing.T, engine Engine, debug string, ms, expMS *enginepb.MVCCStats) { +func assertEq(t *testing.T, engine ReadWriter, debug string, ms, expMS *enginepb.MVCCStats) { t.Helper() msCpy := *ms // shallow copy @@ -47,7 +47,7 @@ func assertEq(t *testing.T, engine Engine, debug string, ms, expMS *enginepb.MVC ms.AgeTo(expMS.LastUpdateNanos) if !ms.Equal(expMS) { pretty.Ldiff(t, ms, expMS) - t.Errorf("%s: mismatch detected", debug) + t.Errorf("%s: diff(ms, expMS) nontrivial", debug) } it := engine.NewIterator(false) @@ -83,7 +83,7 @@ func TestMVCCStatsDeleteCommitMovesTimestamp(t *testing.T) { ts1 := hlc.Timestamp{WallTime: 1E9} // Put a value. value := roachpb.MakeValueFromString("value") - if err := MVCCPut(context.Background(), engine, aggMS, key, ts1, value, nil); err != nil { + if err := MVCCPut(ctx, engine, aggMS, key, ts1, value, nil); err != nil { t.Fatal(err) } @@ -91,8 +91,6 @@ func TestMVCCStatsDeleteCommitMovesTimestamp(t *testing.T) { vKeySize := mvccVersionTimestampSize // 12 vValSize := int64(len(value.RawBytes)) // 10 - log.Infof(ctx, "mKeySize=%d vKeySize=%d vValSize=%d", mKeySize, vKeySize, vValSize) - expMS := enginepb.MVCCStats{ LiveBytes: mKeySize + vKeySize + vValSize, // 24 LiveCount: 1, @@ -108,7 +106,7 @@ func TestMVCCStatsDeleteCommitMovesTimestamp(t *testing.T) { ts3 := hlc.Timestamp{WallTime: 3 * 1E9} txn := &roachpb.Transaction{TxnMeta: enginepb.TxnMeta{ID: uuid.MakeV4(), Timestamp: ts3}} txn.Timestamp.Forward(ts3) - if err := MVCCDelete(context.Background(), engine, aggMS, key, ts3, txn); err != nil { + if err := MVCCDelete(ctx, engine, aggMS, key, ts3, txn); err != nil { t.Fatal(err) } @@ -117,7 +115,7 @@ func TestMVCCStatsDeleteCommitMovesTimestamp(t *testing.T) { ts4 := hlc.Timestamp{WallTime: 4 * 1E9} txn.Status = roachpb.COMMITTED txn.Timestamp.Forward(ts4) - if err := MVCCResolveWriteIntent(context.Background(), engine, aggMS, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { + if err := MVCCResolveWriteIntent(ctx, engine, aggMS, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { t.Fatal(err) } @@ -129,9 +127,10 @@ func TestMVCCStatsDeleteCommitMovesTimestamp(t *testing.T) { ValCount: 2, // The implicit meta record (deletion tombstone) counts for len("a")+1=2. // Two versioned keys count for 2*vKeySize. - KeyBytes: mKeySize + 2*vKeySize, - ValBytes: vValSize, // the initial write (10)... - GCBytesAge: (vValSize + vKeySize) * 3, // ... along with the value (12) aged over 3s, a total of 66 + KeyBytes: mKeySize + 2*vKeySize, + ValBytes: vValSize, // the initial write (10) + // No GCBytesAge has been accrued yet, as the value just got non-live at 4s. + GCBytesAge: 0, } assertEq(t, engine, "after committing", aggMS, &expAggMS) @@ -256,7 +255,9 @@ func TestMVCCStatsPutPushMovesTimestamp(t *testing.T) { // push as it would happen for a SNAPSHOT txn) ts4 := hlc.Timestamp{WallTime: 4 * 1E9} txn.Timestamp.Forward(ts4) - if err := MVCCResolveWriteIntent(ctx, engine, aggMS, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { + if err := MVCCResolveWriteIntent( + ctx, engine, aggMS, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}, + ); err != nil { t.Fatal(err) } @@ -279,6 +280,489 @@ func TestMVCCStatsPutPushMovesTimestamp(t *testing.T) { assertEq(t, engine, "after pushing", aggMS, &expAggMS) } +// TestMVCCStatsDeleteMovesTimestamp is similar to TestMVCCStatsPutCommitMovesTimestamp: +// An intent is written and then re-written at a higher timestamp. This formerly messed up +// the GCBytesAge computation. +func TestMVCCStatsDeleteMovesTimestamp(t *testing.T) { + defer leaktest.AfterTest(t)() + engine := createTestEngine() + defer engine.Close() + + ctx := context.Background() + aggMS := &enginepb.MVCCStats{} + + assertEq(t, engine, "initially", aggMS, &enginepb.MVCCStats{}) + + ts1 := hlc.Timestamp{WallTime: 1E9} + ts2 := hlc.Timestamp{WallTime: 2 * 1E9} + + key := roachpb.Key("a") + txn := &roachpb.Transaction{TxnMeta: enginepb.TxnMeta{ID: uuid.MakeV4(), Timestamp: ts1}} + + // Write an intent. + value := roachpb.MakeValueFromString("value") + if err := MVCCPut(ctx, engine, aggMS, key, ts1, value, txn); err != nil { + t.Fatal(err) + } + + mKeySize := int64(mvccKey(key).EncodedSize()) + require.EqualValues(t, mKeySize, 2) + + mVal1Size := int64((&enginepb.MVCCMetadata{ + Timestamp: hlc.LegacyTimestamp(ts1), + Deleted: false, + Txn: &txn.TxnMeta, + }).Size()) + require.EqualValues(t, mVal1Size, 44) + + m1ValSize := int64((&enginepb.MVCCMetadata{ + Timestamp: hlc.LegacyTimestamp(ts2), + Deleted: false, + Txn: &txn.TxnMeta, + }).Size()) + require.EqualValues(t, m1ValSize, 44) + + vKeySize := mvccVersionTimestampSize + require.EqualValues(t, vKeySize, 12) + + vValSize := int64(len(value.RawBytes)) + require.EqualValues(t, vValSize, 10) + + expMS := enginepb.MVCCStats{ + LastUpdateNanos: 1E9, + LiveBytes: mKeySize + m1ValSize + vKeySize + vValSize, // 2+44+12+10 = 68 + LiveCount: 1, + KeyBytes: mKeySize + vKeySize, // 2+12 = 14 + KeyCount: 1, + ValBytes: mVal1Size + vValSize, // 44+10 = 54 + ValCount: 1, + IntentAge: 0, + IntentCount: 1, + IntentBytes: vKeySize + vValSize, // 12+10 = 22 + } + assertEq(t, engine, "after put", aggMS, &expMS) + + // Now replace our intent with a deletion intent, but with a timestamp gap. + // This could happen if a transaction got restarted with a higher timestamp + // and ran logic different from that in the first attempt. + txn.Timestamp.Forward(ts2) + + txn.Sequence++ + + // Annoyingly, the new meta value is actually a little larger thanks to the + // sequence number. + m2ValSize := int64((&enginepb.MVCCMetadata{ + Timestamp: hlc.LegacyTimestamp(ts2), + Txn: &txn.TxnMeta, + }).Size()) + require.EqualValues(t, m2ValSize, 46) + + if err := MVCCDelete(ctx, engine, aggMS, key, ts2, txn); err != nil { + t.Fatal(err) + } + + expAggMS := enginepb.MVCCStats{ + LastUpdateNanos: 2E9, + LiveBytes: 0, + LiveCount: 0, + KeyCount: 1, + ValCount: 1, + // The explicit meta record counts for len("a")+1=2. + // One versioned key counts for vKeySize. + KeyBytes: mKeySize + vKeySize, + // The intent is still there, but this time with mVal2Size, and a zero vValSize. + ValBytes: m2ValSize, // 10+46 = 56 + IntentAge: 0, + IntentCount: 1, // still there + IntentBytes: vKeySize, // still there, but now without vValSize + GCBytesAge: 0, // this was once erroneously negative + } + + assertEq(t, engine, "after deleting", aggMS, &expAggMS) +} + +// TestMVCCStatsPutMovesDeletionTimestamp is similar to TestMVCCStatsPutCommitMovesTimestamp: A +// tombstone intent is written and then replaced by a value intent at a higher timestamp. This +// formerly messed up the GCBytesAge computation. +func TestMVCCStatsPutMovesDeletionTimestamp(t *testing.T) { + defer leaktest.AfterTest(t)() + engine := createTestEngine() + defer engine.Close() + + ctx := context.Background() + aggMS := &enginepb.MVCCStats{} + + assertEq(t, engine, "initially", aggMS, &enginepb.MVCCStats{}) + + ts1 := hlc.Timestamp{WallTime: 1E9} + ts2 := hlc.Timestamp{WallTime: 2 * 1E9} + + key := roachpb.Key("a") + txn := &roachpb.Transaction{TxnMeta: enginepb.TxnMeta{ID: uuid.MakeV4(), Timestamp: ts1}} + + // Write a deletion tombstone intent. + if err := MVCCDelete(ctx, engine, aggMS, key, ts1, txn); err != nil { + t.Fatal(err) + } + + value := roachpb.MakeValueFromString("value") + + mKeySize := int64(mvccKey(key).EncodedSize()) + require.EqualValues(t, mKeySize, 2) + + mVal1Size := int64((&enginepb.MVCCMetadata{ + Timestamp: hlc.LegacyTimestamp(ts1), + Deleted: false, + Txn: &txn.TxnMeta, + }).Size()) + require.EqualValues(t, mVal1Size, 44) + + m1ValSize := int64((&enginepb.MVCCMetadata{ + Timestamp: hlc.LegacyTimestamp(ts2), + Deleted: false, + Txn: &txn.TxnMeta, + }).Size()) + require.EqualValues(t, m1ValSize, 44) + + vKeySize := mvccVersionTimestampSize + require.EqualValues(t, vKeySize, 12) + + vValSize := int64(len(value.RawBytes)) + require.EqualValues(t, vValSize, 10) + + expMS := enginepb.MVCCStats{ + LastUpdateNanos: 1E9, + LiveBytes: 0, + LiveCount: 0, + KeyBytes: mKeySize + vKeySize, // 2 + 12 = 24 + KeyCount: 1, + ValBytes: mVal1Size, // 44 + ValCount: 1, + IntentAge: 0, + IntentCount: 1, + IntentBytes: vKeySize, // 12 + GCBytesAge: 0, + } + assertEq(t, engine, "after delete", aggMS, &expMS) + + // Now replace our deletion with a value intent, but with a timestamp gap. + // This could happen if a transaction got restarted with a higher timestamp + // and ran logic different from that in the first attempt. + txn.Timestamp.Forward(ts2) + + txn.Sequence++ + + // Annoyingly, the new meta value is actually a little larger thanks to the + // sequence number. + m2ValSize := int64((&enginepb.MVCCMetadata{ + Timestamp: hlc.LegacyTimestamp(ts2), + Txn: &txn.TxnMeta, + }).Size()) + require.EqualValues(t, m2ValSize, 46) + + if err := MVCCPut(ctx, engine, aggMS, key, ts2, value, txn); err != nil { + t.Fatal(err) + } + + expAggMS := enginepb.MVCCStats{ + LastUpdateNanos: 2E9, + LiveBytes: mKeySize + m2ValSize + vKeySize + vValSize, // 2+46+12+10 = 70 + LiveCount: 1, + KeyCount: 1, + ValCount: 1, + // The explicit meta record counts for len("a")+1=2. + // One versioned key counts for vKeySize. + KeyBytes: mKeySize + vKeySize, + // The intent is still there, but this time with mVal2Size, and a zero vValSize. + ValBytes: vValSize + m2ValSize, // 10+46 = 56 + IntentAge: 0, + IntentCount: 1, // still there + IntentBytes: vKeySize + vValSize, // still there, now bigger + GCBytesAge: 0, // this was once erroneously negative + } + + assertEq(t, engine, "after put", aggMS, &expAggMS) +} + +// TestMVCCStatsDelDelCommit writes a non-transactional tombstone, and then adds an intent tombstone +// on top that is then committed or aborted at a higher timestamp. This random-looking sequence was +// the smallest failing example that highlighted a stats computation error once, and exercises a +// code path in which MVCCResolveIntent has to read the pre-intent value in order to arrive at the +// correct stats. +func TestMVCCStatsDelDelCommitMovesTimestamp(t *testing.T) { + defer leaktest.AfterTest(t)() + engine := createTestEngine() + defer engine.Close() + + ctx := context.Background() + aggMS := &enginepb.MVCCStats{} + + assertEq(t, engine, "initially", aggMS, &enginepb.MVCCStats{}) + + key := roachpb.Key("a") + + ts1 := hlc.Timestamp{WallTime: 1E9} + ts2 := hlc.Timestamp{WallTime: 2E9} + ts3 := hlc.Timestamp{WallTime: 3E9} + + // Write a non-transactional tombstone at t=1s. + if err := MVCCDelete(ctx, engine, aggMS, key, ts1, nil /* txn */); err != nil { + t.Fatal(err) + } + + mKeySize := int64(mvccKey(key).EncodedSize()) + require.EqualValues(t, mKeySize, 2) + vKeySize := mvccVersionTimestampSize + require.EqualValues(t, vKeySize, 12) + + expMS := enginepb.MVCCStats{ + LastUpdateNanos: 1E9, + KeyBytes: mKeySize + vKeySize, + KeyCount: 1, + ValBytes: 0, + ValCount: 1, + } + + assertEq(t, engine, "after non-transactional delete", aggMS, &expMS) + + // Write an tombstone intent at t=2s (anchored at ts=1s, just for fun). + txn := &roachpb.Transaction{TxnMeta: enginepb.TxnMeta{ID: uuid.MakeV4(), Timestamp: ts1}} + if err := MVCCDelete(ctx, engine, aggMS, key, ts2, txn); err != nil { + t.Fatal(err) + } + + mValSize := int64((&enginepb.MVCCMetadata{ + Timestamp: hlc.LegacyTimestamp(ts1), + Deleted: true, + Txn: &txn.TxnMeta, + }).Size()) + require.EqualValues(t, mValSize, 44) + + expMS = enginepb.MVCCStats{ + LastUpdateNanos: 2E9, + KeyBytes: mKeySize + 2*vKeySize, // 2+2*12 = 26 + KeyCount: 1, + ValBytes: mValSize, // 44 + ValCount: 2, + IntentCount: 1, + IntentBytes: vKeySize, // TBD + // The original non-transactional write (at 1s) has now aged one second. + GCBytesAge: 1 * vKeySize, + } + assertEq(t, engine, "after put", aggMS, &expMS) + + // Now commit or abort the intent, respectively, but with a timestamp gap + // (i.e. this is a push-commit as it would happen for a SNAPSHOT txn). + t.Run("Commit", func(t *testing.T) { + aggMS := *aggMS + engine := engine.NewBatch() + defer engine.Close() + txn := txn.Clone() + + txn.Status = roachpb.COMMITTED + txn.Timestamp.Forward(ts3) + if err := MVCCResolveWriteIntent(ctx, engine, &aggMS, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { + t.Fatal(err) + } + + expAggMS := enginepb.MVCCStats{ + LastUpdateNanos: 3E9, + KeyBytes: mKeySize + 2*vKeySize, // 2+2*12 = 26 + KeyCount: 1, + ValBytes: 0, + ValCount: 2, + IntentCount: 0, + IntentBytes: 0, + // The very first write picks up another second of age. Before a bug fix, + // this was failing to do so. + GCBytesAge: 2 * vKeySize, + } + + assertEq(t, engine, "after committing", &aggMS, &expAggMS) + }) + t.Run("Abort", func(t *testing.T) { + aggMS := *aggMS + engine := engine.NewBatch() + defer engine.Close() + + txn := txn.Clone() + + txn.Status = roachpb.ABORTED + txn.Timestamp.Forward(ts3) + if err := MVCCResolveWriteIntent(ctx, engine, &aggMS, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { + t.Fatal(err) + } + + expAggMS := enginepb.MVCCStats{ + LastUpdateNanos: 3E9, + KeyBytes: mKeySize + vKeySize, // 2+12 = 14 + KeyCount: 1, + ValBytes: 0, + ValCount: 1, + IntentCount: 0, + IntentBytes: 0, + // We aborted our intent, but the value we first wrote was a tombstone, and + // so it's expected to retain its age. Since it's now the only value, it + // also contributes as a meta key. + GCBytesAge: 2 * (mKeySize + vKeySize), + } + + assertEq(t, engine, "after aborting", &aggMS, &expAggMS) + }) +} + +// TestMVCCStatsPutDelPut is similar to TestMVCCStatsDelDelCommit, but its first +// non-transactional write is not a tombstone but a real value. This exercises a +// different code path as a tombstone starts accruing GCBytesAge at its own timestamp, +// but values only when the next value is written, which makes the computation tricky +// when that next value is an intent that changes its timestamp before committing. +// Finishing the sequence with a Put in particular exercises a case in which the +// final correction is done in the put path and not the commit path. +func TestMVCCStatsPutDelPutMovesTimestamp(t *testing.T) { + defer leaktest.AfterTest(t)() + engine := createTestEngine() + defer engine.Close() + + ctx := context.Background() + aggMS := &enginepb.MVCCStats{} + + assertEq(t, engine, "initially", aggMS, &enginepb.MVCCStats{}) + + key := roachpb.Key("a") + + ts1 := hlc.Timestamp{WallTime: 1E9} + ts2 := hlc.Timestamp{WallTime: 2E9} + ts3 := hlc.Timestamp{WallTime: 3E9} + + // Write a non-transactional value at t=1s. + value := roachpb.MakeValueFromString("value") + if err := MVCCPut(ctx, engine, aggMS, key, ts1, value, nil /* txn */); err != nil { + t.Fatal(err) + } + + mKeySize := int64(mvccKey(key).EncodedSize()) + require.EqualValues(t, mKeySize, 2) + + vKeySize := mvccVersionTimestampSize + require.EqualValues(t, vKeySize, 12) + + vValSize := int64(len(value.RawBytes)) + require.EqualValues(t, vValSize, 10) + + expMS := enginepb.MVCCStats{ + LastUpdateNanos: 1E9, + KeyBytes: mKeySize + vKeySize, + KeyCount: 1, + ValBytes: vValSize, + ValCount: 1, + LiveBytes: mKeySize + vKeySize + vValSize, + LiveCount: 1, + } + + assertEq(t, engine, "after non-transactional put", aggMS, &expMS) + + // Write a tombstone intent at t=2s (anchored at ts=1s, just for fun). + txn := &roachpb.Transaction{TxnMeta: enginepb.TxnMeta{ID: uuid.MakeV4(), Timestamp: ts1}} + if err := MVCCDelete(ctx, engine, aggMS, key, ts2, txn); err != nil { + t.Fatal(err) + } + + mValSize := int64((&enginepb.MVCCMetadata{ + Timestamp: hlc.LegacyTimestamp(ts1), + Deleted: true, + Txn: &txn.TxnMeta, + }).Size()) + require.EqualValues(t, mValSize, 44) + + expMS = enginepb.MVCCStats{ + LastUpdateNanos: 2E9, + KeyBytes: mKeySize + 2*vKeySize, // 2+2*12 = 26 + KeyCount: 1, + ValBytes: mValSize + vValSize, // 44+10 = 56 + ValCount: 2, + IntentCount: 1, + IntentBytes: vKeySize, // 12 + // The original non-transactional write becomes non-live at 2s, so no age + // is accrued yet. + GCBytesAge: 0, + } + assertEq(t, engine, "after txn delete", aggMS, &expMS) + + // Now commit or abort the intent, but with a timestamp gap (i.e. this is a push-commit as it + // would happen for a SNAPSHOT txn) + + txn.Timestamp.Forward(ts3) + txn.Sequence++ + + // Annoyingly, the new meta value is actually a little larger thanks to the + // sequence number. + m2ValSize := int64((&enginepb.MVCCMetadata{ + Timestamp: hlc.LegacyTimestamp(ts3), + Txn: &txn.TxnMeta, + }).Size()) + + require.EqualValues(t, m2ValSize, 46) + + t.Run("Abort", func(t *testing.T) { + aggMS := *aggMS + engine := engine.NewBatch() + defer engine.Close() + txn := txn.Clone() + + txn.Status = roachpb.ABORTED // doesn't change m2ValSize, fortunately + if err := MVCCResolveWriteIntent(ctx, engine, &aggMS, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { + t.Fatal(err) + } + + expAggMS := enginepb.MVCCStats{ + LastUpdateNanos: 3E9, + KeyBytes: mKeySize + vKeySize, + KeyCount: 1, + ValBytes: vValSize, + ValCount: 1, + LiveCount: 1, + LiveBytes: mKeySize + vKeySize + vValSize, + IntentCount: 0, + IntentBytes: 0, + // The original value is visible again, so no GCBytesAge is present. Verifying this is the + // main point of this test (to prevent regression of a bug). + GCBytesAge: 0, + } + assertEq(t, engine, "after abort", &aggMS, &expAggMS) + }) + t.Run("Put", func(t *testing.T) { + aggMS := *aggMS + engine := engine.NewBatch() + defer engine.Close() + + value2 := roachpb.MakeValueFromString("longvalue") + vVal2Size := int64(len(value2.RawBytes)) + require.EqualValues(t, vVal2Size, 14) + + if err := MVCCPut(ctx, engine, &aggMS, key, ts3, value2, txn); err != nil { + t.Fatal(err) + } + + expAggMS := enginepb.MVCCStats{ + LastUpdateNanos: 3E9, + KeyBytes: mKeySize + 2*vKeySize, // 2+2*12 = 26 + KeyCount: 1, + ValBytes: m2ValSize + vValSize + vVal2Size, + ValCount: 2, + LiveCount: 1, + LiveBytes: mKeySize + m2ValSize + vKeySize + vVal2Size, + IntentCount: 1, + IntentBytes: vKeySize + vVal2Size, + // The original write was previously non-live at 2s because that's where the + // intent originally lived. But the intent has moved to 3s, and so has the + // moment in time at which the shadowed put became non-live; it's now 3s as + // well, so there's no contribution yet. + GCBytesAge: 0, + } + assertEq(t, engine, "after txn put", &aggMS, &expAggMS) + }) +} + // TestMVCCStatsDelDelGC prevents regression of a bug in MVCCGarbageCollect // that was exercised by running two deletions followed by a specific GC. func TestMVCCStatsDelDelGC(t *testing.T) { @@ -423,9 +907,94 @@ func TestMVCCStatsPutIntentTimestampNotPutTimestamp(t *testing.T) { assertEq(t, engine, "after second put", aggMS, &expAggMS) } +// TestMVCCStatsPutWaitDeleteGC puts a value, deletes it, and runs a GC that +// deletes the original write, but not the deletion tombstone. +func TestMVCCStatsPutWaitDeleteGC(t *testing.T) { + defer leaktest.AfterTest(t)() + engine := createTestEngine() + defer engine.Close() + + ctx := context.Background() + aggMS := &enginepb.MVCCStats{} + + assertEq(t, engine, "initially", aggMS, &enginepb.MVCCStats{}) + + key := roachpb.Key("a") + + ts1 := hlc.Timestamp{WallTime: 1E9} + ts2 := hlc.Timestamp{WallTime: 2E9} + + // Write a value at ts1. + value1 := roachpb.MakeValueFromString("value") + if err := MVCCPut(ctx, engine, aggMS, key, ts1, value1, nil /* txn */); err != nil { + t.Fatal(err) + } + + mKeySize := int64(mvccKey(key).EncodedSize()) + require.EqualValues(t, mKeySize, 2) + + vKeySize := mvccVersionTimestampSize + require.EqualValues(t, vKeySize, 12) + + vValSize := int64(len(value1.RawBytes)) + require.EqualValues(t, vValSize, 10) + + expMS := enginepb.MVCCStats{ + LastUpdateNanos: 1E9, + KeyCount: 1, + KeyBytes: mKeySize + vKeySize, // 2+12 = 14 + ValCount: 1, + ValBytes: vValSize, // 10 + LiveCount: 1, + LiveBytes: mKeySize + vKeySize + vValSize, // 2+12+10 = 24 + } + assertEq(t, engine, "after first put", aggMS, &expMS) + + // Delete the value at ts5. + + if err := MVCCDelete(ctx, engine, aggMS, key, ts2, nil /* txn */); err != nil { + t.Fatal(err) + } + + expMS = enginepb.MVCCStats{ + LastUpdateNanos: 2E9, + KeyCount: 1, + KeyBytes: mKeySize + 2*vKeySize, // 2+2*12 = 26 + ValBytes: vValSize, // 10 + ValCount: 2, + LiveBytes: 0, + LiveCount: 0, + GCBytesAge: 0, // before a fix, this was vKeySize + vValSize + } + + assertEq(t, engine, "after delete", aggMS, &expMS) + + if err := MVCCGarbageCollect(ctx, engine, aggMS, []roachpb.GCRequest_GCKey{{ + Key: key, + Timestamp: ts1, + }}, ts2); err != nil { + t.Fatal(err) + } + + expMS = enginepb.MVCCStats{ + LastUpdateNanos: 2E9, + KeyCount: 1, + KeyBytes: mKeySize + vKeySize, // 2+12 = 14 + ValBytes: 0, + ValCount: 1, + LiveBytes: 0, + LiveCount: 0, + GCBytesAge: 0, // before a fix, this was vKeySize + vValSize + } + + assertEq(t, engine, "after GC", aggMS, &expMS) +} + // TestMVCCStatsDocumentNegativeWrites documents that things go wrong when you // write at a negative timestamp. We shouldn't do that in practice and perhaps // we should have it error outright. +// +// See https://github.com/cockroachdb/cockroach/issues/21112. func TestMVCCStatsDocumentNegativeWrites(t *testing.T) { defer leaktest.AfterTest(t)() engine := createTestEngine() @@ -622,226 +1191,6 @@ func TestMVCCStatsSysPutPut(t *testing.T) { assertEq(t, engine, "after second put", aggMS, &expMS) } -// TestMVCCStatsBasic writes a value, then deletes it as an intent via a -// transaction, then resolves the intent, manually verifying the mvcc stats at -// each step. -func TestMVCCStatsBasic(t *testing.T) { - defer leaktest.AfterTest(t)() - engine := createTestEngine() - defer engine.Close() - - ms := &enginepb.MVCCStats{} - - assertEq(t, engine, "initially", ms, &enginepb.MVCCStats{}) - - // Verify size of mvccVersionTimestampSize. - ts := hlc.Timestamp{WallTime: 1 * 1E9} - key := roachpb.Key("a") - keySize := int64(mvccVersionKey(key, ts).EncodedSize() - mvccKey(key).EncodedSize()) - if keySize != mvccVersionTimestampSize { - t.Errorf("expected version timestamp size %d; got %d", mvccVersionTimestampSize, keySize) - } - - // Put a value. - value := roachpb.MakeValueFromString("value") - if err := MVCCPut(context.Background(), engine, ms, key, ts, value, nil); err != nil { - t.Fatal(err) - } - mKeySize := int64(mvccKey(key).EncodedSize()) - vKeySize := mvccVersionTimestampSize - vValSize := int64(len(value.RawBytes)) - - expMS := enginepb.MVCCStats{ - LiveBytes: mKeySize + vKeySize + vValSize, - LiveCount: 1, - KeyBytes: mKeySize + vKeySize, - KeyCount: 1, - ValBytes: vValSize, - ValCount: 1, - LastUpdateNanos: 1E9, - } - assertEq(t, engine, "after put", ms, &expMS) - if e, a := int64(0), ms.GCBytes(); e != a { - t.Fatalf("GCBytes: expected %d, got %d", e, a) - } - - // Delete the value using a transaction. - // TODO(tschottdorf): this case is interesting: we write at ts2, but the timestamp is ts1. - // Need to check whether that's reasonable, and if so, test it more. - txn := &roachpb.Transaction{TxnMeta: enginepb.TxnMeta{ID: uuid.MakeV4(), Timestamp: hlc.Timestamp{WallTime: 1 * 1E9}}} - ts2 := hlc.Timestamp{WallTime: 2 * 1E9} - if err := MVCCDelete(context.Background(), engine, ms, key, ts2, txn); err != nil { - t.Fatal(err) - } - m2ValSize := int64((&enginepb.MVCCMetadata{ - Timestamp: hlc.LegacyTimestamp(ts2), - Deleted: true, - Txn: &txn.TxnMeta, - }).Size()) - v2KeySize := mvccVersionTimestampSize - v2ValSize := int64(0) - - expMS2 := enginepb.MVCCStats{ - KeyBytes: mKeySize + vKeySize + v2KeySize, - KeyCount: 1, - ValBytes: m2ValSize + vValSize + v2ValSize, - ValCount: 2, - IntentBytes: v2KeySize + v2ValSize, - IntentCount: 1, - IntentAge: 0, - GCBytesAge: vValSize + vKeySize, // immediately recognizes GC'able bytes from old value at 1E9 - LastUpdateNanos: 2E9, - } - assertEq(t, engine, "after delete", ms, &expMS2) - // This is expMS2.KeyBytes + expMS2.ValBytes - expMS2.LiveBytes - expGC2 := mKeySize + vKeySize + v2KeySize + m2ValSize + vValSize + v2ValSize - if a := ms.GCBytes(); expGC2 != a { - t.Fatalf("GCBytes: expected %d, got %d", expGC2, a) - } - - // Resolve the deletion by aborting it. - txn.Status = roachpb.ABORTED - txn.Timestamp.Forward(ts2) - if err := MVCCResolveWriteIntent(context.Background(), engine, ms, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { - t.Fatal(err) - } - // Stats should equal same as before the deletion after aborting the intent. - expMS.LastUpdateNanos = 2E9 - assertEq(t, engine, "after abort", ms, &expMS) - - // Re-delete, but this time, we're going to commit it. - txn.Status = roachpb.PENDING - ts3 := hlc.Timestamp{WallTime: 3 * 1E9} - txn.Timestamp.Forward(ts3) - if err := MVCCDelete(context.Background(), engine, ms, key, ts3, txn); err != nil { - t.Fatal(err) - } - // GCBytesAge will now count the deleted value from ts=1E9 to ts=3E9. - expMS2.GCBytesAge = (vValSize + vKeySize) * 2 - expMS2.LastUpdateNanos = 3E9 - assertEq(t, engine, "after 2nd delete", ms, &expMS2) // should be same as before. - if a := ms.GCBytes(); expGC2 != a { - t.Fatalf("GCBytes: expected %d, got %d", expGC2, a) - } - - // Write a second transactional value (i.e. an intent). - ts4 := hlc.Timestamp{WallTime: 4 * 1E9} - txn.Timestamp = ts4 - key2 := roachpb.Key("b") - value2 := roachpb.MakeValueFromString("value") - if err := MVCCPut(context.Background(), engine, ms, key2, ts4, value2, txn); err != nil { - t.Fatal(err) - } - mKey2Size := int64(mvccKey(key2).EncodedSize()) - mVal2Size := int64((&enginepb.MVCCMetadata{ - Timestamp: hlc.LegacyTimestamp(ts4), - Txn: &txn.TxnMeta, - }).Size()) - vKey2Size := mvccVersionTimestampSize - vVal2Size := int64(len(value2.RawBytes)) - expMS3 := enginepb.MVCCStats{ - KeyBytes: mKeySize + vKeySize + v2KeySize + mKey2Size + vKey2Size, - KeyCount: 2, - ValBytes: m2ValSize + vValSize + v2ValSize + mVal2Size + vVal2Size, - ValCount: 3, - LiveBytes: mKey2Size + vKey2Size + mVal2Size + vVal2Size, - LiveCount: 1, - IntentBytes: v2KeySize + v2ValSize + vKey2Size + vVal2Size, - IntentCount: 2, - IntentAge: 1, - // It gets interesting: The first term is the contribution from the - // deletion of the first put (written at 1s, deleted at 3s). From 3s - // to 4s, on top of that we age the intent's meta entry plus deletion - // tombstone on top of that (expGC2). - GCBytesAge: (vValSize+vKeySize)*2 + expGC2, - LastUpdateNanos: 4E9, - } - - expGC3 := expGC2 // no change, didn't delete anything - assertEq(t, engine, "after 2nd put", ms, &expMS3) - if a := ms.GCBytes(); expGC3 != a { - t.Fatalf("GCBytes: expected %d, got %d", expGC3, a) - } - - // Now commit both values. - txn.Status = roachpb.COMMITTED - if err := MVCCResolveWriteIntent(context.Background(), engine, ms, roachpb.Intent{Span: roachpb.Span{Key: key}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { - t.Fatal(err) - } - expMS4 := enginepb.MVCCStats{ - KeyBytes: mKeySize + vKeySize + v2KeySize + mKey2Size + vKey2Size, - KeyCount: 2, - ValBytes: vValSize + v2ValSize + mVal2Size + vVal2Size, - ValCount: 3, - LiveBytes: mKey2Size + vKey2Size + mVal2Size + vVal2Size, - LiveCount: 1, - IntentBytes: vKey2Size + vVal2Size, - IntentCount: 1, - // The commit turned the explicit deletion intent meta back into an - // implicit one, so we see the originally written value which is now 3s - // old. There is no contribution from the deletion tombstone yet as it - // moved from 3s (inside the meta) to 4s (implicit meta), the current - // time. - GCBytesAge: (vValSize + vKeySize) * 3, - LastUpdateNanos: 4E9, - } - assertEq(t, engine, "after first commit", ms, &expMS4) - - // With commit of the deletion intent, what really happens is that the - // explicit meta (carrying the intent) becomes implicit (so its key - // gets counted in the same way by convention, but its value is now empty). - expGC4 := expGC3 - m2ValSize - if a := ms.GCBytes(); expGC4 != a { - t.Fatalf("GCBytes: expected %d, got %d", expGC4, a) - } - - if err := MVCCResolveWriteIntent(context.Background(), engine, ms, roachpb.Intent{Span: roachpb.Span{Key: key2}, Status: txn.Status, Txn: txn.TxnMeta}); err != nil { - t.Fatal(err) - } - expMS4 = enginepb.MVCCStats{ - KeyBytes: mKeySize + vKeySize + v2KeySize + mKey2Size + vKey2Size, - KeyCount: 2, - ValBytes: vValSize + v2ValSize + vVal2Size, - ValCount: 3, - LiveBytes: mKey2Size + vKey2Size + vVal2Size, - LiveCount: 1, - IntentAge: 0, - GCBytesAge: (vValSize + vKeySize) * 3, // unchanged; still at 4s - LastUpdateNanos: 4E9, - } - assertEq(t, engine, "after second commit", ms, &expMS4) - if a := ms.GCBytes(); expGC4 != a { // no change here - t.Fatalf("GCBytes: expected %d, got %d", expGC4, a) - } - - // Write over existing value to create GC'able bytes. - ts5 := hlc.Timestamp{WallTime: 10 * 1E9} // skip ahead 6s - if err := MVCCPut(context.Background(), engine, ms, key2, ts5, value2, nil); err != nil { - t.Fatal(err) - } - expMS5 := expMS4 - expMS5.KeyBytes += vKey2Size - expMS5.ValBytes += vVal2Size - expMS5.ValCount = 4 - // The age increases: 6 seconds for each key2 and key. - expMS5.GCBytesAge += (vKey2Size+vVal2Size)*6 + expGC4*6 - expMS5.LastUpdateNanos = 10E9 - assertEq(t, engine, "after overwrite", ms, &expMS5) - - // Write a transaction record which is a system-local key. - txnKey := keys.TransactionKey(txn.Key, txn.ID) - txnVal := roachpb.MakeValueFromString("txn-data") - if err := MVCCPut(context.Background(), engine, ms, txnKey, hlc.Timestamp{}, txnVal, nil); err != nil { - t.Fatal(err) - } - txnKeySize := int64(mvccKey(txnKey).EncodedSize()) - txnValSize := int64((&enginepb.MVCCMetadata{RawBytes: txnVal.RawBytes}).Size()) - expMS6 := expMS5 - expMS6.SysBytes += txnKeySize + txnValSize - expMS6.SysCount++ - assertEq(t, engine, "after sys-local key", ms, &expMS6) -} - var mvccStatsTests = []struct { name string fn func(Iterator, MVCCKey, MVCCKey, int64) (enginepb.MVCCStats, error) @@ -962,6 +1311,10 @@ func TestMVCCStatsRandomized(t *testing.T) { ctx := context.Background() + // NB: no failure type ever required count five or more. When there is a result + // found by this test, or any other MVCC code is changed, it's worth reducing + // this first to two, three, ... and running the test for a minute to get a + // good idea of minimally reproducing examples. const count = 200 actions := make(map[string]func(*state) string) @@ -1050,7 +1403,7 @@ func TestMVCCStatsRandomized(t *testing.T) { ); err != nil { return err.Error() } - return fmt.Sprintf("%s (count %d)", gcTS, count) + return fmt.Sprint(gcTS) } for _, test := range []struct { diff --git a/pkg/storage/gc_queue_test.go b/pkg/storage/gc_queue_test.go index 8a3e05e59892..fd3a248476dd 100644 --- a/pkg/storage/gc_queue_test.go +++ b/pkg/storage/gc_queue_test.go @@ -159,10 +159,10 @@ func newCachedWriteSimulator(t *testing.T) *cachedWriteSimulator { {enginepb.MVCCStats{LastUpdateNanos: 946684800000000000}, "1-1m0s-1.0 MiB"}: { first: [cacheFirstLen]enginepb.MVCCStats{ {ContainsEstimates: false, LastUpdateNanos: 946684800000000000, IntentAge: 0, GCBytesAge: 0, LiveBytes: 1048604, LiveCount: 1, KeyBytes: 23, KeyCount: 1, ValBytes: 1048581, ValCount: 1, IntentBytes: 0, IntentCount: 0, SysBytes: 0, SysCount: 0}, - {ContainsEstimates: false, LastUpdateNanos: 946684801000000000, IntentAge: 0, GCBytesAge: 1048593, LiveBytes: 1048604, LiveCount: 1, KeyBytes: 35, KeyCount: 1, ValBytes: 2097162, ValCount: 2, IntentBytes: 0, IntentCount: 0, SysBytes: 0, SysCount: 0}, - {ContainsEstimates: false, LastUpdateNanos: 946684802000000000, IntentAge: 0, GCBytesAge: 3145779, LiveBytes: 1048604, LiveCount: 1, KeyBytes: 47, KeyCount: 1, ValBytes: 3145743, ValCount: 3, IntentBytes: 0, IntentCount: 0, SysBytes: 0, SysCount: 0}, + {ContainsEstimates: false, LastUpdateNanos: 946684801000000000, IntentAge: 0, GCBytesAge: 0, LiveBytes: 1048604, LiveCount: 1, KeyBytes: 35, KeyCount: 1, ValBytes: 2097162, ValCount: 2, IntentBytes: 0, IntentCount: 0, SysBytes: 0, SysCount: 0}, + {ContainsEstimates: false, LastUpdateNanos: 946684802000000000, IntentAge: 0, GCBytesAge: 1048593, LiveBytes: 1048604, LiveCount: 1, KeyBytes: 47, KeyCount: 1, ValBytes: 3145743, ValCount: 3, IntentBytes: 0, IntentCount: 0, SysBytes: 0, SysCount: 0}, }, - last: enginepb.MVCCStats{ContainsEstimates: false, LastUpdateNanos: 946684860000000000, IntentAge: 0, GCBytesAge: 1918925190, LiveBytes: 1048604, LiveCount: 1, KeyBytes: 743, KeyCount: 1, ValBytes: 63963441, ValCount: 61, IntentBytes: 0, IntentCount: 0, SysBytes: 0, SysCount: 0}, + last: enginepb.MVCCStats{ContainsEstimates: false, LastUpdateNanos: 946684860000000000, IntentAge: 0, GCBytesAge: 1856009610, LiveBytes: 1048604, LiveCount: 1, KeyBytes: 743, KeyCount: 1, ValBytes: 63963441, ValCount: 61, IntentBytes: 0, IntentCount: 0, SysBytes: 0, SysCount: 0}, }, } return &cws @@ -286,22 +286,22 @@ func TestGCQueueMakeGCScoreRealistic(t *testing.T) { // // Since at the time of this check the data is already 30s old on // average (i.e. ~30x the TTL), we expect to *really* want GC. - cws.shouldQueue(true, 29.92, time.Duration(0), 0, ms) - cws.shouldQueue(true, 29.92, time.Duration(0), 0, ms) + cws.shouldQueue(true, 28.94, time.Duration(0), 0, ms) + cws.shouldQueue(true, 28.94, time.Duration(0), 0, ms) // Right after we finished writing, we don't want to GC yet with a one-minute TTL. - cws.shouldQueue(false, 0.50, time.Duration(0), minuteTTL, ms) + cws.shouldQueue(false, 0.48, time.Duration(0), minuteTTL, ms) // Neither after a minute. The first values are about to become GC'able, though. - cws.shouldQueue(false, 1.48, time.Minute, minuteTTL, ms) + cws.shouldQueue(false, 1.46, time.Minute, minuteTTL, ms) // 90 seconds in it's really close, but still just shy of GC. Half of the // values could be deleted now (remember that we wrote them over a one // minute period). - cws.shouldQueue(false, 1.97, 3*time.Minute/2, minuteTTL, ms) + cws.shouldQueue(false, 1.95, 3*time.Minute/2, minuteTTL, ms) // Advancing another 1/4 minute does the trick. - cws.shouldQueue(true, 2.22, 7*time.Minute/4, minuteTTL, ms) + cws.shouldQueue(true, 2.20, 7*time.Minute/4, minuteTTL, ms) // After an hour, that's (of course) still true with a very high priority. - cws.shouldQueue(true, 59.35, time.Hour, minuteTTL, ms) + cws.shouldQueue(true, 59.34, time.Hour, minuteTTL, ms) // Let's see what the same would look like with a 1h TTL. // Can't delete anything until 59min have passed, and indeed the score is low.