From af351fa5dd4ded4ec185c97476d6e2353bf7cc58 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 9 Sep 2019 11:45:48 -0400 Subject: [PATCH] storage/engine: return WriteIntentError for intents in uncertainty intervals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes the most common failure case in all of the following Jepsen failures. I'm not closing them here though, because they also should be failing due to #36431. Would fix #37394. Would fix #37545. Would fix #37930. Would fix #37932. Would fix #37956. Would fix #38126. Would fix #38763. Before this fix, we were not considering intents in a scan's uncertainty interval to be uncertain. This had the potential to cause stale reads because an unresolved intent doesn't indicate that its transaction hasn’t been committed and isn’t a causal ancestor of the scan. This was causing the `jepsen/multi-register` tests to fail, which I had previously incorrectly attributed entirely to #36431. This commit fixes this by returning `WriteIntentError`s for intents when they are above the read timestamp of a scan but below the max timestamp of a scan. This could have also been fixed by returning `ReadWithinUncertaintyIntervalError`s in this situation. Both would eventually have the same effect, but it seems preferable to kick off concurrency control immediately in this situation and only fall back to uncertainty handling for committed values. If the intent ends up being aborted, this could allow the read to avoid moving its timestamp. This commit will need to be backported all the way back to v2.0. Release note (bug fix): Consider intents in a read's uncertainty interval to be uncertain just as if they were committed values. This removes the potential for stale reads when a causally dependent transaction runs into the not-yet resolved intents from a causal ancestor. --- c-deps/libroach/mvcc.h | 7 +- pkg/storage/engine/mvcc.go | 32 ++++-- pkg/storage/engine/mvcc_test.go | 174 ++++++++++++++++++++++++++------ 3 files changed, 171 insertions(+), 42 deletions(-) diff --git a/c-deps/libroach/mvcc.h b/c-deps/libroach/mvcc.h index 57043b0e7df4..46f5ea9b4f1d 100644 --- a/c-deps/libroach/mvcc.h +++ b/c-deps/libroach/mvcc.h @@ -261,7 +261,12 @@ template class mvccScanner { // values); this timestamp is prev_timestamp. const DBTimestamp prev_timestamp = timestamp_ < meta_timestamp ? timestamp_ : PrevTimestamp(meta_timestamp); - if (timestamp_ < meta_timestamp && !own_intent) { + // If we don't consider the txn's uncertainty interval then intents are only + // visible to it at or below the txn's read timestamp. However, if the txn's + // read timestamp is less than the max timestamp seen by the txn then intents + // are visible to the txn all the way up to its max timestamp. + const DBTimestamp max_visible_timestamp = check_uncertainty_ ? txn_max_timestamp_ : timestamp_; + if (max_visible_timestamp < meta_timestamp && !own_intent) { // 5. The key contains an intent, but we're reading before the // intent. Seek to the desired version. Note that if we own the // intent (i.e. we're reading transactionally) we want to read diff --git a/pkg/storage/engine/mvcc.go b/pkg/storage/engine/mvcc.go index 0b026228f025..d822cbd1b456 100644 --- a/pkg/storage/engine/mvcc.go +++ b/pkg/storage/engine/mvcc.go @@ -867,18 +867,34 @@ func mvccGetInternal( timestamp = metaTimestamp.Prev() } - ownIntent := IsIntentOf(meta, txn) // false if txn == nil - if !timestamp.Less(metaTimestamp) && meta.Txn != nil && !ownIntent { + checkUncertainty := txn != nil && timestamp.Less(txn.MaxTimestamp) + isIntent := meta.Txn != nil + ownIntent := IsIntentOf(meta, txn) // false if !isIntent + if isIntent && !ownIntent { // Trying to read the last value, but it's another transaction's intent; - // the reader will have to act on this. - return nil, nil, safeValue, &roachpb.WriteIntentError{ - Intents: []roachpb.Intent{{Span: roachpb.Span{Key: metaKey.Key}, Status: roachpb.PENDING, Txn: *meta.Txn}}, + // the reader will have to act on this if the intent has a low enough + // timestamp. + // + // If we don't consider the reading transaction's uncertainty interval + // then intents are only visible to it at or below the read timestamp. + // However, if the reader transaction's read timestamp is less than the + // max timestamp seen by the transaction then intents are visible to the + // transaction all the way up to its max timestamp. + maxVisibleTimestamp := timestamp + if checkUncertainty { + maxVisibleTimestamp.Forward(txn.MaxTimestamp) + } + if !maxVisibleTimestamp.Less(metaTimestamp) { + return nil, nil, safeValue, &roachpb.WriteIntentError{ + Intents: []roachpb.Intent{{ + Span: roachpb.Span{Key: metaKey.Key}, Status: roachpb.PENDING, Txn: *meta.Txn, + }}, + } } } - var checkValueTimestamp bool seekKey := metaKey - + checkValueTimestamp := false if !timestamp.Less(metaTimestamp) || ownIntent { // We are reading the latest value, which is either an intent written // by this transaction or not an intent at all (so there's no @@ -907,7 +923,7 @@ func mvccGetInternal( seekKey.Timestamp = metaTimestamp.Prev() } } - } else if txn != nil && timestamp.Less(txn.MaxTimestamp) { + } else if checkUncertainty { // In this branch, the latest timestamp is ahead, and so the read of an // "old" value in a transactional context at time (timestamp, MaxTimestamp] // occurs, leading to a clock uncertainty error if a version exists in diff --git a/pkg/storage/engine/mvcc_test.go b/pkg/storage/engine/mvcc_test.go index 12382da28628..a7b8e51f64da 100644 --- a/pkg/storage/engine/mvcc_test.go +++ b/pkg/storage/engine/mvcc_test.go @@ -802,90 +802,198 @@ func TestMVCCGetUncertainty(t *testing.T) { engine := createTestEngine() defer engine.Close() + // Txn with read timestamp 7 and MaxTimestamp 10. txn := &roachpb.Transaction{ TxnMeta: enginepb.TxnMeta{ ID: uuid.MakeV4(), - Timestamp: hlc.Timestamp{WallTime: 5}, + Timestamp: hlc.Timestamp{WallTime: 7}, }, MaxTimestamp: hlc.Timestamp{WallTime: 10}, } - // Put a value from the past. + getOptsTxn := MVCCGetOptions{Txn: txn} + scanOptsTxn := MVCCScanOptions{Txn: txn} + + // Same txn but with a lower MaxTimestamp at 7. + txnWithLowerMaxTS := txn.Clone() + txnWithLowerMaxTS.MaxTimestamp = hlc.Timestamp{WallTime: 7} + getOptsTxnWithLowerMaxTS := MVCCGetOptions{Txn: txnWithLowerMaxTS} + scanOptsTxnWithLowerMaxTS := MVCCScanOptions{Txn: txnWithLowerMaxTS} + + // Case 1: One value in the past, one value in the future of read + // and ahead of MaxTimestamp of read. Neither should interfere. + // + // ----------------- + // - 12: val2 + // | + // - 10: max timestamp + // | + // - 7: read timestamp + // | + // - 1: val1 + // ----------------- if err := MVCCPut(ctx, engine, nil, testKey1, hlc.Timestamp{WallTime: 1}, value1, nil); err != nil { t.Fatal(err) } - // Put a value that is ahead of MaxTimestamp, it should not interfere. if err := MVCCPut(ctx, engine, nil, testKey1, hlc.Timestamp{WallTime: 12}, value2, nil); err != nil { t.Fatal(err) } // Read with transaction, should get a value back. - val, _, err := mvccGet(ctx, engine, testKey1, hlc.Timestamp{WallTime: 7}, MVCCGetOptions{ - Txn: txn, - }) - if err != nil { + if val, _, err := mvccGet(ctx, engine, testKey1, hlc.Timestamp{WallTime: 7}, getOptsTxn); err != nil { t.Fatal(err) + } else if val == nil || !bytes.Equal(val.RawBytes, value1.RawBytes) { + t.Fatalf("wanted %q, got %v", value1.RawBytes, val) } - if val == nil || !bytes.Equal(val.RawBytes, value1.RawBytes) { + if kvs, _, _, err := MVCCScan( + ctx, engine, testKey1, testKey1.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, scanOptsTxn, + ); err != nil { + t.Fatal(err) + } else if len(kvs) != 1 { + t.Fatalf("wanted 1 kv, got %d", len(kvs)) + } else if val := kvs[0].Value; !bytes.Equal(val.RawBytes, value1.RawBytes) { t.Fatalf("wanted %q, got %v", value1.RawBytes, val) } - // Now using testKey2. - // Put a value that conflicts with MaxTimestamp. + // Case 2a: One value in the future of read but below MaxTimestamp + // of read. Should result in a ReadWithinUncertaintyIntervalError + // when reading. + // + // ----------------- + // - 10: max timestamp + // - 9: val2 + // | + // - 7: read timestamp + // ----------------- if err := MVCCPut(ctx, engine, nil, testKey2, hlc.Timestamp{WallTime: 9}, value2, nil); err != nil { t.Fatal(err) } // Read with transaction, should get error back. - if _, _, err := mvccGet(ctx, engine, testKey2, hlc.Timestamp{WallTime: 7}, MVCCGetOptions{ - Txn: txn, - }); err == nil { + if _, _, err := mvccGet(ctx, engine, testKey2, hlc.Timestamp{WallTime: 7}, getOptsTxn); err == nil { t.Fatal("wanted an error") } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) } if _, _, _, err := MVCCScan( - ctx, engine, testKey2, testKey2.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, MVCCScanOptions{Txn: txn}, + ctx, engine, testKey2, testKey2.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, scanOptsTxn, ); err == nil { t.Fatal("wanted an error") } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) } - // Adjust MaxTimestamp and retry. - txn.MaxTimestamp = hlc.Timestamp{WallTime: 7} - if _, _, err := mvccGet(ctx, engine, testKey2, hlc.Timestamp{WallTime: 7}, MVCCGetOptions{ - Txn: txn, - }); err != nil { + // Case 2b: Reduce MaxTimestamp below value in future. Value should + // no longer interfere when reading. + // + // ----------------- + // - 9: val2 + // | + // - 7: read/max timestamp + // ----------------- + if _, _, err := mvccGet(ctx, engine, testKey2, hlc.Timestamp{WallTime: 7}, getOptsTxnWithLowerMaxTS); err != nil { t.Fatal(err) } if _, _, _, err := MVCCScan( - ctx, engine, testKey2, testKey2.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, MVCCScanOptions{Txn: txn}, + ctx, engine, testKey2, testKey2.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, scanOptsTxnWithLowerMaxTS, ); err != nil { t.Fatal(err) } - txn.MaxTimestamp = hlc.Timestamp{WallTime: 10} - // Now using testKey3. - // Put a value that conflicts with MaxTimestamp and another write further - // ahead and not conflicting any longer. The first write should still ruin - // it. - if err := MVCCPut(ctx, engine, nil, testKey3, hlc.Timestamp{WallTime: 9}, value2, nil); err != nil { - t.Fatal(err) + // Case 3a: One intent in the future of read but below MaxTimestamp + // of read. Should result in a WriteIntentError when reading. + // + // ----------------- + // - 10: max timestamp + // - 9: val2 (intent) + // | + // - 7: read timestamp + // ----------------- + txn2 := &roachpb.Transaction{ + TxnMeta: enginepb.TxnMeta{ + ID: uuid.MakeV4(), + Timestamp: hlc.Timestamp{WallTime: 9}, + }, + OrigTimestamp: hlc.Timestamp{WallTime: 9}, } - if err := MVCCPut(ctx, engine, nil, testKey3, hlc.Timestamp{WallTime: 99}, value2, nil); err != nil { + if err := MVCCPut(ctx, engine, nil, testKey3, hlc.Timestamp{WallTime: 9}, value2, txn2); err != nil { t.Fatal(err) } + // Read with transaction, should get error back. + if _, _, err := mvccGet(ctx, engine, testKey3, hlc.Timestamp{WallTime: 7}, getOptsTxn); err == nil { + t.Fatal("wanted an error") + } else if _, ok := err.(*roachpb.WriteIntentError); !ok { + t.Fatalf("wanted a WriteIntentError, got %+v", err) + } if _, _, _, err := MVCCScan( - ctx, engine, testKey3, testKey3.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, MVCCScanOptions{Txn: txn}, + ctx, engine, testKey3, testKey3.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, scanOptsTxn, ); err == nil { t.Fatal("wanted an error") + } else if _, ok := err.(*roachpb.WriteIntentError); !ok { + t.Fatalf("wanted a WriteIntentError, got %+v", err) + } + // Case 3b: Reduce MaxTimestamp below intent in future. Intent should + // no longer interfere when reading. + // + // ----------------- + // - 9: val2 (intent) + // | + // - 7: read/max timestamp + // ----------------- + if _, _, err := mvccGet(ctx, engine, testKey3, hlc.Timestamp{WallTime: 7}, getOptsTxnWithLowerMaxTS); err != nil { + t.Fatal(err) + } + if _, _, _, err := MVCCScan( + ctx, engine, testKey3, testKey3.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, scanOptsTxnWithLowerMaxTS, + ); err != nil { + t.Fatal(err) + } + + // Case 4a: Two values in future of read. One is ahead of + // MaxTimestamp of read and one is below MaxTimestamp of read. The + // value within the read's uncertainty interval should result in a + // ReadWithinUncertaintyIntervalError when reading. + // + // ----------------- + // - 99: val3 + // | + // - 10: max timestamp + // - 9: val2 + // | + // - 7: read timestamp + // ----------------- + if err := MVCCPut(ctx, engine, nil, testKey4, hlc.Timestamp{WallTime: 9}, value2, nil); err != nil { + t.Fatal(err) + } + if err := MVCCPut(ctx, engine, nil, testKey4, hlc.Timestamp{WallTime: 99}, value3, nil); err != nil { + t.Fatal(err) + } + if _, _, err := mvccGet(ctx, engine, testKey4, hlc.Timestamp{WallTime: 7}, getOptsTxn); err == nil { + t.Fatalf("wanted an error") } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) } - if _, _, err := mvccGet(ctx, engine, testKey3, hlc.Timestamp{WallTime: 7}, MVCCGetOptions{ - Txn: txn, - }); err == nil { - t.Fatalf("wanted an error") + if _, _, _, err := MVCCScan( + ctx, engine, testKey4, testKey4.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, scanOptsTxn, + ); err == nil { + t.Fatal("wanted an error") } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) } + // Case 4b: Reduce MaxTimestamp below second value in future. Value should + // no longer interfere when reading. + // + // ----------------- + // - 99: val3 + // | + // - 9: val2 + // | + // - 7: read/max timestamp + // ----------------- + if _, _, err := mvccGet(ctx, engine, testKey4, hlc.Timestamp{WallTime: 7}, getOptsTxnWithLowerMaxTS); err != nil { + t.Fatal(err) + } + if _, _, _, err := MVCCScan( + ctx, engine, testKey4, testKey4.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, scanOptsTxnWithLowerMaxTS, + ); err != nil { + t.Fatal(err) + } }) } }