From d6b00e37ebb899b9c8ff5d35ee4bf7cf7f06d27b Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 9 Sep 2019 11:45:48 -0400 Subject: [PATCH] storage/engine: return WriteIntentError for intents in uncertainty intervals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes the most common failure case in all of the following Jepsen failures. I'm not closing them here though, because they also should be failing due to #36431. Would fix #37394. Would fix #37545. Would fix #37930. Would fix #37932. Would fix #37956. Would fix #38126. Would fix #38763. Before this fix, we were not considering intents in a scan's uncertainty interval to be uncertain. This had the potential to cause stale reads because an unresolved intent doesn't indicate that its transaction hasn’t been committed and isn’t a causal ancestor of the scan. This was causing the `jepsen/multi-register` tests to fail, which I had previously incorrectly attributed entirely to #36431. This commit fixes this by returning `WriteIntentError`s for intents when they are above the read timestamp of a scan but below the max timestamp of a scan. This could have also been fixed by returning `ReadWithinUncertaintyIntervalError`s in this situation. Both would eventually have the same effect, but it seems preferable to kick off concurrency control immediately in this situation and only fall back to uncertainty handling for committed values. If the intent ends up being aborted, this could allow the read to avoid moving its timestamp. This commit will need to be backported all the way back to v2.0. Release note (bug fix): Consider intents in a read's uncertainty interval to be uncertain just as if they were committed values. This removes the potential for stale reads when a causally dependent transaction runs into the not-yet resolved intents from a causal ancestor. --- c-deps/libroach/mvcc.h | 5 +- pkg/storage/engine/mvcc.go | 29 ++-- pkg/storage/engine/mvcc_test.go | 245 ++++++++++++++++++++++++++++---- 3 files changed, 243 insertions(+), 36 deletions(-) diff --git a/c-deps/libroach/mvcc.h b/c-deps/libroach/mvcc.h index f388c46f1e1e..714a5fd04756 100644 --- a/c-deps/libroach/mvcc.h +++ b/c-deps/libroach/mvcc.h @@ -231,7 +231,10 @@ template class mvccScanner { // values); this timestamp is prev_timestamp. const DBTimestamp prev_timestamp = timestamp_ < meta_timestamp ? timestamp_ : PrevTimestamp(meta_timestamp); - if (timestamp_ < meta_timestamp && !own_intent) { + // Intents for other transactions are visible at or below: + // max(txn.max_timestamp, read_timestamp) + const DBTimestamp max_visible_timestamp = check_uncertainty_ ? txn_max_timestamp_ : timestamp_; + if (max_visible_timestamp < meta_timestamp && !own_intent) { // 5. The key contains an intent, but we're reading before the // intent. Seek to the desired version. Note that if we own the // intent (i.e. we're reading transactionally) we want to read diff --git a/pkg/storage/engine/mvcc.go b/pkg/storage/engine/mvcc.go index 30b6070a0612..d2c60026dcbb 100644 --- a/pkg/storage/engine/mvcc.go +++ b/pkg/storage/engine/mvcc.go @@ -852,18 +852,29 @@ func mvccGetInternal( timestamp = metaTimestamp.Prev() } - ownIntent := IsIntentOf(*meta, txn) // false if txn == nil - if !timestamp.Less(metaTimestamp) && meta.Txn != nil && !ownIntent { - // Trying to read the last value, but it's another transaction's intent; - // the reader will have to act on this. - return nil, nil, safeValue, &roachpb.WriteIntentError{ - Intents: []roachpb.Intent{{Span: roachpb.Span{Key: metaKey.Key}, Status: roachpb.PENDING, Txn: *meta.Txn}}, + checkUncertainty := txn != nil && timestamp.Less(txn.MaxTimestamp) + isIntent := meta.Txn != nil + ownIntent := IsIntentOf(*meta, txn) // false if !isIntent + if isIntent && !ownIntent { + // Trying to read the last value, but it's another transaction's intent. + // The reader will have to act on this if the intent has a low enough + // timestamp. Intents for other transactions are visible at or below: + // max(txn.MaxTimestamp, timestamp) + maxVisibleTimestamp := timestamp + if checkUncertainty { + maxVisibleTimestamp = txn.MaxTimestamp + } + if !maxVisibleTimestamp.Less(metaTimestamp) { + return nil, nil, safeValue, &roachpb.WriteIntentError{ + Intents: []roachpb.Intent{{ + Span: roachpb.Span{Key: metaKey.Key}, Status: roachpb.PENDING, Txn: *meta.Txn, + }}, + } } } - var checkValueTimestamp bool seekKey := metaKey - + checkValueTimestamp := false if !timestamp.Less(metaTimestamp) || ownIntent { // We are reading the latest value, which is either an intent written // by this transaction or not an intent at all (so there's no @@ -892,7 +903,7 @@ func mvccGetInternal( seekKey.Timestamp = metaTimestamp.Prev() } } - } else if txn != nil && timestamp.Less(txn.MaxTimestamp) { + } else if checkUncertainty { // In this branch, the latest timestamp is ahead, and so the read of an // "old" value in a transactional context at time (timestamp, MaxTimestamp] // occurs, leading to a clock uncertainty error if a version exists in diff --git a/pkg/storage/engine/mvcc_test.go b/pkg/storage/engine/mvcc_test.go index 20ff1a9ab829..78d6ffa848bf 100644 --- a/pkg/storage/engine/mvcc_test.go +++ b/pkg/storage/engine/mvcc_test.go @@ -719,73 +719,266 @@ func TestMVCCGetUncertainty(t *testing.T) { t.Run(impl.name, func(t *testing.T) { mvccGet := impl.fn + ctx := context.Background() engine := createTestEngine() defer engine.Close() - txn := &roachpb.Transaction{TxnMeta: enginepb.TxnMeta{ID: uuid.MakeV4(), Timestamp: hlc.Timestamp{WallTime: 5}}, MaxTimestamp: hlc.Timestamp{WallTime: 10}} - // Put a value from the past. - if err := MVCCPut(context.Background(), engine, nil, testKey1, hlc.Timestamp{WallTime: 1}, value1, nil); err != nil { + // Txn with read timestamp 7 and MaxTimestamp 10. + txn := &roachpb.Transaction{ + TxnMeta: enginepb.TxnMeta{ + ID: uuid.MakeV4(), + Timestamp: hlc.Timestamp{WallTime: 7}, + }, + MaxTimestamp: hlc.Timestamp{WallTime: 10}, + } + + // Same txn but with a MaxTimestamp reduced to 9. + txnMaxTS9 := txn.Clone() + txnMaxTS9.MaxTimestamp = hlc.Timestamp{WallTime: 9} + + // Same txn but with a MaxTimestamp reduced to 7. + txnMaxTS7 := txn.Clone() + txnMaxTS7.MaxTimestamp = hlc.Timestamp{WallTime: 7} + + // Case 1: One value in the past, one value in the future of read + // and ahead of MaxTimestamp of read. Neither should interfere. + // + // ----------------- + // - 12: val2 + // | + // - 10: max timestamp + // | + // - 7: read timestamp + // | + // - 1: val1 + // ----------------- + if err := MVCCPut(ctx, engine, nil, testKey1, hlc.Timestamp{WallTime: 1}, value1, nil); err != nil { t.Fatal(err) } - // Put a value that is ahead of MaxTimestamp, it should not interfere. - if err := MVCCPut(context.Background(), engine, nil, testKey1, hlc.Timestamp{WallTime: 12}, value2, nil); err != nil { + if err := MVCCPut(ctx, engine, nil, testKey1, hlc.Timestamp{WallTime: 12}, value2, nil); err != nil { t.Fatal(err) } // Read with transaction, should get a value back. - val, _, err := mvccGet(context.Background(), engine, testKey1, hlc.Timestamp{WallTime: 7}, true, txn) - if err != nil { + if val, _, err := mvccGet(ctx, engine, testKey1, hlc.Timestamp{WallTime: 7}, true, txn); err != nil { t.Fatal(err) + } else if val == nil || !bytes.Equal(val.RawBytes, value1.RawBytes) { + t.Fatalf("wanted %q, got %v", value1.RawBytes, val) } - if val == nil || !bytes.Equal(val.RawBytes, value1.RawBytes) { + if kvs, _, _, err := MVCCScan( + ctx, engine, testKey1, testKey1.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, txn, + ); err != nil { + t.Fatal(err) + } else if len(kvs) != 1 { + t.Fatalf("wanted 1 kv, got %d", len(kvs)) + } else if val := kvs[0].Value; !bytes.Equal(val.RawBytes, value1.RawBytes) { t.Fatalf("wanted %q, got %v", value1.RawBytes, val) } - // Now using testKey2. - // Put a value that conflicts with MaxTimestamp. - if err := MVCCPut(context.Background(), engine, nil, testKey2, hlc.Timestamp{WallTime: 9}, value2, nil); err != nil { + // Case 2a: One value in the future of read but below MaxTimestamp + // of read. Should result in a ReadWithinUncertaintyIntervalError + // when reading. + // + // ----------------- + // - 10: max timestamp + // - 9: val2 + // | + // - 7: read timestamp + // ----------------- + if err := MVCCPut(ctx, engine, nil, testKey2, hlc.Timestamp{WallTime: 9}, value2, nil); err != nil { t.Fatal(err) } // Read with transaction, should get error back. - if _, _, err := mvccGet(context.Background(), engine, testKey2, hlc.Timestamp{WallTime: 7}, true, txn); err == nil { + if _, _, err := mvccGet(ctx, engine, testKey2, hlc.Timestamp{WallTime: 7}, true, txn); err == nil { t.Fatal("wanted an error") } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) } - if _, _, _, err := MVCCScan(context.Background(), engine, testKey2, testKey2.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, txn); err == nil { + if _, _, _, err := MVCCScan( + ctx, engine, testKey2, testKey2.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, txn, + ); err == nil { t.Fatal("wanted an error") } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) } - // Adjust MaxTimestamp and retry. - txn.MaxTimestamp = hlc.Timestamp{WallTime: 7} - if _, _, err := mvccGet(context.Background(), engine, testKey2, hlc.Timestamp{WallTime: 7}, true, txn); err != nil { + // Case 2b: Reduce MaxTimestamp to exactly that of value in future. + // Should result in a ReadWithinUncertaintyIntervalError when + // reading. + // + // ----------------- + // - 9: val2 & max timestamp + // | + // - 7: read timestamp + // ----------------- + if _, _, err := mvccGet(ctx, engine, testKey2, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS9); err == nil { + t.Fatal("wanted an error") + } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { + t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) + } + if _, _, _, err := MVCCScan( + ctx, engine, testKey2, testKey2.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS9, + ); err == nil { + t.Fatal("wanted an error") + } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { + t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) + } + // Case 2c: Reduce MaxTimestamp below value in future. Value should + // no longer interfere when reading. + // + // ----------------- + // - 9: val2 + // | + // - 7: read timestamp & max timestamp + // ----------------- + if _, _, err := mvccGet(ctx, engine, testKey2, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS7); err != nil { t.Fatal(err) } - if _, _, _, err := MVCCScan(context.Background(), engine, testKey2, testKey2.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, txn); err != nil { + if _, _, _, err := MVCCScan( + ctx, engine, testKey2, testKey2.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS7, + ); err != nil { t.Fatal(err) } - txn.MaxTimestamp = hlc.Timestamp{WallTime: 10} - // Now using testKey3. - // Put a value that conflicts with MaxTimestamp and another write further - // ahead and not conflicting any longer. The first write should still ruin - // it. - if err := MVCCPut(context.Background(), engine, nil, testKey3, hlc.Timestamp{WallTime: 9}, value2, nil); err != nil { + // Case 3a: One intent in the future of read but below MaxTimestamp + // of read. Should result in a WriteIntentError when reading. + // + // ----------------- + // - 10: max timestamp + // - 9: val2 (intent) + // | + // - 7: read timestamp + // ----------------- + intentTxn := &roachpb.Transaction{ + TxnMeta: enginepb.TxnMeta{ + ID: uuid.MakeV4(), + Timestamp: hlc.Timestamp{WallTime: 9}, + }, + OrigTimestamp: hlc.Timestamp{WallTime: 9}, + } + if err := MVCCPut(ctx, engine, nil, testKey3, hlc.Timestamp{WallTime: 9}, value2, intentTxn); err != nil { t.Fatal(err) } - if err := MVCCPut(context.Background(), engine, nil, testKey3, hlc.Timestamp{WallTime: 99}, value2, nil); err != nil { + // Read with transaction, should get error back. + if _, _, err := mvccGet(ctx, engine, testKey3, hlc.Timestamp{WallTime: 7}, true, txn); err == nil { + t.Fatal("wanted an error") + } else if _, ok := err.(*roachpb.WriteIntentError); !ok { + t.Fatalf("wanted a WriteIntentError, got %+v", err) + } + if _, _, _, err := MVCCScan( + ctx, engine, testKey3, testKey3.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, txn, + ); err == nil { + t.Fatal("wanted an error") + } else if _, ok := err.(*roachpb.WriteIntentError); !ok { + t.Fatalf("wanted a WriteIntentError, got %+v", err) + } + // Case 3b: Reduce MaxTimestamp to exactly that of intent in future. + // Should result in a WriteIntentError when reading. + // + // ----------------- + // - 9: val2 (intent) & max timestamp + // | + // - 7: read timestamp + // ----------------- + if _, _, err := mvccGet(ctx, engine, testKey3, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS9); err == nil { + t.Fatal("wanted an error") + } else if _, ok := err.(*roachpb.WriteIntentError); !ok { + t.Fatalf("wanted a WriteIntentError, got %+v", err) + } + if _, _, _, err := MVCCScan( + ctx, engine, testKey3, testKey3.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS9, + ); err == nil { + t.Fatal("wanted an error") + } else if _, ok := err.(*roachpb.WriteIntentError); !ok { + t.Fatalf("wanted a WriteIntentError, got %+v", err) + } + // Case 3c: Reduce MaxTimestamp below intent in future. Intent should + // no longer interfere when reading. + // + // ----------------- + // - 9: val2 (intent) + // | + // - 7: read timestamp & max timestamp + // ----------------- + if _, _, err := mvccGet(ctx, engine, testKey3, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS7); err != nil { + t.Fatal(err) + } + if _, _, _, err := MVCCScan( + ctx, engine, testKey3, testKey3.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS7, + ); err != nil { + t.Fatal(err) + } + + // Case 4a: Two values in future of read. One is ahead of + // MaxTimestamp of read and one is below MaxTimestamp of read. The + // value within the read's uncertainty interval should result in a + // ReadWithinUncertaintyIntervalError when reading. + // + // ----------------- + // - 99: val3 + // | + // - 10: max timestamp + // - 9: val2 + // | + // - 7: read timestamp + // ----------------- + if err := MVCCPut(ctx, engine, nil, testKey4, hlc.Timestamp{WallTime: 9}, value2, nil); err != nil { + t.Fatal(err) + } + if err := MVCCPut(ctx, engine, nil, testKey4, hlc.Timestamp{WallTime: 99}, value3, nil); err != nil { t.Fatal(err) } - if _, _, _, err := MVCCScan(context.Background(), engine, testKey3, testKey3.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, txn); err == nil { + if _, _, err := mvccGet(ctx, engine, testKey4, hlc.Timestamp{WallTime: 7}, true, txn); err == nil { + t.Fatalf("wanted an error") + } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { + t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) + } + if _, _, _, err := MVCCScan( + ctx, engine, testKey4, testKey4.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, txn, + ); err == nil { t.Fatal("wanted an error") } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) } - if _, _, err := mvccGet(context.Background(), engine, testKey3, hlc.Timestamp{WallTime: 7}, true, txn); err == nil { + // Case 4b: Reduce MaxTimestamp to exactly that of second value in + // future. The value within the read's uncertainty interval should + // result in a ReadWithinUncertaintyIntervalError when reading. + // + // ----------------- + // - 99: val3 + // | + // - 9: val2 & max timestamp + // | + // - 7: read timestamp + // ----------------- + if _, _, err := mvccGet(ctx, engine, testKey4, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS9); err == nil { t.Fatalf("wanted an error") } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) } + if _, _, _, err := MVCCScan( + ctx, engine, testKey4, testKey4.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS9, + ); err == nil { + t.Fatal("wanted an error") + } else if _, ok := err.(*roachpb.ReadWithinUncertaintyIntervalError); !ok { + t.Fatalf("wanted a ReadWithinUncertaintyIntervalError, got %+v", err) + } + // Case 4c: Reduce MaxTimestamp below second value in future. Value should + // no longer interfere when reading. + // + // ----------------- + // - 99: val3 + // | + // - 9: val2 + // | + // - 7: read timestamp & max timestamp + // ----------------- + if _, _, err := mvccGet(ctx, engine, testKey4, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS7); err != nil { + t.Fatal(err) + } + if _, _, _, err := MVCCScan( + ctx, engine, testKey4, testKey4.PrefixEnd(), 10, hlc.Timestamp{WallTime: 7}, true, &txnMaxTS7, + ); err != nil { + t.Fatal(err) + } }) } }