diff --git a/c-deps/libroach/db.cc b/c-deps/libroach/db.cc index 4d98b6c5bd9e..0b1340a09ff8 100644 --- a/c-deps/libroach/db.cc +++ b/c-deps/libroach/db.cc @@ -751,6 +751,14 @@ DBStatus DBSstFileWriterAdd(DBSstFileWriter* fw, DBKey key, DBSlice val) { return kSuccess; } +DBStatus DBSstFileWriterDelete(DBSstFileWriter* fw, DBKey key) { + rocksdb::Status status = fw->rep.Delete(EncodeKey(key)); + if (!status.ok()) { + return ToDBStatus(status); + } + return kSuccess; +} + DBStatus DBSstFileWriterFinish(DBSstFileWriter* fw, DBString* data) { rocksdb::Status status = fw->rep.Finish(); if (!status.ok()) { diff --git a/c-deps/libroach/include/libroach.h b/c-deps/libroach/include/libroach.h index 4e26d939c93c..cbc396984b24 100644 --- a/c-deps/libroach/include/libroach.h +++ b/c-deps/libroach/include/libroach.h @@ -408,6 +408,9 @@ DBStatus DBSstFileWriterOpen(DBSstFileWriter* fw); // cannot have been called. DBStatus DBSstFileWriterAdd(DBSstFileWriter* fw, DBKey key, DBSlice val); +// Adds a deletion tombstone to the sstable being built. See DBSstFileWriterAdd for more. +DBStatus DBSstFileWriterDelete(DBSstFileWriter* fw, DBKey key); + // Finalizes the writer and stores the constructed file's contents in *data. At // least one kv entry must have been added. May only be called once. DBStatus DBSstFileWriterFinish(DBSstFileWriter* fw, DBString* data); diff --git a/c-deps/libroach/mvcc.h b/c-deps/libroach/mvcc.h index c615d393e286..eac4cd114f4a 100644 --- a/c-deps/libroach/mvcc.h +++ b/c-deps/libroach/mvcc.h @@ -204,6 +204,10 @@ template class mvccScanner { return seekVersion(timestamp_, false); } + if (cur_value_.size() == 0) { + return setStatus(FmtStatus("zero-length mvcc metadata")); + } + if (!meta_.ParseFromArray(cur_value_.data(), cur_value_.size())) { return setStatus(FmtStatus("unable to decode MVCCMetadata")); } diff --git a/pkg/ccl/storageccl/engineccl/mvcc_test.go b/pkg/ccl/storageccl/engineccl/mvcc_test.go index 123b706a353a..564ea348e256 100644 --- a/pkg/ccl/storageccl/engineccl/mvcc_test.go +++ b/pkg/ccl/storageccl/engineccl/mvcc_test.go @@ -388,7 +388,7 @@ func TestMVCCIncrementalIteratorIntentStraddlesSStables(t *testing.T) { ingest := func(it engine.Iterator, count int) { sst, err := engine.MakeRocksDBSstFileWriter() if err != nil { - t.Fatal(sst) + t.Fatal(err) } defer sst.Close() diff --git a/pkg/storage/batcheval/cmd_end_transaction.go b/pkg/storage/batcheval/cmd_end_transaction.go index de9aaa3c6c3c..61ae6b1b4d99 100644 --- a/pkg/storage/batcheval/cmd_end_transaction.go +++ b/pkg/storage/batcheval/cmd_end_transaction.go @@ -438,11 +438,8 @@ func resolveLocalIntents( desc = &mergeTrigger.LeftDesc } - min, max := txn.InclusiveTimeBounds() iter := batch.NewIterator(engine.IterOptions{ - MinTimestampHint: min, - MaxTimestampHint: max, - UpperBound: desc.EndKey.AsRawKey(), + UpperBound: desc.EndKey.AsRawKey(), }) iterAndBuf := engine.GetBufUsingIter(iter) defer iterAndBuf.Cleanup() diff --git a/pkg/storage/batcheval/cmd_refresh_range.go b/pkg/storage/batcheval/cmd_refresh_range.go index c458f356248f..932e1790aca6 100644 --- a/pkg/storage/batcheval/cmd_refresh_range.go +++ b/pkg/storage/batcheval/cmd_refresh_range.go @@ -42,12 +42,8 @@ func RefreshRange( return result.Result{}, errors.Errorf("no transaction specified to %s", args.Method()) } - // Use a time-bounded iterator to avoid unnecessarily iterating over - // older data. iter := batch.NewIterator(engine.IterOptions{ - MinTimestampHint: h.Txn.OrigTimestamp, - MaxTimestampHint: h.Txn.Timestamp, - UpperBound: args.EndKey, + UpperBound: args.EndKey, }) defer iter.Close() // Iterate over values until we discover any value written at or @@ -87,33 +83,6 @@ func RefreshRange( if i.Txn.ID == h.Txn.ID { continue } - - if i.Span.EndKey != nil { - return result.Result{}, errors.Errorf("unexpected range intent from MVCC storage") - } - - // HACK(bdarnell): Time-bound iterators can return intents that - // shouldn't be there - // (https://github.com/cockroachdb/cockroach/issues/28358), and - // this can result in stalled traffic when it occurs in this - // method (https://github.com/cockroachdb/cockroach/issues/31823). - // When we get an intent, check with a regular iterator to ensure - // that it's really there. - _, realIntents, err := engine.MVCCGetWithTombstone( - ctx, - batch, - i.Span.Key, - h.Txn.Timestamp, - false, /* consistent */ - nil, /* txn */ - ) - if err != nil { - return result.Result{}, err - } - if len(realIntents) == 0 { - continue - } - // Return an error if an intent was written to the span. return result.Result{}, errors.Errorf("encountered recently written intent %s @%s", i.Span.Key, i.Txn.Timestamp) diff --git a/pkg/storage/batcheval/cmd_refresh_range_test.go b/pkg/storage/batcheval/cmd_refresh_range_test.go new file mode 100644 index 000000000000..444448a7eea1 --- /dev/null +++ b/pkg/storage/batcheval/cmd_refresh_range_test.go @@ -0,0 +1,200 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. See the AUTHORS file +// for names of contributors. + +package batcheval + +import ( + "context" + "fmt" + "testing" + + "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/storage/engine" + "github.com/cockroachdb/cockroach/pkg/storage/engine/enginepb" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/uuid" +) + +// TestRefreshRangeTimeBoundIterator is a regression test for +// https://github.com/cockroachdb/cockroach/issues/31823. RefreshRange +// uses a time-bound iterator, which has a bug that can cause old +// resolved intents to incorrectly appear to be pending. This test +// constructs the necessary arrangement of sstables to reproduce the +// bug and ensures that the workaround (and later, the permanent fix) +// are effective. +// +// The bug is that resolving an intent does not contribute to the +// sstable's timestamp bounds, so that if there is no other +// timestamped data expanding the bounds, time-bound iterators may +// open fewer sstables than necessary and only see the intent, not its +// resolution. +// +// This test creates two sstables. The first contains a pending intent +// at ts1 and another key at ts4, giving it timestamp bounds 1-4 (and +// putting it in scope for transactions at timestamps higher than +// ts1). +func TestRefreshRangeTimeBoundIterator(t *testing.T) { + defer leaktest.AfterTest(t)() + + ctx := context.Background() + k := roachpb.Key("a") + v := roachpb.MakeValueFromString("hi") + ts1 := hlc.Timestamp{WallTime: 1} + ts2 := hlc.Timestamp{WallTime: 2} + ts3 := hlc.Timestamp{WallTime: 3} + ts4 := hlc.Timestamp{WallTime: 4} + + // Create an sstable containing an unresolved intent. To reduce the + // amount of knowledge of MVCC internals we must embed here, we + // write to a temporary engine and extract the RocksDB KV data. The + // sstable also contains an unrelated key at a higher timestamp to + // widen its bounds. + intentSSTContents := func() []byte { + db := engine.NewInMem(roachpb.Attributes{}, 10<<20) + defer db.Close() + + txn := &roachpb.Transaction{ + TxnMeta: enginepb.TxnMeta{ + Key: k, + ID: uuid.MakeV4(), + Epoch: 1, + Timestamp: ts1, + }, + } + if err := engine.MVCCPut(ctx, db, nil, k, txn.Timestamp, v, txn); err != nil { + t.Fatal(err) + } + + if err := engine.MVCCPut(ctx, db, nil, roachpb.Key("unused1"), ts4, v, nil); err != nil { + t.Fatal(err) + } + + sstWriter, err := engine.MakeRocksDBSstFileWriter() + if err != nil { + t.Fatal(err) + } + it := db.NewIterator(engine.IterOptions{ + UpperBound: keys.MaxKey, + }) + defer it.Close() + it.Seek(engine.MVCCKey{Key: keys.MinKey}) + for { + ok, err := it.Valid() + if err != nil { + t.Fatal(err) + } + if !ok { + break + } + if err := sstWriter.Add(engine.MVCCKeyValue{Key: it.Key(), Value: it.Value()}); err != nil { + t.Fatal(err) + } + it.Next() + } + + sstContents, err := sstWriter.Finish() + if err != nil { + t.Fatal(err) + } + + return sstContents + }() + + // Create a second sstable containing the resolution of the intent + // (committed). This is a rocksdb tombstone and there's no good way + // to construct that as we did above, so we do it by hand. The + // sstable also has a second write at a different (older) timestamp, + // because if it were empty other than the deletion tombstone, it + // would not have any timestamp bounds and would be selected for + // every read. + resolveSSTContents := func() []byte { + sstWriter, err := engine.MakeRocksDBSstFileWriter() + if err != nil { + t.Fatal(err) + } + if err := sstWriter.Delete(engine.MakeMVCCMetadataKey(k)); err != nil { + t.Fatal(err) + } + if err := sstWriter.Add(engine.MVCCKeyValue{ + Key: engine.MVCCKey{ + Key: roachpb.Key("unused2"), + Timestamp: ts1, + }, + Value: nil, + }); err != nil { + t.Fatal(err) + } + sstContents, err := sstWriter.Finish() + if err != nil { + t.Fatal(err) + } + return sstContents + }() + + // Create a new DB and ingest our two sstables. + db := engine.NewInMem(roachpb.Attributes{}, 10<<20) + defer db.Close() + + for i, contents := range [][]byte{intentSSTContents, resolveSSTContents} { + filename := fmt.Sprintf("intent-%d", i) + if err := db.WriteFile(filename, contents); err != nil { + t.Fatal(err) + } + if err := db.IngestExternalFiles(ctx, []string{filename}, true); err != nil { + t.Fatal(err) + } + } + + // We should now have a committed value at k@ts1. Read it back to make + // sure our fake intent resolution did the right thing. + if val, intents, err := engine.MVCCGet(ctx, db, k, ts1, true, nil); err != nil { + t.Fatal(err) + } else if len(intents) > 0 { + t.Fatalf("got unexpected intents: %v", intents) + } else if !val.EqualData(v) { + t.Fatalf("expected %s, got %s", v, val) + } + + // Now the real test: a transaction at ts2 has been pushed to ts3 + // and must refresh. It overlaps with our committed intent on k@ts1, + // which is fine because our timestamp is higher (but if that intent + // were still pending, the new txn would be blocked). Prior to + // https://github.com/cockroachdb/cockroach/pull/32211, a bug in the + // time-bound iterator meant that we would see the first sstable but + // not the second and incorrectly report the intent as pending, + // resulting in an error from RefreshRange. + var resp roachpb.RefreshRangeResponse + _, err := RefreshRange(ctx, db, CommandArgs{ + Args: &roachpb.RefreshRangeRequest{ + RequestHeader: roachpb.RequestHeader{ + Key: k, + EndKey: keys.MaxKey, + }, + }, + Header: roachpb.Header{ + Txn: &roachpb.Transaction{ + TxnMeta: enginepb.TxnMeta{ + Timestamp: ts3, + }, + OrigTimestamp: ts2, + }, + }, + }, &resp) + if err != nil { + t.Fatal(err) + } +} diff --git a/pkg/storage/batcheval/cmd_resolve_intent_range.go b/pkg/storage/batcheval/cmd_resolve_intent_range.go index 4696f47fefb9..9ff395861bc2 100644 --- a/pkg/storage/batcheval/cmd_resolve_intent_range.go +++ b/pkg/storage/batcheval/cmd_resolve_intent_range.go @@ -21,7 +21,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage/batcheval/result" "github.com/cockroachdb/cockroach/pkg/storage/engine" "github.com/cockroachdb/cockroach/pkg/storage/spanset" - "github.com/cockroachdb/cockroach/pkg/util/hlc" ) func init() { @@ -53,18 +52,7 @@ func ResolveIntentRange( Status: args.Status, } - // Use a time-bounded iterator as an optimization if indicated. - var iterAndBuf engine.IterAndBuf - if args.MinTimestamp != (hlc.Timestamp{}) { - iter := batch.NewIterator(engine.IterOptions{ - MinTimestampHint: args.MinTimestamp, - MaxTimestampHint: args.IntentTxn.Timestamp, - UpperBound: args.EndKey, - }) - iterAndBuf = engine.GetBufUsingIter(iter) - } else { - iterAndBuf = engine.GetIterAndBuf(batch, engine.IterOptions{UpperBound: args.EndKey}) - } + iterAndBuf := engine.GetIterAndBuf(batch, engine.IterOptions{UpperBound: args.EndKey}) defer iterAndBuf.Cleanup() numKeys, resumeSpan, err := engine.MVCCResolveWriteIntentRangeUsingIter( diff --git a/pkg/storage/engine/rocksdb.go b/pkg/storage/engine/rocksdb.go index ac7565c3de2f..84104352bce9 100644 --- a/pkg/storage/engine/rocksdb.go +++ b/pkg/storage/engine/rocksdb.go @@ -2744,6 +2744,16 @@ func (fw *RocksDBSstFileWriter) Add(kv MVCCKeyValue) error { return statusToError(C.DBSstFileWriterAdd(fw.fw, goToCKey(kv.Key), goToCSlice(kv.Value))) } +// Delete puts a deletion tombstone into the sstable being built. See +// the Add method for more. +func (fw *RocksDBSstFileWriter) Delete(k MVCCKey) error { + if fw.fw == nil { + return errors.New("cannot call Delete on a closed writer") + } + fw.DataSize += int64(len(k.Key)) + return statusToError(C.DBSstFileWriterDelete(fw.fw, goToCKey(k))) +} + // Finish finalizes the writer and returns the constructed file's contents. At // least one kv entry must have been added. func (fw *RocksDBSstFileWriter) Finish() ([]byte, error) {