Skip to content

Commit

Permalink
storage: make RaftTombstoneKey unreplicated
Browse files Browse the repository at this point in the history
RaftTombstoneKey was accidentally made a replicated key when it was first
introduced, a problem we first realized existed when it was [included in
snapshots].

At the time, we included workarounds to skip this key in various places
(snapshot application, consistency checker) but of course we have failed to
insert further hacks of the same kind elsewhere since (the one that prompting
this PR being the stats recomputation on splits, which I'm looking into as part
of cockroachdb#20181 -- unfortunately this commit doesn't seem to pertain to that problem)

It feels sloppy that we didn't follow through back then, but luckily the damage
appears to be limited; it is likely that the replicated existence of this key
results in MVCCStats SysBytes inconsistencies, but as it happens, these stats
are [already] [very] [inconsistent].

This commit does a few things:

- renames the old tombstone key to `RaftIncorrectLegacyTombstoneKey`
- introduces a (correctly unreplicated) `RaftTombstoneKey`
- introduces a migration that rewrites all legacy keys early in the node
  start sequence; as a result, a node running this commit or later will
  never see any legacy keys, except when they come in through a snapshot
- when applying a snapshot, forcibly delete any legacy tombstone contained
  within.

`RaftIncorrectLegacyTombstoneKey` can be purged from the codebase in binaries
post v2.0, as at that point all peers have version at least v2.0 (which has this
commit and will never send legacy tombstones in snapshots).

Since sending legacy keys in snapshots was never intended (and isn't relied
upon), none of this needs a cluster version migration.

Fixes cockroachdb#12154.

Release note: None

[included in snapshots]: cockroachdb#12131
[already]: cockroachdb#20554
[very]: cockroachdb#20996
[inconsistent]: cockroachdb#21070
  • Loading branch information
tbg committed Dec 29, 2017
1 parent eb5b878 commit 3277ae5
Show file tree
Hide file tree
Showing 6 changed files with 302 additions and 15 deletions.
24 changes: 19 additions & 5 deletions pkg/keys/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,13 @@ func DecodeAbortSpanKey(key roachpb.Key, dest []byte) (uuid.UUID, error) {
return txnID, err
}

// RaftTombstoneKey returns a system-local key for a raft tombstone.
func RaftTombstoneKey(rangeID roachpb.RangeID) roachpb.Key {
return MakeRangeIDPrefixBuf(rangeID).RaftTombstoneKey()
// RaftTombstoneIncorrectLegacyKey returns a system-local key for a raft tombstone.
// This key was accidentally made replicated though it is not, requiring awkward
// workarounds. This method and all users can be removed in any binary version > 2.0.
//
// See https://github.com/cockroachdb/cockroach/issues/12154.
func RaftTombstoneIncorrectLegacyKey(rangeID roachpb.RangeID) roachpb.Key {
return MakeRangeIDPrefixBuf(rangeID).RaftTombstoneIncorrectLegacyKey()
}

// RaftAppliedIndexKey returns a system-local key for a raft applied index.
Expand Down Expand Up @@ -286,6 +290,11 @@ func makeRangeIDUnreplicatedKey(
return key
}

// RaftTombstoneKey returns a system-local key for a raft tombstone.
func RaftTombstoneKey(rangeID roachpb.RangeID) roachpb.Key {
return MakeRangeIDPrefixBuf(rangeID).RaftTombstoneKey()
}

// RaftHardStateKey returns a system-local key for a Raft HardState.
func RaftHardStateKey(rangeID roachpb.RangeID) roachpb.Key {
return MakeRangeIDPrefixBuf(rangeID).RaftHardStateKey()
Expand Down Expand Up @@ -802,8 +811,8 @@ func (b RangeIDPrefixBuf) AbortSpanKey(txnID uuid.UUID) roachpb.Key {
return encoding.EncodeBytesAscending(key, txnID.GetBytes())
}

// RaftTombstoneKey returns a system-local key for a raft tombstone.
func (b RangeIDPrefixBuf) RaftTombstoneKey() roachpb.Key {
// RaftTombstoneIncorrectLegacyKey returns a system-local key for a raft tombstone.
func (b RangeIDPrefixBuf) RaftTombstoneIncorrectLegacyKey() roachpb.Key {
return append(b.replicatedPrefix(), LocalRaftTombstoneSuffix...)
}

Expand Down Expand Up @@ -849,6 +858,11 @@ func (b RangeIDPrefixBuf) RangeTxnSpanGCThresholdKey() roachpb.Key {
return append(b.replicatedPrefix(), LocalTxnSpanGCThresholdSuffix...)
}

// RaftTombstoneKey returns a system-local key for a raft tombstone.
func (b RangeIDPrefixBuf) RaftTombstoneKey() roachpb.Key {
return append(b.unreplicatedPrefix(), LocalRaftTombstoneSuffix...)
}

// RaftHardStateKey returns a system-local key for a Raft HardState.
func (b RangeIDPrefixBuf) RaftHardStateKey() roachpb.Key {
return append(b.unreplicatedPrefix(), LocalRaftHardStateSuffix...)
Expand Down
3 changes: 2 additions & 1 deletion pkg/keys/printer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func TestPrettyPrint(t *testing.T) {
{StoreSuggestedCompactionKey(roachpb.Key("a"), MaxKey), `/Local/Store/suggestedCompaction/{"a"-/Max}`},

{AbortSpanKey(roachpb.RangeID(1000001), txnID), fmt.Sprintf(`/Local/RangeID/1000001/r/AbortSpan/%q`, txnID)},
{RaftTombstoneKey(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/r/RaftTombstone"},
{RaftTombstoneIncorrectLegacyKey(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/r/RaftTombstone"},
{RaftAppliedIndexKey(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/r/RaftAppliedIndex"},
{LeaseAppliedIndexKey(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/r/LeaseAppliedIndex"},
{RaftTruncatedStateKey(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/r/RaftTruncatedState"},
Expand All @@ -64,6 +64,7 @@ func TestPrettyPrint(t *testing.T) {

{RaftHardStateKey(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/u/RaftHardState"},
{RaftLastIndexKey(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/u/RaftLastIndex"},
{RaftTombstoneKey(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/u/RaftTombstone"},
{RaftLogKey(roachpb.RangeID(1000001), uint64(200001)), "/Local/RangeID/1000001/u/RaftLog/logIndex:200001"},
{RangeLastReplicaGCTimestampKey(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/u/RangeLastReplicaGCTimestamp"},
{RangeLastVerificationTimestampKeyDeprecated(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/u/RangeLastVerificationTimestamp"},
Expand Down
10 changes: 1 addition & 9 deletions pkg/storage/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -902,7 +902,6 @@ func (r *Replica) sha512(
desc roachpb.RangeDescriptor, snap engine.Reader, snapshot *roachpb.RaftSnapshotData,
) ([]byte, error) {
hasher := sha512.New()
tombstoneKey := engine.MakeMVCCMetadataKey(keys.RaftTombstoneKey(desc.RangeID))

// Iterate over all the data in the range.
iter := NewReplicaDataIterator(&desc, snap, true /* replicatedOnly */)
Expand All @@ -915,15 +914,8 @@ func (r *Replica) sha512(
} else if !ok {
break
}
// TODO(tschottdorf): looks like we'd want to use Unsafe{Key,Value} here when `snapshot == nil` (the common case).
key := iter.Key()
if key.Equal(tombstoneKey) {
// Skip the tombstone key which is marked as replicated even though it
// isn't.
//
// TODO(peter): Figure out a way to migrate this key to the unreplicated
// key space.
continue
}
value := iter.Value()

if snapshot != nil {
Expand Down
10 changes: 10 additions & 0 deletions pkg/storage/replica_raftstorage.go
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,16 @@ func (r *Replica) applySnapshot(
}
}

// Nodes before v2.0 may send an incorrect Raft tombstone (see #12154) that was supposed
// to be unreplicated. Simply remove it.
//
// NB: this can be removed in v2.1. This is because when we are running a binary at
// v2.1, we know that peers are at least running v2.0, which will never send out these
// snapshots.
if err := clearLegacyTombstone(batch, r.RangeID); err != nil {
return errors.Wrap(err, "while clearing legacy tombstone key")
}

// The log entries are all written to distinct keys so we can use a
// distinct batch.
distinctBatch := batch.Distinct()
Expand Down
99 changes: 99 additions & 0 deletions pkg/storage/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -1007,6 +1007,72 @@ func (s *Store) IsStarted() bool {
return atomic.LoadInt32(&s.started) == 1
}

// IterateIDPrefixKeys helps visit system keys that use RangeID prefixing ( such as
// RaftHardStateKey, RaftTombstoneKey, and many others). Such keys could in principle exist at any
// RangeID, and this helper efficiently discovers all the keys of the desired type (as specified by
// the supplied `keyFn`) and, for each key-value pair discovered, unmarshals it into `msg` and then
// invokes `f`.
//
// Iteration stops on the first error (and will pass through that error).
func IterateIDPrefixKeys(
ctx context.Context,
eng engine.Reader,
keyFn func(roachpb.RangeID) roachpb.Key,
msg protoutil.Message,
f func(_ roachpb.RangeID) (more bool, _ error),
) error {
rangeID := roachpb.RangeID(1)
iter := eng.NewIterator(false /* prefix */)

for {
mvccKey := engine.MakeMVCCMetadataKey(keyFn(rangeID))
iter.Seek(mvccKey)

if ok, err := iter.Valid(); !ok {
return err
}

unsafeKey := iter.UnsafeKey()

if !bytes.HasPrefix(unsafeKey.Key, keys.LocalRangeIDPrefix) {
// Left the local keyspace, so we're done.
return nil
}

curRangeID, _, _, _, err := keys.DecodeRangeIDKey(mvccKey.Key)
if err != nil {
return err
}

if curRangeID > rangeID {
rangeID = curRangeID
mvccKey = engine.MakeMVCCMetadataKey(keyFn(rangeID))
}

if !unsafeKey.Key.Equal(mvccKey.Key) {
rangeID++
continue
}

ok, err := engine.MVCCGetProto(
ctx, eng, unsafeKey.Key, hlc.Timestamp{}, true /* consistent */, nil /* txn */, msg,
)
if err != nil {
return err
}
if !ok {
return errors.Errorf("unable to unmarshal %s into %T", unsafeKey.Key, msg)
}

log.Infof(ctx, "calling for %d %s %+v", rangeID, unsafeKey.Key, msg)
more, err := f(rangeID)
if !more || err != nil {
return err
}
rangeID++
}
}

// IterateRangeDescriptors calls the provided function with each descriptor
// from the provided Engine. The return values of this method and fn have
// semantics similar to engine.MVCCIterate.
Expand Down Expand Up @@ -1047,6 +1113,35 @@ func IterateRangeDescriptors(
return err
}

// clearLegacyTombstone removes the legacy tombstone for the given rangeID, taking
// care to do this without reading from the engine (as is required by one of the
// callers).
func clearLegacyTombstone(eng engine.Writer, rangeID roachpb.RangeID) error {
return eng.Clear(engine.MakeMVCCMetadataKey(keys.RaftTombstoneIncorrectLegacyKey(rangeID)))
}

// migrateLegacyTombstones rewrites all legacy tombstones into the correct key.
// It can be removed in binaries post v2.0.
func migrateLegacyTombstones(ctx context.Context, eng engine.Engine) error {
var tombstone roachpb.RaftTombstone
handleTombstone := func(rangeID roachpb.RangeID) (more bool, _ error) {
batch := eng.NewBatch()
defer batch.Close()

tombstoneKey := keys.RaftTombstoneKey(rangeID)
if err := engine.MVCCPutProto(ctx, batch, nil, tombstoneKey,
hlc.Timestamp{}, nil, &tombstone); err != nil {
return false, err
}
if err := clearLegacyTombstone(batch, rangeID); err != nil {
return false, err
}
err := batch.Commit(true /* sync */)
return err == nil, err
}
return IterateIDPrefixKeys(ctx, eng, keys.RaftTombstoneIncorrectLegacyKey, &tombstone, handleTombstone)
}

// ReadStoreIdent reads the StoreIdent from the store.
// It returns *NotBootstrappedError if the ident is missing (meaning that the
// store needs to be bootstrapped).
Expand Down Expand Up @@ -1103,6 +1198,10 @@ func (s *Store) Start(ctx context.Context, stopper *stop.Stopper) error {
now := s.cfg.Clock.Now()
s.startedAt = now.WallTime

if err := migrateLegacyTombstones(ctx, s.engine); err != nil {
return errors.Wrap(err, "migrating legacy tombstones")
}

// Iterate over all range descriptors, ignoring uncommitted versions
// (consistent=false). Uncommitted intents which have been abandoned
// due to a split crashing halfway will simply be resolved on the
Expand Down
Loading

0 comments on commit 3277ae5

Please sign in to comment.