Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kvserver/loqrecovery: record and post replica recovery events #73785

Merged
merged 1 commit into from
Jan 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions docs/generated/eventlog.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,30 @@ e.g. directly access and mutate internal state, breaking system invariants.
Events in this category are logged to the `OPS` channel.


### `debug_recover_replica`

An event of type `debug_recover_replica` is recorded when unsafe loss of quorum recovery is performed.


| Field | Description | Sensitive |
|--|--|--|
| `RangeID` | | no |
| `StoreID` | | no |
| `SurvivorReplicaID` | | no |
| `UpdatedReplicaID` | | no |
| `StartKey` | | yes |
| `EndKey` | | yes |


#### Common fields

| Field | Description | Sensitive |
|--|--|--|
| `Timestamp` | The timestamp of the event. Expressed as nanoseconds since the Unix epoch. | no |
| `EventType` | The type of the event. | no |
| `NodeID` | The node ID where the event originated. | no |
| `User` | The user which performed the operation. | yes |

### `debug_send_kv_batch`

An event of type `debug_send_kv_batch` is recorded when an arbitrary KV BatchRequest is submitted
Expand Down
13 changes: 8 additions & 5 deletions pkg/cli/debug_recover_loss_of_quorum.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/spf13/cobra"
)
Expand Down Expand Up @@ -411,16 +413,16 @@ func runDebugExecuteRecoverPlan(cmd *cobra.Command, args []string) error {
var localNodeID roachpb.NodeID
batches := make(map[roachpb.StoreID]storage.Batch)
for _, storeSpec := range debugRecoverExecuteOpts.Stores.Specs {
db, err := OpenExistingStore(storeSpec.Path, stopper, false /* readOnly */)
store, err := OpenExistingStore(storeSpec.Path, stopper, false /* readOnly */)
if err != nil {
return errors.Wrapf(err, "failed to open store at path %q. ensure that store path is "+
"correct and that it is not used by another process", storeSpec.Path)
}
batch := db.NewBatch()
defer db.Close()
batch := store.NewBatch()
defer store.Close()
defer batch.Close()

storeIdent, err := kvserver.ReadStoreIdent(cmd.Context(), db)
storeIdent, err := kvserver.ReadStoreIdent(cmd.Context(), store)
if err != nil {
return err
}
Expand All @@ -434,8 +436,9 @@ func runDebugExecuteRecoverPlan(cmd *cobra.Command, args []string) error {
batches[storeIdent.StoreID] = batch
}

updateTime := timeutil.Now()
prepReport, err := loqrecovery.PrepareUpdateReplicas(
cmd.Context(), nodeUpdates, localNodeID, batches)
cmd.Context(), nodeUpdates, uuid.DefaultGenerator, updateTime, localNodeID, batches)
if err != nil {
return err
}
Expand Down
31 changes: 25 additions & 6 deletions pkg/keys/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ const (
tenantPrefixByte = '\xfe'
)

// Constants to subdivide unsafe loss of quorum recovery data into groups.
// Currently we only store keys as they are applied, but might benefit from
// archiving them to make them more "durable".
const (
// appliedUnsafeReplicaRecoveryPrefix names the group of recovery entries
// that are stored as they are applied. It is combined with the "loqr"
// store-local suffix to build localStoreUnsafeReplicaRecoverySuffix.
appliedUnsafeReplicaRecoveryPrefix = "applied"
)

// Constants for system-reserved keys in the KV map.
//
// Note: Preserve group-wise ordering when adding new constants.
Expand Down Expand Up @@ -160,9 +167,27 @@ var (
// localStoreIdentSuffix stores an immutable identifier for this
// store, created when the store is first bootstrapped.
localStoreIdentSuffix = []byte("iden")
// localStoreUnsafeReplicaRecoverySuffix is a suffix for temporary record
// entries put when loss of quorum recovery operations are performed offline
// on the store.
// See StoreUnsafeReplicaRecoveryKey for details.
localStoreUnsafeReplicaRecoverySuffix = makeKey([]byte("loqr"),
[]byte(appliedUnsafeReplicaRecoveryPrefix))
// LocalStoreUnsafeReplicaRecoveryKeyMin is the start of keyspace used to store
// loss of quorum recovery record entries.
LocalStoreUnsafeReplicaRecoveryKeyMin = MakeStoreKey(localStoreUnsafeReplicaRecoverySuffix, nil)
// LocalStoreUnsafeReplicaRecoveryKeyMax is the end of keyspace used to store
// loss of quorum recovery record entries.
LocalStoreUnsafeReplicaRecoveryKeyMax = LocalStoreUnsafeReplicaRecoveryKeyMin.PrefixEnd()
// localStoreNodeTombstoneSuffix stores key value pairs that map
// nodeIDs to time of removal from cluster.
localStoreNodeTombstoneSuffix = []byte("ntmb")
// localStoreCachedSettingsSuffix stores the cached settings for node.
localStoreCachedSettingsSuffix = []byte("stng")
// LocalStoreCachedSettingsKeyMin is the start of span of possible cached settings keys.
LocalStoreCachedSettingsKeyMin = MakeStoreKey(localStoreCachedSettingsSuffix, nil)
// LocalStoreCachedSettingsKeyMax is the end of span of possible cached settings keys.
LocalStoreCachedSettingsKeyMax = LocalStoreCachedSettingsKeyMin.PrefixEnd()
// localStoreLastUpSuffix stores the last timestamp that a store's node
// acknowledged that it was still running. This value will be regularly
// refreshed on all stores for a running node; the intention of this value
Expand All @@ -172,12 +197,6 @@ var (
// localRemovedLeakedRaftEntriesSuffix is DEPRECATED and remains to prevent
// reuse.
localRemovedLeakedRaftEntriesSuffix = []byte("dlre")
// localStoreCachedSettingsSuffix stores the cached settings for node.
localStoreCachedSettingsSuffix = []byte("stng")
// LocalStoreCachedSettingsKeyMin is the start of span of possible cached settings keys.
LocalStoreCachedSettingsKeyMin = MakeStoreKey(localStoreCachedSettingsSuffix, nil)
// LocalStoreCachedSettingsKeyMax is the end of span of possible cached settings keys.
LocalStoreCachedSettingsKeyMax = LocalStoreCachedSettingsKeyMin.PrefixEnd()

// 5. Lock table keys
//
Expand Down
15 changes: 8 additions & 7 deletions pkg/keys/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,13 +214,14 @@ var _ = [...]interface{}{
// 4. Store local keys: These contain metadata about an individual store.
// They are unreplicated and unaddressable. The typical example is the
// store 'ident' record. They all share `localStorePrefix`.
StoreClusterVersionKey, // "cver"
StoreGossipKey, // "goss"
StoreHLCUpperBoundKey, // "hlcu"
StoreIdentKey, // "iden"
StoreNodeTombstoneKey, // "ntmb"
StoreLastUpKey, // "uptm"
StoreCachedSettingsKey, // "stng"
StoreClusterVersionKey, // "cver"
StoreGossipKey, // "goss"
StoreHLCUpperBoundKey, // "hlcu"
StoreIdentKey, // "iden"
StoreUnsafeReplicaRecoveryKey, // "loqr"
StoreNodeTombstoneKey, // "ntmb"
StoreCachedSettingsKey, // "stng"
StoreLastUpKey, // "uptm"

// 5. Range lock keys for all replicated locks. All range locks share
// LocalRangeLockTablePrefix. Locks can be acquired on global keys and on
Expand Down
27 changes: 27 additions & 0 deletions pkg/keys/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,33 @@ func DecodeStoreCachedSettingsKey(key roachpb.Key) (settingKey roachpb.Key, err
return
}

// StoreUnsafeReplicaRecoveryKey builds the store-local key under which a
// loss of quorum replica recovery entry with the given UUID is kept. The key
// is LocalStoreUnsafeReplicaRecoveryKeyMin followed by the raw UUID bytes.
//
// Such keys are written to the store by the `debug recover apply-plan`
// command while the node is stopped. Once the node boots up, the entries are
// translated into structured log events to leave an audit trail of the
// recovery operation.
func StoreUnsafeReplicaRecoveryKey(uuid uuid.UUID) roachpb.Key {
	prefix := LocalStoreUnsafeReplicaRecoveryKeyMin
	// Allocate once: the prefix plus the fixed-size UUID.
	buf := make(roachpb.Key, 0, len(prefix)+len(uuid))
	buf = append(buf, prefix...)
	return append(buf, uuid.GetBytes()...)
}

// DecodeStoreUnsafeReplicaRecoveryKey extracts the record UUID from a key
// built by StoreUnsafeReplicaRecoveryKey.
//
// It returns an error if key does not start with
// LocalStoreUnsafeReplicaRecoveryKeyMin, or if the bytes following that prefix
// are rejected by uuid.FromBytes. On error the returned UUID is the zero
// value.
func DecodeStoreUnsafeReplicaRecoveryKey(key roachpb.Key) (uuid.UUID, error) {
	if !bytes.HasPrefix(key, LocalStoreUnsafeReplicaRecoveryKeyMin) {
		// Report the prefix that was actually checked, so the message matches
		// the condition that failed.
		return uuid.UUID{},
			errors.Errorf("key %q does not have %q prefix",
				string(key), LocalStoreUnsafeReplicaRecoveryKeyMin)
	}
	// Everything after the fixed prefix is expected to be the raw UUID bytes.
	remainder := key[len(LocalStoreUnsafeReplicaRecoveryKeyMin):]
	entryID, err := uuid.FromBytes(remainder)
	if err != nil {
		return uuid.UUID{}, errors.Wrap(err, "failed to get uuid from unsafe replica recovery key")
	}
	return entryID, nil
}

// NodeLivenessKey returns the key for the node liveness record.
func NodeLivenessKey(nodeID roachpb.NodeID) roachpb.Key {
key := make(roachpb.Key, 0, len(NodeLivenessPrefix)+9)
Expand Down
22 changes: 21 additions & 1 deletion pkg/keys/printer.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ var constSubKeyDict = []struct {
{"/clusterVersion", localStoreClusterVersionSuffix},
{"/nodeTombstone", localStoreNodeTombstoneSuffix},
{"/cachedSettings", localStoreCachedSettingsSuffix},
{"/lossOfQuorumRecovery/applied", localStoreUnsafeReplicaRecoverySuffix},
}

func nodeTombstoneKeyPrint(key roachpb.Key) string {
Expand Down Expand Up @@ -223,6 +224,10 @@ func localStoreKeyPrint(_ []encoding.Direction, key roachpb.Key) string {
return v.name + "/" + cachedSettingsKeyPrint(
append(roachpb.Key(nil), append(LocalStorePrefix, key...)...),
)
} else if v.key.Equal(localStoreUnsafeReplicaRecoverySuffix) {
return v.name + "/" + lossOfQuorumRecoveryEntryKeyPrint(
append(roachpb.Key(nil), append(LocalStorePrefix, key...)...),
)
}
return v.name
}
Expand All @@ -231,6 +236,14 @@ func localStoreKeyPrint(_ []encoding.Direction, key roachpb.Key) string {
return fmt.Sprintf("%q", []byte(key))
}

// lossOfQuorumRecoveryEntryKeyPrint renders the UUID portion of a loss of
// quorum recovery entry key. If the key cannot be decoded, it returns an
// "<invalid: ...>" marker containing the decode error instead.
func lossOfQuorumRecoveryEntryKeyPrint(key roachpb.Key) string {
	recordID, decodeErr := DecodeStoreUnsafeReplicaRecoveryKey(key)
	if decodeErr == nil {
		return recordID.String()
	}
	return fmt.Sprintf("<invalid: %s>", decodeErr)
}

func localStoreKeyParse(input string) (remainder string, output roachpb.Key) {
for _, s := range constSubKeyDict {
if strings.HasPrefix(input, s.name) {
Expand All @@ -239,9 +252,16 @@ func localStoreKeyParse(input string) (remainder string, output roachpb.Key) {
s.key.Equal(localStoreNodeTombstoneSuffix),
s.key.Equal(localStoreCachedSettingsSuffix):
panic(&ErrUglifyUnsupported{errors.Errorf("cannot parse local store key with suffix %s", s.key)})
case s.key.Equal(localStoreUnsafeReplicaRecoverySuffix):
recordIDString := input[len(localStoreUnsafeReplicaRecoverySuffix):]
recordUUID, err := uuid.FromString(recordIDString)
if err != nil {
panic(&ErrUglifyUnsupported{errors.Errorf("cannot parse local store key with suffix %s", s.key)})
}
output = StoreUnsafeReplicaRecoveryKey(recordUUID)
default:
output = MakeStoreKey(s.key, nil)
}
output = MakeStoreKey(s.key, nil)
return
}
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/keys/printer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ func TestPrettyPrint(t *testing.T) {
durationDesc, _ := encoding.EncodeDurationDescending(nil, duration)
bitArray := bitarray.MakeBitArrayFromInt64(8, 58, 7)
txnID := uuid.MakeV4()
loqRecoveryID := uuid.MakeV4()

// Support for asserting that the ugly printer supports a key was added after
// most of the tests here were written.
Expand All @@ -66,6 +67,7 @@ func TestPrettyPrint(t *testing.T) {
{keys.StoreClusterVersionKey(), "/Local/Store/clusterVersion", revertSupportUnknown},
{keys.StoreNodeTombstoneKey(123), "/Local/Store/nodeTombstone/n123", revertSupportUnknown},
{keys.StoreCachedSettingsKey(roachpb.Key("a")), `/Local/Store/cachedSettings/"a"`, revertSupportUnknown},
{keys.StoreUnsafeReplicaRecoveryKey(loqRecoveryID), fmt.Sprintf(`/Local/Store/lossOfQuorumRecovery/applied/%s`, loqRecoveryID), revertSupportUnknown},

{keys.AbortSpanKey(roachpb.RangeID(1000001), txnID), fmt.Sprintf(`/Local/RangeID/1000001/r/AbortSpan/%q`, txnID), revertSupportUnknown},
{keys.RangeAppliedStateKey(roachpb.RangeID(1000001)), "/Local/RangeID/1000001/r/RangeAppliedState", revertSupportUnknown},
Expand Down
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/loqrecovery/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ go_library(
"apply.go",
"collect.go",
"plan.go",
"record.go",
"utils.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery",
Expand All @@ -19,6 +20,7 @@ go_library(
"//pkg/storage",
"//pkg/util/hlc",
"//pkg/util/log",
"//pkg/util/protoutil",
"//pkg/util/uuid",
"@com_github_cockroachdb_errors//:errors",
],
Expand All @@ -44,6 +46,7 @@ go_test(
"//pkg/util/hlc",
"//pkg/util/keysutil",
"//pkg/util/leaktest",
"//pkg/util/timeutil",
"//pkg/util/uuid",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_cockroachdb_errors//:errors",
Expand Down
Loading