Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lock bootstrap data with empty key to prevent conflicts #7215

Merged
merged 1 commit into from
Apr 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 10 additions & 24 deletions pkg/cluster/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -273,31 +273,17 @@ func (c *Cluster) ReconcileBootstrapData(ctx context.Context, buf io.ReadSeeker,
}
defer storageClient.Close()

ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()

RETRY:
for {
value, c.saveBootstrap, err = getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
if err != nil {
if strings.Contains(err.Error(), "not supported for learner") {
for range ticker.C {
continue RETRY
}

}
return err
}
if value == nil {
return nil
}

dbRawData, err = decrypt(normalizedToken, value.Data)
if err != nil {
return err
}
Comment on lines -276 to -298
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved this whole ugly for/continue loop into a wait.PollImmediateUntilWithContext in the getBootstrapKeyFromStorage function itself.

value, c.saveBootstrap, err = getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
if err != nil {
return err
}
if value == nil {
return nil
}

break
dbRawData, err = decrypt(normalizedToken, value.Data)
if err != nil {
return err
}

buf = bytes.NewReader(dbRawData)
Expand Down
121 changes: 95 additions & 26 deletions pkg/cluster/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,27 @@ import (
"errors"
"os"
"path/filepath"
"strings"
"time"

"github.com/k3s-io/k3s/pkg/bootstrap"
"github.com/k3s-io/k3s/pkg/clientaccess"
"github.com/k3s-io/k3s/pkg/daemons/config"
"github.com/k3s-io/kine/pkg/client"
"github.com/sirupsen/logrus"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
"k8s.io/apimachinery/pkg/util/wait"
)

// maxBootstrapWaitAttempts is the number of iterations to wait for another node to populate an empty bootstrap key.
// After this many attempts, the lock is deleted and the counter reset.
const maxBootstrapWaitAttempts = 5

// Save writes the current ControlRuntimeBootstrap data to the datastore. This contains a complete
// snapshot of the cluster's CA certs and keys, encryption passphrases, etc - encrypted with the join token.
// This is used when bootstrapping a cluster from a managed database or external etcd cluster.
// This is NOT used with embedded etcd, which bootstraps over HTTP.
func Save(ctx context.Context, config *config.Control, override bool) error {
logrus.Info("Saving cluster bootstrap data to datastore")
buf := &bytes.Buffer{}
if err := bootstrap.ReadFromDisk(buf, &config.Runtime.ControlRuntimeBootstrap); err != nil {
return err
Expand Down Expand Up @@ -48,23 +55,30 @@ func Save(ctx context.Context, config *config.Control, override bool) error {
}
defer storageClient.Close()

if _, _, err = getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token); err != nil {
currentKey, _, err := getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
if err != nil {
return err
}

// If there's an empty bootstrap key, then we've locked it and can override.
if currentKey != nil && len(currentKey.Data) == 0 {
logrus.Info("Bootstrap key lock is held")
override = true
}

if err := storageClient.Create(ctx, storageKey(normalizedToken), data); err != nil {
if err.Error() == "key exists" {
logrus.Warn("bootstrap key already exists")
if override {
bsd, err := bootstrapKeyData(ctx, storageClient)
if err != nil {
return err
}
return storageClient.Update(ctx, storageKey(normalizedToken), bsd.Modified, data)
}
logrus.Warn("Bootstrap key already exists")
return nil
} else if strings.Contains(err.Error(), "not supported for learner") {
logrus.Debug("skipping bootstrap data save on learner")
} else if errors.Is(err, rpctypes.ErrGPRCNotSupportedForLearner) {
logrus.Debug("Skipping bootstrap data save on learner")
return nil
}
return err
Expand All @@ -89,9 +103,12 @@ func bootstrapKeyData(ctx context.Context, storageClient client.Client) (*client
return &bootstrapList[0], nil
}

// storageBootstrap loads data from the datastore into the ControlRuntimeBootstrap struct.
// The storage key and encryption passphrase are both derived from the join token.
// token is either passed.
// storageBootstrap loads data from the datastore's bootstrap key into the
// ControlRuntimeBootstrap struct. The storage key and encryption passphrase are both derived
// from the join token. If no bootstrap key exists, indicating that data needs to be written
// back to the datastore, this function will set c.saveBootstrap to true and create an empty
// bootstrap key as a lock. This function will not return successfully until either the
// bootstrap key has been locked, or data is read into the struct.
func (c *Cluster) storageBootstrap(ctx context.Context) error {
if err := c.startStorage(ctx); err != nil {
return err
Expand All @@ -110,7 +127,12 @@ func (c *Cluster) storageBootstrap(ctx context.Context) error {
return err
}
if tokenFromFile == "" {
// at this point this is a fresh start in a non managed environment
// No token on disk or from CLI, but we don't know if there's data in the datastore.
// Return here and generate new CA certs and tokens. Note that startup will fail
// later when saving to the datastore if there's already a bootstrap key - but
// that's AFTER generating CA certs and tokens. If the config is updated to set the
// matching key, further startups will still be blocked pending cleanup of the
// "newer" files as per the bootstrap reconciliation code.
c.saveBootstrap = true
return nil
}
Expand All @@ -121,34 +143,81 @@ func (c *Cluster) storageBootstrap(ctx context.Context) error {
return err
}

value, saveBootstrap, err := getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
c.saveBootstrap = saveBootstrap
if err != nil {
return err
}
if value == nil {
return nil
}
attempts := 0
tokenKey := storageKey(normalizedToken)
return wait.PollImmediateUntilWithContext(ctx, time.Second, func(ctx context.Context) (bool, error) {
attempts++
value, saveBootstrap, err := getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
c.saveBootstrap = saveBootstrap
if err != nil {
return false, err
}

data, err := decrypt(normalizedToken, value.Data)
if err != nil {
return err
}
if value == nil {
// No bootstrap keys found in the datastore - create an empty bootstrap key as a lock to
// ensure that no other node races us to populate it. If we fail to create the key, then
// some other node beat us to it and we should just wait for them to finish.
if err := storageClient.Create(ctx, tokenKey, []byte{}); err != nil {
if err.Error() == "key exists" {
logrus.Info("Bootstrap key already locked - waiting for data to be populated by another server")
return false, nil
}
return false, err
}
logrus.Info("Bootstrap key locked for initial create")
return true, nil
}

if len(value.Data) == 0 {
// Empty (locked) bootstrap key found - check to see if we should continue waiting, or
// delete it and attempt to retake the lock on the next iteration (assuming that the
// other node failed while holding the lock).
if attempts >= maxBootstrapWaitAttempts {
logrus.Info("Bootstrap key lock timed out - deleting lock and retrying")
attempts = 0
if err := storageClient.Delete(ctx, tokenKey, value.Modified); err != nil {
return false, err
}
} else {
logrus.Infof("Bootstrap key is locked - waiting for data to be populated by another server")
}
return false, nil
}

data, err := decrypt(normalizedToken, value.Data)
if err != nil {
return false, err
}

return c.ReconcileBootstrapData(ctx, bytes.NewReader(data), &c.config.Runtime.ControlRuntimeBootstrap, false)
return true, c.ReconcileBootstrapData(ctx, bytes.NewReader(data), &c.config.Runtime.ControlRuntimeBootstrap, false)
})
}

// getBootstrapKeyFromStorage will list all keys that has prefix /bootstrap and will check for key that is
// hashed with empty string and will check for any key that is hashed by different token than the one
// passed to it, it will return error if it finds a key that is hashed with different token and will return
// value if it finds the key hashed by passed token or empty string
// value if it finds the key hashed by passed token or empty string.
// Upon receiving a "not supported for learner" error from etcd, this function will retry until the context is cancelled.
func getBootstrapKeyFromStorage(ctx context.Context, storageClient client.Client, normalizedToken, oldToken string) (*client.Value, bool, error) {
emptyStringKey := storageKey("")
tokenKey := storageKey(normalizedToken)
bootstrapList, err := storageClient.List(ctx, "/bootstrap", 0)
if err != nil {

var bootstrapList []client.Value
var err error

if err := wait.PollImmediateUntilWithContext(ctx, 5*time.Second, func(ctx context.Context) (bool, error) {
bootstrapList, err = storageClient.List(ctx, "/bootstrap", 0)
if err != nil {
if errors.Is(err, rpctypes.ErrGPRCNotSupportedForLearner) {
return false, nil
}
return false, err
}
return true, nil
}); err != nil {
return nil, false, err
}

if len(bootstrapList) == 0 {
return nil, true, nil
}
Expand Down Expand Up @@ -248,7 +317,7 @@ func doMigrateToken(ctx context.Context, storageClient client.Client, keyValue c
if err := storageClient.Create(ctx, newTokenKey, encryptedData); err != nil {
if err.Error() == "key exists" {
logrus.Warn("bootstrap key exists")
} else if strings.Contains(err.Error(), "not supported for learner") {
} else if errors.Is(err, rpctypes.ErrGPRCNotSupportedForLearner) {
logrus.Debug("skipping bootstrap data save on learner")
return nil
} else {
Expand Down
2 changes: 1 addition & 1 deletion pkg/etcd/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -951,7 +951,7 @@ func (e *ETCD) RemovePeer(ctx context.Context, name, address string, allowSelfRe
}
logrus.Infof("Removing name=%s id=%d address=%s from etcd", member.Name, member.ID, address)
_, err := e.client.MemberRemove(ctx, member.ID)
if err == rpctypes.ErrGRPCMemberNotFound {
if errors.Is(err, rpctypes.ErrGRPCMemberNotFound) {
return nil
}
return err
Expand Down