Skip to content

Commit

Permalink
Lock bootstrap data with empty key to prevent conflicts
Browse files Browse the repository at this point in the history
Signed-off-by: Brad Davidson <[email protected]>
  • Loading branch information
brandond committed Apr 5, 2023
1 parent 2992477 commit ae574ca
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 47 deletions.
34 changes: 10 additions & 24 deletions pkg/cluster/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -273,31 +273,17 @@ func (c *Cluster) ReconcileBootstrapData(ctx context.Context, buf io.ReadSeeker,
}
defer storageClient.Close()

ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()

RETRY:
for {
value, c.saveBootstrap, err = getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
if err != nil {
if strings.Contains(err.Error(), "not supported for learner") {
for range ticker.C {
continue RETRY
}

}
return err
}
if value == nil {
return nil
}

dbRawData, err = decrypt(normalizedToken, value.Data)
if err != nil {
return err
}
value, c.saveBootstrap, err = getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
if err != nil {
return err
}
if value == nil {
return nil
}

break
dbRawData, err = decrypt(normalizedToken, value.Data)
if err != nil {
return err
}

buf = bytes.NewReader(dbRawData)
Expand Down
92 changes: 70 additions & 22 deletions pkg/cluster/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ import (
"errors"
"os"
"path/filepath"
"strings"
"time"

"github.com/k3s-io/k3s/pkg/bootstrap"
"github.com/k3s-io/k3s/pkg/clientaccess"
"github.com/k3s-io/k3s/pkg/daemons/config"
"github.com/k3s-io/kine/pkg/client"
"github.com/sirupsen/logrus"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
"k8s.io/apimachinery/pkg/util/wait"
)

// Save writes the current ControlRuntimeBootstrap data to the datastore. This contains a complete
Expand Down Expand Up @@ -48,22 +50,29 @@ func Save(ctx context.Context, config *config.Control, override bool) error {
}
defer storageClient.Close()

if _, _, err = getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token); err != nil {
currentKey, _, err := getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
if err != nil {
return err
}

// If there's an empty bootstrap key, then we've locked it and can override.
if currentKey != nil && len(currentKey.Data) == 0 {
override = true
}

if err := storageClient.Create(ctx, storageKey(normalizedToken), data); err != nil {
if err.Error() == "key exists" {
logrus.Warn("bootstrap key already exists")
if override {
bsd, err := bootstrapKeyData(ctx, storageClient)
if err != nil {
return err
}
return storageClient.Update(ctx, storageKey(normalizedToken), bsd.Modified, data)
} else {
logrus.Warn("bootstrap key already exists")
}
return nil
} else if strings.Contains(err.Error(), "not supported for learner") {
} else if errors.Is(err, rpctypes.ErrGPRCNotSupportedForLearner) {
logrus.Debug("skipping bootstrap data save on learner")
return nil
}
Expand Down Expand Up @@ -110,7 +119,12 @@ func (c *Cluster) storageBootstrap(ctx context.Context) error {
return err
}
if tokenFromFile == "" {
// at this point this is a fresh start in a non managed environment
// No token on disk or from CLI, but we don't know if there's data in the datastore.
// Return here and generate new CA certs and tokens. Note that startup will fail
// later when saving to the datastore if there's already a bootstrap key - but
// that's AFTER generating CA certs and tokens. If the config is updated to set the
// matching key, further startups will still be blocked pending cleanup of the
// "newer" files as per the bootstrap reconciliation code.
c.saveBootstrap = true
return nil
}
Expand All @@ -121,34 +135,68 @@ func (c *Cluster) storageBootstrap(ctx context.Context) error {
return err
}

value, saveBootstrap, err := getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
c.saveBootstrap = saveBootstrap
if err != nil {
return err
}
if value == nil {
return nil
}
tokenKey := storageKey(normalizedToken)
return wait.PollImmediateUntilWithContext(ctx, time.Second, func(ctx context.Context) (bool, error) {
value, saveBootstrap, err := getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
c.saveBootstrap = saveBootstrap
if err != nil {
return false, err
}
if value == nil {
// No bootstrap keys found in the datastore - create an empty bootstrap key as a lock to
// ensure that no other node races us to populate it. If we fail to create the key, then
// some other node beat us to it and we should just wait for them to finish.
if err := storageClient.Create(ctx, tokenKey, []byte{}); err != nil {
if err.Error() == "key exists" {
logrus.Infof("Failed to create empty bootstrap key - waiting for data to be populated by another server")
return false, nil
}
return false, err
}
// successfully locked the bootstrap key, return now and generate CA certs and tokens
return true, nil
}

data, err := decrypt(normalizedToken, value.Data)
if err != nil {
return err
}
if len(value.Data) == 0 {
// Some other node has created an empty bootstrap key, wait for it to be populated.
logrus.Infof("Found empty bootstrap key - waiting for data to be populated by another server")
return false, nil
}

return c.ReconcileBootstrapData(ctx, bytes.NewReader(data), &c.config.Runtime.ControlRuntimeBootstrap, false)
data, err := decrypt(normalizedToken, value.Data)
if err != nil {
return false, err
}

return true, c.ReconcileBootstrapData(ctx, bytes.NewReader(data), &c.config.Runtime.ControlRuntimeBootstrap, false)
})
}

// getBootstrapKeyFromStorage will list all keys that has prefix /bootstrap and will check for key that is
// hashed with empty string and will check for any key that is hashed by different token than the one
// passed to it, it will return error if it finds a key that is hashed with different token and will return
// value if it finds the key hashed by passed token or empty string
// value if it finds the key hashed by passed token or empty string.
// Upon receiving a "not supported for learner" error from etcd, this function will retry until the context is cancelled.
func getBootstrapKeyFromStorage(ctx context.Context, storageClient client.Client, normalizedToken, oldToken string) (*client.Value, bool, error) {
emptyStringKey := storageKey("")
tokenKey := storageKey(normalizedToken)
bootstrapList, err := storageClient.List(ctx, "/bootstrap", 0)
if err != nil {

var bootstrapList []client.Value
var err error

if err := wait.PollImmediateUntilWithContext(ctx, time.Second, func(ctx context.Context) (bool, error) {
bootstrapList, err = storageClient.List(ctx, "/bootstrap", 0)
if err != nil {
if errors.Is(err, rpctypes.ErrGPRCNotSupportedForLearner) {
return false, nil
}
return false, err
}
return true, nil
}); err != nil {
return nil, false, err
}

if len(bootstrapList) == 0 {
return nil, true, nil
}
Expand Down Expand Up @@ -248,7 +296,7 @@ func doMigrateToken(ctx context.Context, storageClient client.Client, keyValue c
if err := storageClient.Create(ctx, newTokenKey, encryptedData); err != nil {
if err.Error() == "key exists" {
logrus.Warn("bootstrap key exists")
} else if strings.Contains(err.Error(), "not supported for learner") {
} else if errors.Is(err, rpctypes.ErrGPRCNotSupportedForLearner) {
logrus.Debug("skipping bootstrap data save on learner")
return nil
} else {
Expand Down
2 changes: 1 addition & 1 deletion pkg/etcd/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -951,7 +951,7 @@ func (e *ETCD) RemovePeer(ctx context.Context, name, address string, allowSelfRe
}
logrus.Infof("Removing name=%s id=%d address=%s from etcd", member.Name, member.ID, address)
_, err := e.client.MemberRemove(ctx, member.ID)
if err == rpctypes.ErrGRPCMemberNotFound {
if errors.Is(err, rpctypes.ErrGRPCMemberNotFound) {
return nil
}
return err
Expand Down

0 comments on commit ae574ca

Please sign in to comment.