Skip to content

Commit

Permalink
wait for etcd to be ready before continuing with etcd api call (gardener#628)
Browse files Browse the repository at this point in the history

* wait for etcd to be ready before continuing with memberControl object creation

* address review comments
  • Loading branch information
aaronfern authored and abdasgupta committed May 25, 2023
1 parent 91df6e3 commit bca2c9e
Showing 1 changed file with 44 additions and 16 deletions.
60 changes: 44 additions & 16 deletions pkg/server/backuprestoreserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,48 @@ func (b *BackupRestoreServer) startHTTPServer(initializer initializer.Initialize
return handler
}

// waitUntilEtcdRunning blocks until the etcd cluster answers a status probe,
// re-checking every 4 seconds. The first probe happens immediately, before
// any ticker wait. It returns the context's error if ctx is cancelled while
// etcd is still unreachable, and nil once etcd responds.
func waitUntilEtcdRunning(ctx context.Context, etcdConnectionConfig *brtypes.EtcdConnectionConfig, logger *logrus.Logger) error {
	const (
		pollInterval = 4 * time.Second // spacing between probes
		probeTimeout = 2 * time.Second // per-probe status call budget
	)

	logger.Info("Checking if etcd is running")
	ticker := time.NewTicker(pollInterval)
	defer ticker.Stop()

	for {
		if isEtcdRunning(ctx, probeTimeout, etcdConnectionConfig, logger) {
			logger.Info("Etcd is now running. Continuing br startup")
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			// fall through to the next probe
		}
	}
}

// isEtcdRunning probes the first configured etcd endpoint with a Status call
// bounded by timeout and reports whether the call succeeded. Every failure
// mode (missing endpoints, client creation failure, or a failed Status call)
// is logged and treated as "etcd not running".
func isEtcdRunning(ctx context.Context, timeout time.Duration, etcdConnectionConfig *brtypes.EtcdConnectionConfig, logger *logrus.Logger) bool {
	// Validate the config before acquiring any resources: the original code
	// created (and deferred Close on) a maintenance client even when the
	// endpoint list was empty and the probe could never be issued.
	if len(etcdConnectionConfig.Endpoints) == 0 {
		logger.Errorf("etcd endpoints are not passed correctly")
		return false
	}
	endpoint := etcdConnectionConfig.Endpoints[0]

	factory := etcdutil.NewFactory(*etcdConnectionConfig)
	client, err := factory.NewMaintenance()
	if err != nil {
		logger.Errorf("failed to create etcd maintenance client: %v", err)
		return false
	}
	defer client.Close()

	// Bound the status call so a hung endpoint cannot stall the caller's
	// polling loop beyond the supplied timeout.
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	if _, err = client.Status(ctx, endpoint); err != nil {
		logger.Errorf("failed to get status of etcd endPoint: %v with error: %v", endpoint, err)
		return false
	}
	return true
}

// runServer runs the etcd-backup-restore server according to snapstore provider configuration.
func (b *BackupRestoreServer) runServer(ctx context.Context, restoreOpts *brtypes.RestoreOptions) error {
var (
Expand All @@ -173,23 +215,9 @@ func (b *BackupRestoreServer) runServer(ctx context.Context, restoreOpts *brtype
defer handler.Stop()

metrics.CurrentClusterSize.With(prometheus.Labels{}).Set(float64(restoreOpts.OriginalClusterSize))
// Promotes member if it is a learner

if restoreOpts.OriginalClusterSize > 1 {
for {
select {
case <-ctx.Done():
b.logger.Info("Context cancelled. Stopping retry promoting member")
return ctx.Err()
default:
}
m := member.NewMemberControl(b.config.EtcdConnectionConfig)
err := m.PromoteMember(ctx)
if err == nil {
break
}
_ = miscellaneous.SleepWithContext(ctx, retryTimeout)
}
if err := waitUntilEtcdRunning(ctx, b.config.EtcdConnectionConfig, b.logger.Logger); err != nil {
return err
}

m := member.NewMemberControl(b.config.EtcdConnectionConfig)
Expand Down

0 comments on commit bca2c9e

Please sign in to comment.