From c6bc053a676ee8dceefeb072d0b17cc5f5df03c4 Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Mon, 31 Aug 2020 17:58:51 -0400 Subject: [PATCH] server: busy loop through resolver list during join process Deferred doing this in #52526. Probably a good idea to have it; it'll bring down the cluster convergence time (time taken for all nodes to find out about the initialization) by a bit. Release justification: low risk, high benefit changes to existing functionality Release note: None --- pkg/server/init.go | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/pkg/server/init.go b/pkg/server/init.go index d2932c8f595e..f915b81f7657 100644 --- a/pkg/server/init.go +++ b/pkg/server/init.go @@ -406,6 +406,39 @@ func (s *initServer) startJoinLoop(ctx context.Context, stopper *stop.Stopper) ( return nil, errJoinRPCUnsupported } + // Iterate through all the resolvers at least once to reduce time taken to + // cluster convergence. Keep this code block roughly in sync with the one + // below. + for _, res := range s.config.resolvers { + select { + case <-ctx.Done(): + return nil, context.Canceled + case <-stopper.ShouldQuiesce(): + return nil, stop.ErrUnavailable + default: + } + + addr := res.Addr() + state, err := s.attemptJoinTo(ctx, res.Addr()) + if err == nil { + return state, nil + } + + if errors.Is(err, errJoinRPCUnsupported) || errors.Is(err, ErrIncompatibleBinaryVersion) { + // Propagate upwards; these are error conditions the caller knows to + // expect. + return nil, err + } + + if IsWaitingForInit(err) { + log.Warningf(ctx, "%s is itself waiting for init, will retry", addr) + } else { + log.Warningf(ctx, "outgoing join rpc to %s unsuccessful: %v", addr, err.Error()) + } + + // Try the next node if unsuccessful. + } + const joinRPCBackoff = time.Second var tickChan <-chan time.Time {