Skip to content

Commit

Permalink
fix(restore): Do not retry restore proposal (#8058)
Browse files Browse the repository at this point in the history
Do not retry the restore proposal. It can cause issues in
the edge case scenarios. Consider the following scenario:
  1. alpha-2 gets the restore request (leader is alpha-0)
  2. alpha-2 sends the request to alpha-0 (leader).
  3. alpha-0 called proposeAndWait which proposed the req
      (index 24) at time=15:56:10
  4. alpha-0 was still waiting for the proposal to be applied
      and RPC call for `Restore` by alpha-2 got "transport
      closing error" at time=15:59:08
  5. transport closing is a retriable error, so alpha-2 again
      tried to proposeoOrSend, this time leader was alpha-1,
      so it sent it to alpha-1
  6. alpha-1 proposed the restore request (index 28) at time=15:59:09
  • Loading branch information
ahsanbarkati authored and mangalaman93 committed Oct 17, 2023
1 parent 283fe63 commit 91ac920
Showing 1 changed file with 1 addition and 37 deletions.
38 changes: 1 addition & 37 deletions worker/online_restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"path/filepath"
"strings"
"sync"
"time"

"github.com/golang/glog"
"github.com/golang/protobuf/proto"
Expand Down Expand Up @@ -149,7 +148,7 @@ func ProcessRestoreRequest(ctx context.Context, req *pb.RestoreRequest, wg *sync
reqCopy.GroupId = gid
wg.Add(1)
go func() {
errCh <- tryRestoreProposal(ctx, reqCopy)
errCh <- proposeRestoreOrSend(ctx, reqCopy)
}()
}

Expand Down Expand Up @@ -182,41 +181,6 @@ func proposeRestoreOrSend(ctx context.Context, req *pb.RestoreRequest) error {
return err
}

func retriableRestoreError(err error) bool {
switch {
case err == conn.ErrNoConnection:
// Try to recover from temporary connection issues.
return true
case strings.Contains(err.Error(), "Raft isn't initialized yet"):
// Try to recover if raft has not been initialized.
return true
case strings.Contains(err.Error(), errRestoreProposal):
// Do not try to recover from other errors when sending the proposal.
return false
default:
// Try to recover from other errors (e.g wrong group, waiting for timestamp, etc).
return true
}
}

func tryRestoreProposal(ctx context.Context, req *pb.RestoreRequest) error {
var err error
for i := 0; i < 10; i++ {
err = proposeRestoreOrSend(ctx, req)
if err == nil {
glog.Info("proposeRestoreOrSend done.")
return nil
}
glog.Errorf("Got error while proposeRestoreOrSend: %v, retrying!", err)
if retriableRestoreError(err) {
time.Sleep(time.Second)
continue
}
return err
}
return err
}

// Restore implements the Worker interface.
func (w *grpcWorker) Restore(ctx context.Context, req *pb.RestoreRequest) (*pb.Status, error) {
var emptyRes pb.Status
Expand Down

0 comments on commit 91ac920

Please sign in to comment.