Skip to content

Commit

Permalink
roachprod: retry GetInternalIP on error
Browse files Browse the repository at this point in the history
Epic: none
Fixes: #98285
Fixes: #98342
Release note: None
  • Loading branch information
Miral Gadani committed Mar 23, 2023
1 parent 4be5775 commit 9110629
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 9 deletions.
19 changes: 14 additions & 5 deletions pkg/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -1106,7 +1106,8 @@ tar cf - .ssh/id_rsa .ssh/id_rsa.pub .ssh/authorized_keys
ip, err = c.GetInternalIP(l, ctx, node)
if err != nil {
res.Err = errors.Wrapf(err, "pgurls")
return res, res.Err
// The failure is recorded in res.Err; returning a nil error here means the
// command will be retried instead of aborting immediately.
return res, nil
}
time.Sleep(time.Second)
}
Expand Down Expand Up @@ -1541,7 +1542,7 @@ func (c *SyncedCluster) createNodeCertArguments(

res.Stdout, res.Err = c.GetInternalIP(l, ctx, node)
ips[i] = res.Stdout
return res, errors.Wrapf(res.Err, "IPs")
return res, nil
}, DefaultSSHRetryOpts); err != nil {
return nil, err
}
Expand Down Expand Up @@ -2291,9 +2292,9 @@ func (c *SyncedCluster) pghosts(
res := &RunResultDetails{Node: node}
res.Stdout, res.Err = c.GetInternalIP(l, ctx, node)
ips[i] = res.Stdout
return res, errors.Wrapf(res.Err, "pghosts")
return res, nil
}, DefaultSSHRetryOpts); err != nil {
return nil, err
return nil, errors.Wrapf(err, "pghosts")
}

m := make(map[Node]string, len(ips))
Expand Down Expand Up @@ -2409,6 +2410,9 @@ type ParallelResult struct {
// cluster. If any of the commands fail, Parallel will log an error
// and exit the program.
//
// A user may also pass in a RunRetryOpts to control how the function is retried
// in the case of a failure.
//
// See ParallelE for more information.
func (c *SyncedCluster) Parallel(
l *logger.Logger,
Expand Down Expand Up @@ -2437,7 +2441,12 @@ func (c *SyncedCluster) Parallel(
// 0, then it defaults to `count`.
//
// The function returns a pointer to RunResultDetails as we may enrich
// the result with retry information (attempt number, wrapper error)
// the result with retry information (attempt number, wrapper error).
//
// RunRetryOpts controls the retry behavior when the function fails but
// returns a nil error, i.e. when the failure is reported only through the
// Err field of the returned RunResultDetails. A non-nil error returned by
// the function denotes a roachprod error and will not be retried,
// regardless of the retry options.
//
// If err is non-nil, the slice of ParallelResults will contain the
// results from any of the failed invocations.
Expand Down
6 changes: 2 additions & 4 deletions pkg/roachprod/roachprod.go
Original file line number Diff line number Diff line change
Expand Up @@ -469,13 +469,12 @@ func IP(
ips[i] = c.VMs[nodes[i]-1].PublicIP
}
} else {
var err error
if err := c.Parallel(l, "", len(nodes), 0, func(i int) (*install.RunResultDetails, error) {
node := nodes[i]
res := &install.RunResultDetails{Node: node}
res.Stdout, res.Err = c.GetInternalIP(l, ctx, node)
ips[i] = res.Stdout
return res, err
return res, nil
}, install.DefaultSSHRetryOpts); err != nil {
return nil, err
}
Expand Down Expand Up @@ -895,13 +894,12 @@ func PgURL(
ips[i] = c.VMs[nodes[i]-1].PublicIP
}
} else {
var err error
if err := c.Parallel(l, "", len(nodes), 0, func(i int) (*install.RunResultDetails, error) {
node := nodes[i]
res := &install.RunResultDetails{Node: node}
res.Stdout, res.Err = c.GetInternalIP(l, ctx, node)
ips[i] = res.Stdout
return res, err
return res, nil
}, install.DefaultSSHRetryOpts); err != nil {
return nil, err
}
Expand Down

0 comments on commit 9110629

Please sign in to comment.