Further improve resilience of image pulls

Previously, we'd time out a pull operation after unsuccessfully retrying
for about 17 minutes (for details see commit
582487f). This worked fine in most
situations, but we have seen at least one case of a particularly
unreliable network connection that would not be able to finish a large
pull when operating under this policy.

This commit:

* Completely removes timeouts from image pulls. We'll keep retrying
  forever, or until users cancel the pull. (From the perspective of the
  REST API, "canceling" means closing the HTTP connection. This is what
  happens when a user presses Ctrl+C during a pull in the CLI, or when,
  say, we kill a curl process that was using the REST API to pull an
  image.)
* Still uses exponential back-off, but now caps the interval between
  retries at 5 minutes (a sketch of the resulting schedule follows this
  list). The rationale is that some very unreliable networks may be up
  only for relatively short time windows, so intervals that are too long
  would increase the risk of missing these windows.
* Tries to avoid flooding the log stream with messages about retries.
  We'll log every retry attempt up to the 10th. After that, we'll log
  retries only once about every 2h. This retry count is reset every time
  we successfully download any amount of data.

Signed-off-by: Leandro Motta Barros <[email protected]>
Change-type: patch
lmbarros committed May 3, 2023
1 parent efb7ea6 commit d864e34
Showing 1 changed file with 35 additions and 18 deletions: distribution/pull_v2.go
@@ -5,7 +5,6 @@ import (
     "encoding/json"
     "fmt"
     "io"
-    "math"
     "net/url"
     "os"
     "runtime"
@@ -45,8 +44,6 @@ var (
     errRootFSInvalid = errors.New("invalid rootfs in image configuration")
 )
 
-const maxDownloadAttempts = 10
-
 // ImageConfigPullError is an error pulling the image config blob
 // (only applies to schema2).
 type ImageConfigPullError struct {
@@ -155,7 +152,7 @@ type v2LayerDescriptor struct {
     src             distribution.Descriptor
     ctx             context.Context
     layerDownload   io.ReadCloser
-    downloadRetries uint8 // Number of retries since last successful download attempt
+    downloadRetries int // Number of retries since last successful download attempt
     downloadOffset  int64
     deltaBase       io.ReadSeeker
     progressOutput  progress.Output
@@ -197,18 +194,19 @@ func (ld *v2LayerDescriptor) reset() error {
 }
 
 func (ld *v2LayerDescriptor) Read(p []byte) (int, error) {
-    if ld.downloadRetries > maxDownloadAttempts {
-        logrus.Warnf("giving up layer download after %v retries", maxDownloadAttempts)
-        return 0, fmt.Errorf("no request retries left")
-    }
-
     if ld.downloadRetries > 0 {
-        sleepDurationInSecs := int(math.Pow(2, float64(ld.downloadRetries-1)))
-        logrus.Infof("waiting %vs before retrying layer download", sleepDurationInSecs)
+        sleepDurationInSecs := 5 * 60 // max sleep duration
+        if ld.downloadRetries <= 8 {
+            sleepDurationInSecs = 1 << ld.downloadRetries
+        }
+
+        logDownloadRetry(ld.downloadRetries, "waiting %vs before retrying layer download", sleepDurationInSecs)
+
         for sleepDurationInSecs > 0 {
             if ld.ctx.Err() == context.Canceled {
-                // Stop the pull immediately on context cancellation (caused
-                // e.g. by Ctrl+C).
+                // Stop the pull immediately on context cancelation, caused
+                // when the HTTP connection of the REST API is closed (e.g. by
+                // Ctrl+C in the CLI or killing curl in a direct API call).
                 logrus.Info("context canceled during wait, interrupting layer download")
                 return 0, context.Canceled
             }
@@ -221,9 +219,9 @@ func (ld *v2LayerDescriptor) Read(p []byte) (int, error) {
 
     if ld.layerDownload == nil {
         if err := ld.reset(); err != nil {
-            logrus.Infof("failed to reset layer download: %v", err)
             ld.downloadRetries += 1
-            // Don't report this as an error, as we might still want to retry
+            logDownloadRetry(ld.downloadRetries, "failed to reset layer download: %v", err)
+            // Don't report this as an error, because we want to keep retrying.
             return 0, nil
         }
     }
Expand All @@ -232,9 +230,6 @@ func (ld *v2LayerDescriptor) Read(p []byte) (int, error) {
ld.downloadOffset += int64(n)
switch err {
case nil:
// We want to give up only after a long period failing to download
// anything at all. So, we reset the retries counter after every bit
// successfully downloaded.
if ld.downloadRetries > 0 {
logrus.Infof("download resumed after %v retries", ld.downloadRetries)
ld.downloadRetries = 0
@@ -1078,3 +1073,25 @@ func TargetImageConfig(imgConfig *container.Config) ([]byte, bool) {
     }
     return []byte{}, false
 }
+
+// logDownloadRetry logs a message related with retrying a download. Initially,
+// it logs every time it is called, but after a certain number of retries it
+// enters a regime of reduced logging frequency to avoid flooding the log stream
+// and filling up the often-limited log buffer of IoT/edge devices.
+func logDownloadRetry(retries int, format string, args ...interface{}) {
+    // We'll log every retry attempt up to this many retries; after that we
+    // enter in the reduced logging regime.
+    const logEveryRetryLimit = 10
+
+    // When in the reduced log regime, we'll log only every this many retries.
+    const reducedLogInterval = 25
+
+    switch {
+    case retries < logEveryRetryLimit:
+        logrus.Infof(format, args...)
+    case retries == logEveryRetryLimit:
+        logrus.Infof(format+" (and reducing log frequency from now on)", args...)
+    case (retries-logEveryRetryLimit)%reducedLogInterval == 0:
+        logrus.Infof(format+" (logging with reduced frequency)", args...)
+    }
+}
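To make the reduced-logging cadence concrete, here is another standalone
sketch, not part of the commit: shouldLog is a hypothetical helper that
reproduces the conditions used by logDownloadRetry above and prints the
retry counts that actually produce a log line.

package main

import "fmt"

// shouldLog reproduces the decision made by logDownloadRetry: log every
// attempt up to the 10th retry, then only once every 25 retries.
func shouldLog(retries int) bool {
	const logEveryRetryLimit = 10
	const reducedLogInterval = 25
	return retries <= logEveryRetryLimit ||
		(retries-logEveryRetryLimit)%reducedLogInterval == 0
}

func main() {
	// Prints 0..10, then 35, 60, 85: only these retry counts are logged.
	// At the 5-minute back-off cap, 25 retries take about 125 minutes,
	// which is where the "once about every 2h" figure in the commit
	// message comes from.
	for retries := 0; retries <= 100; retries++ {
		if shouldLog(retries) {
			fmt.Println(retries)
		}
	}
}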
