From d864e340bfe050144252db8b0de9c66a3a40fa20 Mon Sep 17 00:00:00 2001
From: Leandro Motta Barros
Date: Wed, 3 May 2023 09:27:45 -0300
Subject: [PATCH] Further improve resilience of image pulls

Previously, we'd time out a pull operation after unsuccessfully retrying
for about 17 minutes (for details see commit
582487f832c59c2f734a780ab0492833f29002c9). This worked fine in most
situations, but we have seen at least one case of a particularly
unreliable network connection that could not finish a large pull under
this policy.

This commit:

* Completely removes timeouts from image pulls. We'll keep retrying
  forever, or until users cancel the pull. (From the perspective of the
  REST API, "canceling" means closing the HTTP connection. This is what
  happens when a user presses Ctrl+C during a pull in the CLI, or, say,
  when we kill a curl process that was using the REST API to pull an
  image.)

* Still uses exponential back-off, but now limits the interval between
  retries to 5 minutes. The rationale is that some very unreliable
  networks may be up only for relatively small time windows. Therefore,
  using intervals that are too long would increase the risk of missing
  these windows.

* Tries to avoid flooding the log stream with messages about retries.
  We'll log every retry attempt up to the 10th. After that, we'll log
  retries only about once every 2h. The retry count is reset every time
  we successfully download any amount of data.

Signed-off-by: Leandro Motta Barros
Change-type: patch
---
 distribution/pull_v2.go | 53 +++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/distribution/pull_v2.go b/distribution/pull_v2.go
index 41e8f2fab2..7d483d8079 100644
--- a/distribution/pull_v2.go
+++ b/distribution/pull_v2.go
@@ -5,7 +5,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
-	"math"
 	"net/url"
 	"os"
 	"runtime"
@@ -45,8 +44,6 @@ var (
 	errRootFSInvalid = errors.New("invalid rootfs in image configuration")
 )
 
-const maxDownloadAttempts = 10
-
 // ImageConfigPullError is an error pulling the image config blob
 // (only applies to schema2).
 type ImageConfigPullError struct {
@@ -155,7 +152,7 @@ type v2LayerDescriptor struct {
 	src             distribution.Descriptor
 	ctx             context.Context
 	layerDownload   io.ReadCloser
-	downloadRetries uint8 // Number of retries since last successful download attempt
+	downloadRetries int // Number of retries since last successful download attempt
 	downloadOffset  int64
 	deltaBase       io.ReadSeeker
 	progressOutput  progress.Output
@@ -197,18 +194,19 @@ func (ld *v2LayerDescriptor) reset() error {
 }
 
 func (ld *v2LayerDescriptor) Read(p []byte) (int, error) {
-	if ld.downloadRetries > maxDownloadAttempts {
-		logrus.Warnf("giving up layer download after %v retries", maxDownloadAttempts)
-		return 0, fmt.Errorf("no request retries left")
-	}
-
 	if ld.downloadRetries > 0 {
-		sleepDurationInSecs := int(math.Pow(2, float64(ld.downloadRetries-1)))
-		logrus.Infof("waiting %vs before retrying layer download", sleepDurationInSecs)
+		sleepDurationInSecs := 5 * 60 // max sleep duration
+		if ld.downloadRetries <= 8 {
+			sleepDurationInSecs = 1 << ld.downloadRetries
+		}
+
+		logDownloadRetry(ld.downloadRetries, "waiting %vs before retrying layer download", sleepDurationInSecs)
+
 		for sleepDurationInSecs > 0 {
 			if ld.ctx.Err() == context.Canceled {
-				// Stop the pull immediately on context cancellation (caused
-				// e.g. by Ctrl+C).
+				// Stop the pull immediately on context cancellation, caused
+				// when the HTTP connection of the REST API is closed (e.g. by
+				// Ctrl+C in the CLI or killing curl in a direct API call).
 				logrus.Info("context canceled during wait, interrupting layer download")
 				return 0, context.Canceled
 			}
@@ -221,9 +219,9 @@ func (ld *v2LayerDescriptor) Read(p []byte) (int, error) {
 
 	if ld.layerDownload == nil {
 		if err := ld.reset(); err != nil {
-			logrus.Infof("failed to reset layer download: %v", err)
 			ld.downloadRetries += 1
-			// Don't report this as an error, as we might still want to retry
+			logDownloadRetry(ld.downloadRetries, "failed to reset layer download: %v", err)
+			// Don't report this as an error, because we want to keep retrying.
 			return 0, nil
 		}
 	}
@@ -232,9 +230,6 @@ func (ld *v2LayerDescriptor) Read(p []byte) (int, error) {
 	ld.downloadOffset += int64(n)
 	switch err {
 	case nil:
-		// We want to give up only after a long period failing to download
-		// anything at all. So, we reset the retries counter after every bit
-		// successfully downloaded.
 		if ld.downloadRetries > 0 {
 			logrus.Infof("download resumed after %v retries", ld.downloadRetries)
 			ld.downloadRetries = 0
@@ -1078,3 +1073,25 @@ func TargetImageConfig(imgConfig *container.Config) ([]byte, bool) {
 	}
 	return []byte{}, false
 }
+
+// logDownloadRetry logs a message related to retrying a download. Initially,
+// it logs every time it is called, but after a certain number of retries it
+// enters a regime of reduced logging frequency to avoid flooding the log stream
+// and filling up the often-limited log buffer of IoT/edge devices.
+func logDownloadRetry(retries int, format string, args ...interface{}) {
+	// We'll log every retry attempt up to this many retries; after that we
+	// enter the reduced logging regime.
+	const logEveryRetryLimit = 10
+
+	// When in the reduced logging regime, we'll log only once every this many retries.
+	const reducedLogInterval = 25
+
+	switch {
+	case retries < logEveryRetryLimit:
+		logrus.Infof(format, args...)
+	case retries == logEveryRetryLimit:
+		logrus.Infof(format+" (and reducing log frequency from now on)", args...)
+	case (retries-logEveryRetryLimit)%reducedLogInterval == 0:
+		logrus.Infof(format+" (logging with reduced frequency)", args...)
+	}
+}
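
For reference, the "about every 2h" figure in the commit message can be sanity-checked with a small standalone sketch, not part of the patch: once retries are in the reduced-logging regime, each retry sleeps the capped 5 minutes, so one log line per 25 retries works out to roughly 125 minutes between messages (attempt time on top of the sleep only spreads them further). The helpers sleepSecs and shouldLog below are illustrative re-implementations of the back-off and logDownloadRetry logic above, not names from pull_v2.go.

    // Standalone sketch: approximates the retry cadence of the patched Read().
    // sleepSecs and shouldLog mirror the capped back-off and the reduced-frequency
    // logging added in this commit; they are illustrative only.
    package main

    import "fmt"

    func sleepSecs(retries int) int {
    	if retries <= 8 {
    		return 1 << retries // 2s, 4s, ... up to 256s
    	}
    	return 5 * 60 // capped at 5 minutes
    }

    func shouldLog(retries int) bool {
    	const logEveryRetryLimit = 10
    	const reducedLogInterval = 25
    	return retries <= logEveryRetryLimit ||
    		(retries-logEveryRetryLimit)%reducedLogInterval == 0
    }

    func main() {
    	elapsed := 0 // seconds spent sleeping; real attempts add more time on top
    	for r := 1; r <= 100; r++ {
    		elapsed += sleepSecs(r)
    		if shouldLog(r) {
    			fmt.Printf("retry %3d: slept %3ds, ~%d min since first retry\n",
    				r, sleepSecs(r), elapsed/60)
    		}
    	}
    }

Running this prints every retry through the 10th, then retries 35, 60, and 85, each roughly two hours apart, matching the cadence described in the commit message.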