From 56b5d37c27993805890bcf927eac667072a80c6a Mon Sep 17 00:00:00 2001 From: Jan Safranek Date: Thu, 7 Feb 2019 15:28:56 +0100 Subject: [PATCH 1/2] Remove internal retry loop We should use exponential backoff from the provisioner library. It will react more quickly when a PVC is deleted and it won't occupy a worker thread. --- pkg/controller/controller.go | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 3896aaaafe..f822eb3dcb 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -40,14 +40,11 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation" - "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "google.golang.org/grpc" - "google.golang.org/grpc/codes" "google.golang.org/grpc/connectivity" - "google.golang.org/grpc/status" "github.com/container-storage-interface/spec/lib/go/csi" csiclientset "k8s.io/csi-api/pkg/client/clientset/versioned" @@ -566,26 +563,9 @@ func (p *csiProvisioner) Provision(options controller.VolumeOptions) (*v1.Persis return nil, fmt.Errorf("failed to strip CSI Parameters of prefixed keys: %v", err) } - opts := wait.Backoff{Duration: backoffDuration, Factor: backoffFactor, Steps: backoffSteps} - err = wait.ExponentialBackoff(opts, func() (bool, error) { - ctx, cancel := context.WithTimeout(context.Background(), p.timeout) - defer cancel() - rep, err = p.csiClient.CreateVolume(ctx, &req) - if err == nil { - // CreateVolume has finished successfully - return true, nil - } - - if status, ok := status.FromError(err); ok { - if status.Code() == codes.DeadlineExceeded { - // CreateVolume timed out, give it another chance to complete - klog.Warningf("CreateVolume timeout: %s has expired, operation will be retried", p.timeout.String()) - return false, nil - } - } - // CreateVolume failed , no reason to retry, bailing from ExponentialBackoff - return false, err - }) + ctx, cancel := context.WithTimeout(context.Background(), p.timeout) + defer cancel() + rep, err = p.csiClient.CreateVolume(ctx, &req) if err != nil { return nil, err From c489baccaf2d51f49168c10c6aa12bd087982bc6 Mon Sep 17 00:00:00 2001 From: Jan Safranek Date: Thu, 7 Feb 2019 16:05:42 +0100 Subject: [PATCH 2/2] Add parameters for provisioning/deletion backoff. --- cmd/csi-provisioner/csi-provisioner.go | 30 +++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/cmd/csi-provisioner/csi-provisioner.go b/cmd/csi-provisioner/csi-provisioner.go index 9998885c3e..a66b8479e2 100644 --- a/cmd/csi-provisioner/csi-provisioner.go +++ b/cmd/csi-provisioner/csi-provisioner.go @@ -39,22 +39,28 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/util/workqueue" utilfeature "k8s.io/apiserver/pkg/util/feature" utilflag "k8s.io/apiserver/pkg/util/flag" ) var ( - master = flag.String("master", "", "Master URL to build a client config from. Either this or kubeconfig needs to be set if the provisioner is being run out of cluster.") - kubeconfig = flag.String("kubeconfig", "", "Absolute path to the kubeconfig file. Either this or master needs to be set if the provisioner is being run out of cluster.") - csiEndpoint = flag.String("csi-address", "/run/csi/socket", "The gRPC endpoint for Target CSI Volume.") - connectionTimeout = flag.Duration("connection-timeout", 10*time.Second, "Timeout for waiting for CSI driver socket.") - volumeNamePrefix = flag.String("volume-name-prefix", "pvc", "Prefix to apply to the name of a created volume.") - volumeNameUUIDLength = flag.Int("volume-name-uuid-length", -1, "Truncates generated UUID of a created volume to this length. Defaults behavior is to NOT truncate.") - showVersion = flag.Bool("version", false, "Show version.") - enableLeaderElection = flag.Bool("enable-leader-election", false, "Enables leader election. If leader election is enabled, additional RBAC rules are required. Please refer to the Kubernetes CSI documentation for instructions on setting up these RBAC rules.") - featureGates map[string]bool - + master = flag.String("master", "", "Master URL to build a client config from. Either this or kubeconfig needs to be set if the provisioner is being run out of cluster.") + kubeconfig = flag.String("kubeconfig", "", "Absolute path to the kubeconfig file. Either this or master needs to be set if the provisioner is being run out of cluster.") + csiEndpoint = flag.String("csi-address", "/run/csi/socket", "The gRPC endpoint for Target CSI Volume.") + connectionTimeout = flag.Duration("connection-timeout", 10*time.Second, "Timeout for waiting for CSI driver socket.") + volumeNamePrefix = flag.String("volume-name-prefix", "pvc", "Prefix to apply to the name of a created volume.") + volumeNameUUIDLength = flag.Int("volume-name-uuid-length", -1, "Truncates generated UUID of a created volume to this length. Defaults behavior is to NOT truncate.") + showVersion = flag.Bool("version", false, "Show version.") + enableLeaderElection = flag.Bool("enable-leader-election", false, "Enables leader election. If leader election is enabled, additional RBAC rules are required. Please refer to the Kubernetes CSI documentation for instructions on setting up these RBAC rules.") + provisioningRetryCount = flag.Uint("provisioning-retry-count", 0, "Number of retries of failed provisioning. 0 = retry indefinitely.") + deletionRetryCount = flag.Uint("deletion-retry-count", 0, "Number of retries of failed volume deletion. 0 = retry indefinitely.") + retryIntervalStart = flag.Duration("retry-interval-start", time.Second, "Initial retry interval of failed provisioning or deletion. It doubles with each failure, up to retry-interval-max.") + retryIntervalMax = flag.Duration("retry-interval-max", 5*time.Minute, "Maximum retry interval of failed provisioning or deletion.") + workerThreads = flag.Uint("worker-threads", 100, "Number of provisioner worker threads, in other words nr. of simultaneous CSI calls.") + + featureGates map[string]bool provisionController *controller.ProvisionController version = "unknown" ) @@ -153,6 +159,10 @@ func init() { csiProvisioner, serverVersion.GitVersion, controller.LeaderElection(*enableLeaderElection), + controller.FailedProvisionThreshold(int(*provisioningRetryCount)), + controller.FailedDeleteThreshold(int(*deletionRetryCount)), + controller.RateLimiter(workqueue.NewItemExponentialFailureRateLimiter(*retryIntervalStart, *retryIntervalMax)), + controller.Threadiness(int(*workerThreads)), ) }