diff --git a/controllers/solrcloud_controller.go b/controllers/solrcloud_controller.go index 77819afe..434d2e11 100644 --- a/controllers/solrcloud_controller.go +++ b/controllers/solrcloud_controller.go @@ -473,7 +473,7 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( case ScaleUpLock: operationComplete, nextClusterOperation, err = handleManagedCloudScaleUp(ctx, r, instance, statefulSet, clusterOp, podList, logger) case BalanceReplicasLock: - operationComplete, requestInProgress, err = util.BalanceReplicasForCluster(ctx, instance, statefulSet, clusterOp.Metadata, clusterOp.Metadata, logger) + operationComplete, requestInProgress, retryLaterDuration, err = util.BalanceReplicasForCluster(ctx, instance, statefulSet, clusterOp.Metadata, clusterOp.Metadata, logger) default: operationFound = false // This shouldn't happen, but we don't want to be stuck if it does. diff --git a/controllers/util/solr_scale_util.go b/controllers/util/solr_scale_util.go index a88531df..550bd158 100644 --- a/controllers/util/solr_scale_util.go +++ b/controllers/util/solr_scale_util.go @@ -23,6 +23,7 @@ import ( "github.com/apache/solr-operator/controllers/util/solr_api" "github.com/go-logr/logr" appsv1 "k8s.io/api/apps/v1" + "time" ) // BalanceReplicasForCluster takes a SolrCloud and balances all replicas across the Pods that are currently alive. @@ -31,7 +32,7 @@ import ( // a successful status returned from the command. So if we delete the asyncStatus, and then something happens in the operator, // and we lose our state, then we will need to retry the balanceReplicas command. This should be ok since calling // balanceReplicas multiple times should not be bad when the replicas for the cluster are already balanced. -func BalanceReplicasForCluster(ctx context.Context, solrCloud *solr.SolrCloud, statefulSet *appsv1.StatefulSet, balanceReason string, balanceCmdUniqueId string, logger logr.Logger) (balanceComplete bool, requestInProgress bool, err error) { +func BalanceReplicasForCluster(ctx context.Context, solrCloud *solr.SolrCloud, statefulSet *appsv1.StatefulSet, balanceReason string, balanceCmdUniqueId string, logger logr.Logger) (balanceComplete bool, requestInProgress bool, retryLaterDuration time.Duration, err error) { logger = logger.WithValues("balanceReason", balanceReason) // If the Cloud has 1 or zero pods, there is no reason to balance replicas. if statefulSet.Spec.Replicas == nil || *statefulSet.Spec.Replicas < 1 { @@ -96,5 +97,8 @@ func BalanceReplicasForCluster(ctx context.Context, solrCloud *solr.SolrCloud, s } } } + if requestInProgress && !balanceComplete { + retryLaterDuration = time.Second * 5 + } return }