From fc6dcb43f9989ce114284e4649eade018f4e8827 Mon Sep 17 00:00:00 2001
From: Raghavendra Talur
Date: Tue, 23 Jan 2024 11:16:01 -0500
Subject: [PATCH] Fix Failover Confusion in DRPC Action Post Hub Recovery

If the DRPC Action is set to Failover, for instance transitioning from
C1 to C2, a subsequent failover request from C2 back to C1 confuses
Ramen post hub recovery: the action itself did not change, only the
destination cluster did. The fix is to permit the failover from C2 to
C1 as long as C1 is accessible.

Signed-off-by: Benamar Mekhissi
---
 controllers/drplacementcontrol.go             |   1 +
 controllers/drplacementcontrol_controller.go  | 103 +++++++++++-------
 .../drplacementcontrol_controller_test.go     |  55 ++++++----
 controllers/util/mcv_util.go                  |   5 +-
 4 files changed, 96 insertions(+), 68 deletions(-)

diff --git a/controllers/drplacementcontrol.go b/controllers/drplacementcontrol.go
index e0eac5807..7c4f63a80 100644
--- a/controllers/drplacementcontrol.go
+++ b/controllers/drplacementcontrol.go
@@ -2335,6 +2335,7 @@ func (d *DRPCInstance) setConditionOnInitialDeploymentCompletion() {
 
 func (d *DRPCInstance) setStatusInitiating() {
 	if !(d.instance.Status.Phase == "" ||
+		d.instance.Status.Phase == rmn.WaitForUser ||
 		d.instance.Status.Phase == rmn.Deployed ||
 		d.instance.Status.Phase == rmn.FailedOver ||
 		d.instance.Status.Phase == rmn.Relocated) {
diff --git a/controllers/drplacementcontrol_controller.go b/controllers/drplacementcontrol_controller.go
index ede531b83..3ee5eec22 100644
--- a/controllers/drplacementcontrol_controller.go
+++ b/controllers/drplacementcontrol_controller.go
@@ -856,6 +856,8 @@ func (r *DRPlacementControlReconciler) createDRPCInstance(
 	placementObj client.Object,
 	log logr.Logger,
 ) (*DRPCInstance, error) {
+	log.Info("Creating DRPC instance")
+
 	drClusters, err := getDRClusters(ctx, r.Client, drPolicy)
 	if err != nil {
 		return nil, err
@@ -2168,8 +2170,10 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency(
 ) (bool, error) {
 	requeue := true
 
+	log.Info("Ensure DRPC Status Consistency")
+
 	// This will always be false the first time the DRPC resource is first created OR after hub recovery
-	if drpc.Status.Phase != "" {
+	if drpc.Status.Phase != "" && drpc.Status.Phase != rmn.WaitForUser {
 		return !requeue, nil
 	}
 
@@ -2178,7 +2182,10 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency(
 		dstCluster = drpc.Spec.FailoverCluster
 	}
 
-	progress, err := r.determineDRPCState(ctx, drpc, drPolicy, placementObj, dstCluster, log)
+	progress, msg, err := r.determineDRPCState(ctx, drpc, drPolicy, placementObj, dstCluster, log)
+
+	log.Info(msg)
+
 	if err != nil {
 		return requeue, err
 	}
@@ -2187,22 +2194,23 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency(
 	case Continue:
 		return !requeue, nil
 	case AllowFailover:
+		drpc.Status.Phase = rmn.WaitForUser
 		updateDRPCProgression(drpc, rmn.ProgressionActionPaused, log)
 		addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable,
-			drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, "Failover allowed")
+			drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, msg)
 		addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation,
 			metav1.ConditionTrue, rmn.ReasonSuccess, "Failover allowed")
 
 		return requeue, nil
 	default:
-		msg := "Operation Paused - User Intervention Required."
+		msg := fmt.Sprintf("Operation Paused - User Intervention Required. %s", msg)
%s", msg) - log.Info(fmt.Sprintf("err:%v - msg:%s", err, msg)) + log.Info(msg) updateDRPCProgression(drpc, rmn.ProgressionActionPaused, log) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable, drpc.Generation, metav1.ConditionFalse, rmn.ReasonPaused, msg) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation, - metav1.ConditionFalse, rmn.ReasonPaused, msg) + metav1.ConditionFalse, rmn.ReasonPaused, "User Intervention Required") return requeue, nil } @@ -2249,19 +2257,19 @@ func (r *DRPlacementControlReconciler) determineDRPCState( placementObj client.Object, dstCluster string, log logr.Logger, -) (Progress, error) { +) (Progress, string, error) { log.Info("Rebuild DRPC state") vrgNamespace, err := selectVRGNamespace(r.Client, log, drpc, placementObj) if err != nil { log.Info("Failed to select VRG namespace") - return Stop, err + return Stop, "", err } drClusters, err := getDRClusters(ctx, r.Client, drPolicy) if err != nil { - return Stop, err + return Stop, "", err } vrgs, successfullyQueriedClusterCount, failedCluster, err := getVRGsFromManagedClusters( @@ -2269,21 +2277,29 @@ func (r *DRPlacementControlReconciler) determineDRPCState( if err != nil { log.Info("Failed to get a list of VRGs") - return Stop, err + return Stop, "", err } // IF 2 clusters queried, and both queries failed, then STOP if successfullyQueriedClusterCount == 0 { - log.Info("Number of clusters queried is 0. Stop...") + msg := "Stop - Number of clusters queried is 0" - return Stop, nil + return Stop, msg, nil } // IF 2 clusters queried successfully and no VRGs, then continue with initial deployment if successfullyQueriedClusterCount == 2 && len(vrgs) == 0 { log.Info("Queried 2 clusters successfully") - return Continue, nil + return Continue, "", nil + } + + if drpc.Status.Phase == rmn.WaitForUser && + drpc.Spec.Action == rmn.ActionFailover && + drpc.Spec.FailoverCluster != failedCluster { + log.Info("Continue. The action is failover and the failoverCluster is accessible") + + return Continue, "", nil } // IF queried 2 clusters queried, 1 failed and 0 VRG found, then check s3 store. @@ -2297,21 +2313,22 @@ func (r *DRPlacementControlReconciler) determineDRPCState( if vrg == nil { // IF the failed cluster is not the dest cluster, then this could be an initial deploy if failedCluster != dstCluster { - return Continue, nil + return Continue, "", nil } - log.Info("Unable to query all clusters and failed to get VRG from s3 store") + msg := fmt.Sprintf("Unable to query all clusters and failed to get VRG from s3 store. Failed to query %s", + failedCluster) - return Stop, nil + return Stop, msg, nil } - log.Info("VRG From s3", "VRG Spec", vrg.Spec, "VRG Annotations", vrg.GetAnnotations()) + log.Info("Got VRG From s3", "VRG Spec", vrg.Spec, "VRG Annotations", vrg.GetAnnotations()) if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) { - log.Info(fmt.Sprintf("Two different actions - drpc action is '%s'/vrg action from s3 is '%s'", - drpc.Spec.Action, vrg.Spec.Action)) + msg := fmt.Sprintf("Failover is allowed - Two different actions - drpcAction is '%s' and vrgAction from s3 is '%s'", + drpc.Spec.Action, vrg.Spec.Action) - return AllowFailover, nil + return AllowFailover, msg, nil } if dstCluster == vrg.GetAnnotations()[DestinationClusterAnnotationKey] && @@ -2319,13 +2336,13 @@ func (r *DRPlacementControlReconciler) determineDRPCState( log.Info(fmt.Sprintf("VRG from s3. Same dstCluster %s/%s. 
Proceeding...", dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey])) - return Continue, nil + return Continue, "", nil } - log.Info(fmt.Sprintf("VRG from s3. DRPCAction/vrgAction/DRPCDstClstr/vrgDstClstr %s/%s/%s/%s. Allow Failover...", - drpc.Spec.Action, vrg.Spec.Action, dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey])) + msg := fmt.Sprintf("Failover is allowed - drpcAction:'%s'. vrgAction:'%s'. DRPCDstClstr:'%s'. vrgDstClstr:'%s'.", + drpc.Spec.Action, vrg.Spec.Action, dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey]) - return AllowFailover, nil + return AllowFailover, msg, nil } // IF 2 clusters queried, 1 failed and 1 VRG found on the failover cluster, then check the action, if they don't @@ -2341,25 +2358,26 @@ func (r *DRPlacementControlReconciler) determineDRPCState( break } - if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) { - log.Info(fmt.Sprintf("Stop! Two different actions - drpc action is '%s'/vrg action is '%s'", - drpc.Spec.Action, vrg.Spec.Action)) + if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) && + dstCluster == clusterName { + msg := fmt.Sprintf("Stop - Two different actions for the same cluster - drpcAction:'%s'. vrgAction:'%s'", + drpc.Spec.Action, vrg.Spec.Action) - return Stop, nil + return Stop, msg, nil } if dstCluster != clusterName && vrg.Spec.ReplicationState == rmn.Secondary { - log.Info(fmt.Sprintf("Same Action and dstCluster and ReplicationState %s/%s/%s", + log.Info(fmt.Sprintf("Failover is allowed. Action/dstCluster/ReplicationState %s/%s/%s", drpc.Spec.Action, dstCluster, vrg.Spec.ReplicationState)) - log.Info("Failover is allowed - Primary is assumed in the failed cluster") + msg := "Failover is allowed - Primary is assumed to be on the failed cluster" - return AllowFailover, nil + return AllowFailover, msg, nil } - log.Info("Allow to continue") + log.Info("Same action, dstCluster, and ReplicationState is primary. Continuing") - return Continue, nil + return Continue, "", nil } // Finally, IF 2 clusters queried successfully and 1 or more VRGs found, and if one of the VRGs is on the dstCluster, @@ -2379,25 +2397,26 @@ func (r *DRPlacementControlReconciler) determineDRPCState( // This can happen if a hub is recovered in the middle of a Relocate if vrg.Spec.ReplicationState == rmn.Secondary && len(vrgs) == 2 { - log.Info("Both VRGs are in secondary state") + msg := "Stop - Both VRGs have the same secondary state" - return Stop, nil + return Stop, msg, nil } if drpc.Spec.Action == rmn.DRAction(vrg.Spec.Action) && dstCluster == clusterName { - log.Info(fmt.Sprintf("Same Action %s", drpc.Spec.Action)) + log.Info(fmt.Sprintf("Same Action and dest cluster %s/%s", drpc.Spec.Action, dstCluster)) - return Continue, nil + return Continue, "", nil } - log.Info("Failover is allowed", "vrgs count", len(vrgs), "drpc action", - drpc.Spec.Action, "vrg action", vrg.Spec.Action, "dstCluster/clusterName", dstCluster+"/"+clusterName) + msg := fmt.Sprintf("Failover is allowed - VRGs count:'%d'. drpcAction:'%s'."+ + " vrgAction:'%s'. DstCluster:'%s'. 
vrgOnCluste '%s'", + len(vrgs), drpc.Spec.Action, vrg.Spec.Action, dstCluster, clusterName) - return AllowFailover, nil + return AllowFailover, msg, nil } // IF none of the above, then allow failover (set PeerReady), but stop until someone makes the change - log.Info("Failover is allowed, but user intervention is required") + msg := "Failover is allowed - User intervention is required" - return AllowFailover, nil + return AllowFailover, msg, nil } diff --git a/controllers/drplacementcontrol_controller_test.go b/controllers/drplacementcontrol_controller_test.go index 4f680f6db..605bf6ffd 100644 --- a/controllers/drplacementcontrol_controller_test.go +++ b/controllers/drplacementcontrol_controller_test.go @@ -1519,6 +1519,7 @@ func runFailoverAction(placementObj client.Object, fromCluster, toCluster string Expect(len(drpc.Status.Conditions)).To(Equal(2)) _, condition := getDRPCCondition(&drpc.Status, rmn.ConditionAvailable) Expect(condition.Reason).To(Equal(string(rmn.FailedOver))) + Expect(drpc.Status.ActionStartTime).ShouldNot(BeNil()) decision := getLatestUserPlacementDecision(placementObj.GetName(), placementObj.GetNamespace()) Expect(decision.ClusterName).To(Equal(toCluster)) @@ -2283,8 +2284,8 @@ var _ = Describe("DRPlacementControl Reconciler", func() { Specify("DRClusters", func() { populateDRClusters() }) - When("6 Applications deployed for the first time", func() { - It("Should deploy 6 drpcs", func() { + When("Application deployed for the first time", func() { + It("Should deploy drpc", func() { createNamespacesAsync(getNamespaceObj(DefaultDRPCNamespace)) createManagedClusters(asyncClusters) createDRClustersAsync() @@ -2309,7 +2310,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Secondary is back online --------- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-1 busybox-drpc 12h East1ManagedClus Deployed Completed True - When("DRAction is Initial deploy -- during hub recovery -> Secondary Down", func() { + When("HubRecovery: DRAction is Initial deploy -> Secondary Down", func() { It("Should reconstructs the DRPC state to completion. Primary is East1ManagedCluster", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) @@ -2333,32 +2334,31 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online --------- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-5 busybox-drpc 11h East1ManagedClus Deployed Completed 2023-12-20T12:52:20Z 5m32.467527356s True - When("DRAction is Initial deploy -- during hub recovery -> Primary Down", func() { - It("Should be able to reconstructs the DRPC state to completion. Primary East1ManagedCluster", func() { - setClusterDown(West1ManagedCluster) + When("HubRecovery: DRAction is Initial deploy -> Primary Down", func() { + It("Should pause and wait for user to trigger a failover. 
Primary East1ManagedCluster", func() { + setClusterDown(East1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() expectedAction := rmn.DRAction("") - expectedPhase := rmn.Deployed - exptectedPorgression := rmn.ProgressionCompleted + expectedPhase := rmn.WaitForUser + exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) - resetClusterDown() - exptectedCompleted := rmn.ProgressionCompleted - verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedCompleted) }) }) // Failover - When("DRAction is set to Failover - Hub Recovery", func() { + When("HubRecovery: DRAction is set to Failover -> primary cluster down", func() { It("Should failover to West1ManagedCluster", func() { from := East1ManagedCluster to := West1ManagedCluster + resetClusterDown() runFailoverAction(userPlacementRule1, from, to, false, false) - uploadVRGtoS3Store(DRPCCommonName, DefaultDRPCNamespace, West1ManagedCluster, - rmn.VRGAction(rmn.ActionFailover)) waitForDRPCPhaseAndProgression(DefaultDRPCNamespace, rmn.FailedOver) + uploadVRGtoS3Store(DRPCCommonName, DefaultDRPCNamespace, West1ManagedCluster, rmn.VRGActionFailover) + resetClusterDown() }) }) + //nolint:lll // -------- Before Hub Recovery Action FailedOver --- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY @@ -2369,14 +2369,14 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-2 busybox-drpc 11h East1ManagedClus West1ManagedClu Failover FailedOver Completed True - When("DRAction is Failover -- during hub recovery -> Primary Down", func() { + When("HubRecovery: DRAction is Failover -> Primary Down", func() { It("Should Pause, but allows failover. 
Primary West1ManagedCluster", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() setDRPCSpecExpectationTo(DefaultDRPCNamespace, East1ManagedCluster, West1ManagedCluster, "") expectedAction := rmn.DRAction("") - expectedPhase := rmn.DRState("") + expectedPhase := rmn.WaitForUser exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) checkConditionAllowFailover(DefaultDRPCNamespace) @@ -2393,7 +2393,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { }) // Relocate - When("DRAction is set to Relocate - Hub Recovery", func() { + When("HubRecovery: DRAction is set to Relocate", func() { It("Should relocate to Primary (East1ManagedCluster)", func() { // ----------------------------- RELOCATION TO PRIMARY -------------------------------------- from := West1ManagedCluster @@ -2412,7 +2412,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-sample busybox-drpc 16h East1ManagedClus West1ManagedClu Relocate Relocated Completed True - When("DRAction is Relocate -- during hub recovery -> Secondary Down", func() { + When("HubRecovery: DRAction is Relocate -> Secondary Down", func() { It("Should Continue given the primary East1ManagedCluster is up", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) @@ -2442,14 +2442,14 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-3 busybox-drpc 11h East1ManagedClus Relocate Relocated Completed True - When("DRAction is supposed to be Relocate -- during hub recovery -> Primary Down -> Action Cleared", func() { + When("HubRecovery: DRAction is supposed to be Relocate -> Primary Down -> Action Cleared", func() { It("Should Pause given the primary East1ManagedCluster is down, but allow failover", func() { setClusterDown(East1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() setDRPCSpecExpectationTo(DefaultDRPCNamespace, East1ManagedCluster, West1ManagedCluster, "") expectedAction := rmn.DRAction("") - expectedPhase := rmn.DRState("") + expectedPhase := rmn.WaitForUser exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) checkConditionAllowFailover(DefaultDRPCNamespace) @@ -2495,11 +2495,19 @@ var _ = Describe("DRPlacementControl Reconciler", func() { func verifyDRPCStateAndProgression(expectedAction rmn.DRAction, expectedPhase rmn.DRState, exptectedPorgression rmn.ProgressionStatus, ) { + var phase rmn.DRState + + var progression rmn.ProgressionStatus + Eventually(func() bool { drpc := getLatestDRPC(DefaultDRPCNamespace) + phase = drpc.Status.Phase + progression = drpc.Status.Progression - return drpc.Status.Phase == expectedPhase && drpc.Status.Progression == exptectedPorgression - }, timeout, time.Millisecond*1000).Should(BeTrue(), "Phase has not been updated yet!") + return phase == expectedPhase && progression == exptectedPorgression + }, timeout, 
+		fmt.Sprintf("Phase has not been updated yet! Phase:%s Expected:%s - progression:%s expected:%s",
+			phase, expectedPhase, progression, exptectedPorgression))
 
 	drpc := getLatestDRPC(DefaultDRPCNamespace)
 	Expect(drpc.Spec.Action).Should(Equal(expectedAction))
@@ -2525,8 +2533,7 @@ func checkConditionAllowFailover(namespace string) {
 		return false
 	}, timeout, interval).Should(BeTrue(), fmt.Sprintf("Condition '%+v'", availableCondition))
 
-	Expect(drpc.Status.Phase).To(Equal(rmn.DRState("")))
-	Expect(availableCondition.Message).Should(Equal("Failover allowed"))
+	Expect(drpc.Status.Phase).To(Equal(rmn.WaitForUser))
 }
 
 func uploadVRGtoS3Store(name, namespace, dstCluster string, action rmn.VRGAction) {
diff --git a/controllers/util/mcv_util.go b/controllers/util/mcv_util.go
index f48ff5407..ece7a2a4d 100644
--- a/controllers/util/mcv_util.go
+++ b/controllers/util/mcv_util.go
@@ -62,7 +62,7 @@ type ManagedClusterViewGetterImpl struct {
 func (m ManagedClusterViewGetterImpl) GetVRGFromManagedCluster(resourceName,
 	resourceNamespace, managedCluster string, annotations map[string]string,
 ) (*rmn.VolumeReplicationGroup, error) {
-	logger := ctrl.Log.WithName("MCV").WithValues("resourceName", resourceName)
+	logger := ctrl.Log.WithName("MCV").WithValues("resourceName", resourceName, "cluster", managedCluster)
 	// get VRG and verify status through ManagedClusterView
 	mcvMeta := metav1.ObjectMeta{
 		Name: BuildManagedClusterViewName(resourceName, resourceNamespace, "vrg"),
@@ -228,7 +228,8 @@ func (m ManagedClusterViewGetterImpl) getManagedClusterResource(
 		return errorswrapper.Wrap(err, "getManagedClusterResource failed")
 	}
 
-	logger.Info(fmt.Sprintf("MCV Conditions: %v", mcv.Status.Conditions))
+	logger.Info(fmt.Sprintf("getManagedClusterResource returned the following MCV Conditions: %v",
+		mcv.Status.Conditions))
 
 	return m.GetResource(mcv, resource)
 }
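
Reviewer aid (not part of the applied patch): a minimal, self-contained Go
sketch of the hub-recovery failover gate this change introduces. The type,
constant, and function names below (Progress, drpcView, gateFailover) are
simplified, hypothetical stand-ins for the rmn API; only the gating rule
itself mirrors the new check added to determineDRPCState.

package main

import "fmt"

// Progress mirrors the tri-state result that determineDRPCState returns.
type Progress int

const (
	Stop Progress = iota
	Continue
	AllowFailover
)

// Simplified stand-ins for the rmn phase and action types.
type (
	DRState  string
	DRAction string
)

const (
	WaitForUser    DRState  = "WaitForUser"
	ActionFailover DRAction = "Failover"
)

// drpcView is a hypothetical, minimal view of the DRPC fields the gate reads.
type drpcView struct {
	Phase           DRState
	Action          DRAction
	FailoverCluster string
}

// gateFailover sketches the new rule: when the DRPC was parked in WaitForUser
// after hub recovery, a Failover may proceed as long as the requested
// destination is not the cluster that could not be queried.
func gateFailover(d drpcView, failedCluster string) Progress {
	if d.Phase == WaitForUser &&
		d.Action == ActionFailover &&
		d.FailoverCluster != failedCluster {
		return Continue
	}

	// Otherwise stay paused; the user must intervene (PeerReady stays set).
	return AllowFailover
}

func main() {
	// C2 went down after a C1 -> C2 failover; the user now fails back to C1.
	d := drpcView{Phase: WaitForUser, Action: ActionFailover, FailoverCluster: "C1"}
	fmt.Println(gateFailover(d, "C2") == Continue) // true: C1 is reachable
}

With C2 unreachable and the failover re-targeted to C1, the gate returns
Continue rather than pausing again, which is exactly the C2-to-C1 scenario
described in the commit message.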