diff --git a/controllers/drplacementcontrol.go b/controllers/drplacementcontrol.go index e0eac5807..7c4f63a80 100644 --- a/controllers/drplacementcontrol.go +++ b/controllers/drplacementcontrol.go @@ -2335,6 +2335,7 @@ func (d *DRPCInstance) setConditionOnInitialDeploymentCompletion() { func (d *DRPCInstance) setStatusInitiating() { if !(d.instance.Status.Phase == "" || + d.instance.Status.Phase == rmn.WaitForUser || d.instance.Status.Phase == rmn.Deployed || d.instance.Status.Phase == rmn.FailedOver || d.instance.Status.Phase == rmn.Relocated) { diff --git a/controllers/drplacementcontrol_controller.go b/controllers/drplacementcontrol_controller.go index ede531b83..3ee5eec22 100644 --- a/controllers/drplacementcontrol_controller.go +++ b/controllers/drplacementcontrol_controller.go @@ -856,6 +856,8 @@ func (r *DRPlacementControlReconciler) createDRPCInstance( placementObj client.Object, log logr.Logger, ) (*DRPCInstance, error) { + log.Info("Creating DRPC instance") + drClusters, err := getDRClusters(ctx, r.Client, drPolicy) if err != nil { return nil, err @@ -2168,8 +2170,10 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency( ) (bool, error) { requeue := true + log.Info("Ensure DRPC Status Consistency") + // This will always be false the first time the DRPC resource is first created OR after hub recovery - if drpc.Status.Phase != "" { + if drpc.Status.Phase != "" && drpc.Status.Phase != rmn.WaitForUser { return !requeue, nil } @@ -2178,7 +2182,10 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency( dstCluster = drpc.Spec.FailoverCluster } - progress, err := r.determineDRPCState(ctx, drpc, drPolicy, placementObj, dstCluster, log) + progress, msg, err := r.determineDRPCState(ctx, drpc, drPolicy, placementObj, dstCluster, log) + + log.Info(msg) + if err != nil { return requeue, err } @@ -2187,22 +2194,23 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency( case Continue: return !requeue, nil case AllowFailover: + drpc.Status.Phase = rmn.WaitForUser updateDRPCProgression(drpc, rmn.ProgressionActionPaused, log) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable, - drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, "Failover allowed") + drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, msg) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, "Failover allowed") return requeue, nil default: - msg := "Operation Paused - User Intervention Required." + msg := fmt.Sprintf("Operation Paused - User Intervention Required. %s", msg) - log.Info(fmt.Sprintf("err:%v - msg:%s", err, msg)) + log.Info(msg) updateDRPCProgression(drpc, rmn.ProgressionActionPaused, log) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable, drpc.Generation, metav1.ConditionFalse, rmn.ReasonPaused, msg) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation, - metav1.ConditionFalse, rmn.ReasonPaused, msg) + metav1.ConditionFalse, rmn.ReasonPaused, "User Intervention Required") return requeue, nil } @@ -2249,19 +2257,19 @@ func (r *DRPlacementControlReconciler) determineDRPCState( placementObj client.Object, dstCluster string, log logr.Logger, -) (Progress, error) { +) (Progress, string, error) { log.Info("Rebuild DRPC state") vrgNamespace, err := selectVRGNamespace(r.Client, log, drpc, placementObj) if err != nil { log.Info("Failed to select VRG namespace") - return Stop, err + return Stop, "", err } drClusters, err := getDRClusters(ctx, r.Client, drPolicy) if err != nil { - return Stop, err + return Stop, "", err } vrgs, successfullyQueriedClusterCount, failedCluster, err := getVRGsFromManagedClusters( @@ -2269,21 +2277,29 @@ func (r *DRPlacementControlReconciler) determineDRPCState( if err != nil { log.Info("Failed to get a list of VRGs") - return Stop, err + return Stop, "", err } // IF 2 clusters queried, and both queries failed, then STOP if successfullyQueriedClusterCount == 0 { - log.Info("Number of clusters queried is 0. Stop...") + msg := "Stop - Number of clusters queried is 0" - return Stop, nil + return Stop, msg, nil } // IF 2 clusters queried successfully and no VRGs, then continue with initial deployment if successfullyQueriedClusterCount == 2 && len(vrgs) == 0 { log.Info("Queried 2 clusters successfully") - return Continue, nil + return Continue, "", nil + } + + if drpc.Status.Phase == rmn.WaitForUser && + drpc.Spec.Action == rmn.ActionFailover && + drpc.Spec.FailoverCluster != failedCluster { + log.Info("Continue. The action is failover and the failoverCluster is accessible") + + return Continue, "", nil } // IF queried 2 clusters queried, 1 failed and 0 VRG found, then check s3 store. @@ -2297,21 +2313,22 @@ func (r *DRPlacementControlReconciler) determineDRPCState( if vrg == nil { // IF the failed cluster is not the dest cluster, then this could be an initial deploy if failedCluster != dstCluster { - return Continue, nil + return Continue, "", nil } - log.Info("Unable to query all clusters and failed to get VRG from s3 store") + msg := fmt.Sprintf("Unable to query all clusters and failed to get VRG from s3 store. Failed to query %s", + failedCluster) - return Stop, nil + return Stop, msg, nil } - log.Info("VRG From s3", "VRG Spec", vrg.Spec, "VRG Annotations", vrg.GetAnnotations()) + log.Info("Got VRG From s3", "VRG Spec", vrg.Spec, "VRG Annotations", vrg.GetAnnotations()) if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) { - log.Info(fmt.Sprintf("Two different actions - drpc action is '%s'/vrg action from s3 is '%s'", - drpc.Spec.Action, vrg.Spec.Action)) + msg := fmt.Sprintf("Failover is allowed - Two different actions - drpcAction is '%s' and vrgAction from s3 is '%s'", + drpc.Spec.Action, vrg.Spec.Action) - return AllowFailover, nil + return AllowFailover, msg, nil } if dstCluster == vrg.GetAnnotations()[DestinationClusterAnnotationKey] && @@ -2319,13 +2336,13 @@ func (r *DRPlacementControlReconciler) determineDRPCState( log.Info(fmt.Sprintf("VRG from s3. Same dstCluster %s/%s. Proceeding...", dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey])) - return Continue, nil + return Continue, "", nil } - log.Info(fmt.Sprintf("VRG from s3. DRPCAction/vrgAction/DRPCDstClstr/vrgDstClstr %s/%s/%s/%s. Allow Failover...", - drpc.Spec.Action, vrg.Spec.Action, dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey])) + msg := fmt.Sprintf("Failover is allowed - drpcAction:'%s'. vrgAction:'%s'. DRPCDstClstr:'%s'. vrgDstClstr:'%s'.", + drpc.Spec.Action, vrg.Spec.Action, dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey]) - return AllowFailover, nil + return AllowFailover, msg, nil } // IF 2 clusters queried, 1 failed and 1 VRG found on the failover cluster, then check the action, if they don't @@ -2341,25 +2358,26 @@ func (r *DRPlacementControlReconciler) determineDRPCState( break } - if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) { - log.Info(fmt.Sprintf("Stop! Two different actions - drpc action is '%s'/vrg action is '%s'", - drpc.Spec.Action, vrg.Spec.Action)) + if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) && + dstCluster == clusterName { + msg := fmt.Sprintf("Stop - Two different actions for the same cluster - drpcAction:'%s'. vrgAction:'%s'", + drpc.Spec.Action, vrg.Spec.Action) - return Stop, nil + return Stop, msg, nil } if dstCluster != clusterName && vrg.Spec.ReplicationState == rmn.Secondary { - log.Info(fmt.Sprintf("Same Action and dstCluster and ReplicationState %s/%s/%s", + log.Info(fmt.Sprintf("Failover is allowed. Action/dstCluster/ReplicationState %s/%s/%s", drpc.Spec.Action, dstCluster, vrg.Spec.ReplicationState)) - log.Info("Failover is allowed - Primary is assumed in the failed cluster") + msg := "Failover is allowed - Primary is assumed to be on the failed cluster" - return AllowFailover, nil + return AllowFailover, msg, nil } - log.Info("Allow to continue") + log.Info("Same action, dstCluster, and ReplicationState is primary. Continuing") - return Continue, nil + return Continue, "", nil } // Finally, IF 2 clusters queried successfully and 1 or more VRGs found, and if one of the VRGs is on the dstCluster, @@ -2379,25 +2397,26 @@ func (r *DRPlacementControlReconciler) determineDRPCState( // This can happen if a hub is recovered in the middle of a Relocate if vrg.Spec.ReplicationState == rmn.Secondary && len(vrgs) == 2 { - log.Info("Both VRGs are in secondary state") + msg := "Stop - Both VRGs have the same secondary state" - return Stop, nil + return Stop, msg, nil } if drpc.Spec.Action == rmn.DRAction(vrg.Spec.Action) && dstCluster == clusterName { - log.Info(fmt.Sprintf("Same Action %s", drpc.Spec.Action)) + log.Info(fmt.Sprintf("Same Action and dest cluster %s/%s", drpc.Spec.Action, dstCluster)) - return Continue, nil + return Continue, "", nil } - log.Info("Failover is allowed", "vrgs count", len(vrgs), "drpc action", - drpc.Spec.Action, "vrg action", vrg.Spec.Action, "dstCluster/clusterName", dstCluster+"/"+clusterName) + msg := fmt.Sprintf("Failover is allowed - VRGs count:'%d'. drpcAction:'%s'."+ + " vrgAction:'%s'. DstCluster:'%s'. vrgOnCluste '%s'", + len(vrgs), drpc.Spec.Action, vrg.Spec.Action, dstCluster, clusterName) - return AllowFailover, nil + return AllowFailover, msg, nil } // IF none of the above, then allow failover (set PeerReady), but stop until someone makes the change - log.Info("Failover is allowed, but user intervention is required") + msg := "Failover is allowed - User intervention is required" - return AllowFailover, nil + return AllowFailover, msg, nil } diff --git a/controllers/drplacementcontrol_controller_test.go b/controllers/drplacementcontrol_controller_test.go index 4f680f6db..605bf6ffd 100644 --- a/controllers/drplacementcontrol_controller_test.go +++ b/controllers/drplacementcontrol_controller_test.go @@ -1519,6 +1519,7 @@ func runFailoverAction(placementObj client.Object, fromCluster, toCluster string Expect(len(drpc.Status.Conditions)).To(Equal(2)) _, condition := getDRPCCondition(&drpc.Status, rmn.ConditionAvailable) Expect(condition.Reason).To(Equal(string(rmn.FailedOver))) + Expect(drpc.Status.ActionStartTime).ShouldNot(BeNil()) decision := getLatestUserPlacementDecision(placementObj.GetName(), placementObj.GetNamespace()) Expect(decision.ClusterName).To(Equal(toCluster)) @@ -2283,8 +2284,8 @@ var _ = Describe("DRPlacementControl Reconciler", func() { Specify("DRClusters", func() { populateDRClusters() }) - When("6 Applications deployed for the first time", func() { - It("Should deploy 6 drpcs", func() { + When("Application deployed for the first time", func() { + It("Should deploy drpc", func() { createNamespacesAsync(getNamespaceObj(DefaultDRPCNamespace)) createManagedClusters(asyncClusters) createDRClustersAsync() @@ -2309,7 +2310,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Secondary is back online --------- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-1 busybox-drpc 12h East1ManagedClus Deployed Completed True - When("DRAction is Initial deploy -- during hub recovery -> Secondary Down", func() { + When("HubRecovery: DRAction is Initial deploy -> Secondary Down", func() { It("Should reconstructs the DRPC state to completion. Primary is East1ManagedCluster", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) @@ -2333,32 +2334,31 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online --------- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-5 busybox-drpc 11h East1ManagedClus Deployed Completed 2023-12-20T12:52:20Z 5m32.467527356s True - When("DRAction is Initial deploy -- during hub recovery -> Primary Down", func() { - It("Should be able to reconstructs the DRPC state to completion. Primary East1ManagedCluster", func() { - setClusterDown(West1ManagedCluster) + When("HubRecovery: DRAction is Initial deploy -> Primary Down", func() { + It("Should pause and wait for user to trigger a failover. Primary East1ManagedCluster", func() { + setClusterDown(East1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() expectedAction := rmn.DRAction("") - expectedPhase := rmn.Deployed - exptectedPorgression := rmn.ProgressionCompleted + expectedPhase := rmn.WaitForUser + exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) - resetClusterDown() - exptectedCompleted := rmn.ProgressionCompleted - verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedCompleted) }) }) // Failover - When("DRAction is set to Failover - Hub Recovery", func() { + When("HubRecovery: DRAction is set to Failover -> primary cluster down", func() { It("Should failover to West1ManagedCluster", func() { from := East1ManagedCluster to := West1ManagedCluster + resetClusterDown() runFailoverAction(userPlacementRule1, from, to, false, false) - uploadVRGtoS3Store(DRPCCommonName, DefaultDRPCNamespace, West1ManagedCluster, - rmn.VRGAction(rmn.ActionFailover)) waitForDRPCPhaseAndProgression(DefaultDRPCNamespace, rmn.FailedOver) + uploadVRGtoS3Store(DRPCCommonName, DefaultDRPCNamespace, West1ManagedCluster, rmn.VRGActionFailover) + resetClusterDown() }) }) + //nolint:lll // -------- Before Hub Recovery Action FailedOver --- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY @@ -2369,14 +2369,14 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-2 busybox-drpc 11h East1ManagedClus West1ManagedClu Failover FailedOver Completed True - When("DRAction is Failover -- during hub recovery -> Primary Down", func() { + When("HubRecovery: DRAction is Failover -> Primary Down", func() { It("Should Pause, but allows failover. Primary West1ManagedCluster", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() setDRPCSpecExpectationTo(DefaultDRPCNamespace, East1ManagedCluster, West1ManagedCluster, "") expectedAction := rmn.DRAction("") - expectedPhase := rmn.DRState("") + expectedPhase := rmn.WaitForUser exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) checkConditionAllowFailover(DefaultDRPCNamespace) @@ -2393,7 +2393,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { }) // Relocate - When("DRAction is set to Relocate - Hub Recovery", func() { + When("HubRecovery: DRAction is set to Relocate", func() { It("Should relocate to Primary (East1ManagedCluster)", func() { // ----------------------------- RELOCATION TO PRIMARY -------------------------------------- from := West1ManagedCluster @@ -2412,7 +2412,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-sample busybox-drpc 16h East1ManagedClus West1ManagedClu Relocate Relocated Completed True - When("DRAction is Relocate -- during hub recovery -> Secondary Down", func() { + When("HubRecovery: DRAction is Relocate -> Secondary Down", func() { It("Should Continue given the primary East1ManagedCluster is up", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) @@ -2442,14 +2442,14 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-3 busybox-drpc 11h East1ManagedClus Relocate Relocated Completed True - When("DRAction is supposed to be Relocate -- during hub recovery -> Primary Down -> Action Cleared", func() { + When("HubRecovery: DRAction is supposed to be Relocate -> Primary Down -> Action Cleared", func() { It("Should Pause given the primary East1ManagedCluster is down, but allow failover", func() { setClusterDown(East1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() setDRPCSpecExpectationTo(DefaultDRPCNamespace, East1ManagedCluster, West1ManagedCluster, "") expectedAction := rmn.DRAction("") - expectedPhase := rmn.DRState("") + expectedPhase := rmn.WaitForUser exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) checkConditionAllowFailover(DefaultDRPCNamespace) @@ -2495,11 +2495,19 @@ var _ = Describe("DRPlacementControl Reconciler", func() { func verifyDRPCStateAndProgression(expectedAction rmn.DRAction, expectedPhase rmn.DRState, exptectedPorgression rmn.ProgressionStatus, ) { + var phase rmn.DRState + + var progression rmn.ProgressionStatus + Eventually(func() bool { drpc := getLatestDRPC(DefaultDRPCNamespace) + phase = drpc.Status.Phase + progression = drpc.Status.Progression - return drpc.Status.Phase == expectedPhase && drpc.Status.Progression == exptectedPorgression - }, timeout, time.Millisecond*1000).Should(BeTrue(), "Phase has not been updated yet!") + return phase == expectedPhase && progression == exptectedPorgression + }, timeout, time.Millisecond*1000).Should(BeTrue(), + fmt.Sprintf("Phase has not been updated yet! Phase:%s Expected:%s - progression:%s exptected:%s", + phase, expectedPhase, progression, exptectedPorgression)) drpc := getLatestDRPC(DefaultDRPCNamespace) Expect(drpc.Spec.Action).Should(Equal(expectedAction)) @@ -2525,8 +2533,7 @@ func checkConditionAllowFailover(namespace string) { return false }, timeout, interval).Should(BeTrue(), fmt.Sprintf("Condition '%+v'", availableCondition)) - Expect(drpc.Status.Phase).To(Equal(rmn.DRState(""))) - Expect(availableCondition.Message).Should(Equal("Failover allowed")) + Expect(drpc.Status.Phase).To(Equal(rmn.WaitForUser)) } func uploadVRGtoS3Store(name, namespace, dstCluster string, action rmn.VRGAction) { diff --git a/controllers/util/mcv_util.go b/controllers/util/mcv_util.go index f48ff5407..ece7a2a4d 100644 --- a/controllers/util/mcv_util.go +++ b/controllers/util/mcv_util.go @@ -62,7 +62,7 @@ type ManagedClusterViewGetterImpl struct { func (m ManagedClusterViewGetterImpl) GetVRGFromManagedCluster(resourceName, resourceNamespace, managedCluster string, annotations map[string]string, ) (*rmn.VolumeReplicationGroup, error) { - logger := ctrl.Log.WithName("MCV").WithValues("resourceName", resourceName) + logger := ctrl.Log.WithName("MCV").WithValues("resourceName", resourceName, "cluster", managedCluster) // get VRG and verify status through ManagedClusterView mcvMeta := metav1.ObjectMeta{ Name: BuildManagedClusterViewName(resourceName, resourceNamespace, "vrg"), @@ -228,7 +228,8 @@ func (m ManagedClusterViewGetterImpl) getManagedClusterResource( return errorswrapper.Wrap(err, "getManagedClusterResource failed") } - logger.Info(fmt.Sprintf("MCV Conditions: %v", mcv.Status.Conditions)) + logger.Info(fmt.Sprintf("Get managedClusterResource Returned the following MCV Conditions: %v", + mcv.Status.Conditions)) return m.GetResource(mcv, resource) }