diff --git a/api/v1alpha1/volumereplicationgroup_types.go b/api/v1alpha1/volumereplicationgroup_types.go
index 868c2bd20..61a53bed7 100644
--- a/api/v1alpha1/volumereplicationgroup_types.go
+++ b/api/v1alpha1/volumereplicationgroup_types.go
@@ -35,6 +35,13 @@ const (
     UnknownState State = "Unknown"
 )
 
+type WorkloadStatus string
+
+const (
+    WorkloadStatusUnknown  WorkloadStatus = "Unknown"
+    WorkloadStatusInactive WorkloadStatus = "Inactive"
+)
+
 // VRGAsyncSpec has the parameters associated with RegionalDR
 type VRGAsyncSpec struct {
     // Label selector to identify the VolumeReplicationClass resources
@@ -336,8 +343,9 @@ type VolumeReplicationGroupStatus struct {
     //+optional
     KubeObjectProtection KubeObjectProtectionStatus `json:"kubeObjectProtection,omitempty"`
 
-    PrepareForFinalSyncComplete bool `json:"prepareForFinalSyncComplete,omitempty"`
-    FinalSyncComplete           bool `json:"finalSyncComplete,omitempty"`
+    PrepareForFinalSyncComplete bool   `json:"prepareForFinalSyncComplete,omitempty"`
+    FinalSyncComplete           bool   `json:"finalSyncComplete,omitempty"`
+    WorkloadStatus              string `json:"workloadStatus,omitempty"`
 
     // lastGroupSyncTime is the time of the most recent successful synchronization of all PVCs
     //+optional
diff --git a/config/crd/bases/ramendr.openshift.io_protectedvolumereplicationgrouplists.yaml b/config/crd/bases/ramendr.openshift.io_protectedvolumereplicationgrouplists.yaml
index e092eabad..b9a4a1dd6 100644
--- a/config/crd/bases/ramendr.openshift.io_protectedvolumereplicationgrouplists.yaml
+++ b/config/crd/bases/ramendr.openshift.io_protectedvolumereplicationgrouplists.yaml
@@ -1088,6 +1088,8 @@ spec:
                     description: State captures the latest state of the replication
                       operation
                     type: string
+                  workloadStatus:
+                    type: string
                 type: object
             type: object
           type: array
diff --git a/config/crd/bases/ramendr.openshift.io_volumereplicationgroups.yaml b/config/crd/bases/ramendr.openshift.io_volumereplicationgroups.yaml
index 897d2e33f..696014ebf 100644
--- a/config/crd/bases/ramendr.openshift.io_volumereplicationgroups.yaml
+++ b/config/crd/bases/ramendr.openshift.io_volumereplicationgroups.yaml
@@ -1022,6 +1022,8 @@ spec:
             state:
               description: State captures the latest state of the replication operation
               type: string
+            workloadStatus:
+              type: string
           type: object
         type: object
     served: true
diff --git a/internal/controller/drplacementcontrol.go b/internal/controller/drplacementcontrol.go
index 5c9c57715..0ea9a2164 100644
--- a/internal/controller/drplacementcontrol.go
+++ b/internal/controller/drplacementcontrol.go
@@ -373,7 +373,7 @@ func (d *DRPCInstance) RunFailover() (bool, error) {
             return !done, nil
         }
 
-        return d.ensureActionCompleted(failoverCluster)
+        return d.ensureFailoverActionCompleted(failoverCluster)
     } else if yes, err := d.mwExistsAndPlacementUpdated(failoverCluster); yes || err != nil {
         // We have to wait for the VRG to appear on the failoverCluster or
         // in case of an error, try again later
@@ -863,7 +863,7 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
         addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation,
             metav1.ConditionTrue, string(d.instance.Status.Phase), "Completed")
 
-        return d.ensureActionCompleted(preferredCluster)
+        return d.ensureRelocateActionCompleted(preferredCluster)
     }
 
     d.setStatusInitiating()
@@ -896,6 +896,35 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
     return d.relocate(preferredCluster, preferredClusterNamespace, rmn.Relocating)
 }
 
+func (d *DRPCInstance) ensureRelocateActionCompleted(srcCluster string) (bool, error) {
+    d.setProgression(rmn.ProgressionCleaningUp)
+
+    return d.ensureActionCompleted(srcCluster)
+}
+
+func (d *DRPCInstance) ensureFailoverActionCompleted(srcCluster string) (bool, error) {
+    done := true
+
+    // This is the time to clean up the workload from the preferredCluster.
+    // For managed apps, ACM does this automatically when we update the
+    // placement to the targetCluster. For discovered apps, we have to let
+    // the user know that they need to clean up the apps themselves, so set
+    // the progression to wait on the user to clean up. Otherwise, set the
+    // progression to cleaning up.
+    if d.instance.Spec.ProtectedNamespaces != nil &&
+        len(*d.instance.Spec.ProtectedNamespaces) > 0 {
+        d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
+    } else {
+        d.setProgression(rmn.ProgressionCleaningUp)
+    }
+
+    if !d.ensureWorkloadInactive(srcCluster) {
+        return !done, nil
+    }
+
+    return d.ensureActionCompleted(srcCluster)
+}
+
 func (d *DRPCInstance) ensureActionCompleted(srcCluster string) (bool, error) {
     const done = true
 
@@ -909,8 +938,6 @@ func (d *DRPCInstance) ensureActionCompleted(srcCluster string) (bool, error) {
         return !done, err
     }
 
-    d.setProgression(rmn.ProgressionCleaningUp)
-
     // Cleanup and setup VolSync if enabled
     err = d.ensureCleanupAndVolSyncReplicationSetup(srcCluster)
     if err != nil {
@@ -974,8 +1001,19 @@ func (d *DRPCInstance) quiesceAndRunFinalSync(homeCluster string) (bool, error)
     addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation,
         d.getConditionStatusForTypeAvailable(), string(d.instance.Status.Phase), "Starting quiescing for relocation")
 
-    // clear current user PlacementRule's decision
-    d.setProgression(rmn.ProgressionClearingPlacement)
+    // We are about to clear the placement, which is when ACM starts
+    // deleting the workloads from the current cluster. For discovered
+    // apps, we have to let the user know that they need to clean up the
+    // apps from the current cluster, so set the progression to wait on
+    // the user to clean up. For managed apps, set the progression to
+    // clearing placement.
+    if d.instance.Spec.ProtectedNamespaces != nil &&
+        len(*d.instance.Spec.ProtectedNamespaces) > 0 {
+        d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
+    } else {
+        // clear current user PlacementRule's decision
+        d.setProgression(rmn.ProgressionClearingPlacement)
+    }
 
     err := d.clearUserPlacementRuleStatus()
     if err != nil {
@@ -990,6 +1028,11 @@ func (d *DRPCInstance) quiesceAndRunFinalSync(homeCluster string) (bool, error)
     }
 
     if !result {
+        if !d.ensureWorkloadInactive(homeCluster) {
+            d.log.Info("Final sync is waiting for workload to be inactive", "cluster", homeCluster)
+            return !done, nil
+        }
+
         d.setProgression(rmn.ProgressionRunningFinalSync)
 
         return !done, nil
@@ -2048,10 +2091,6 @@ func (d *DRPCInstance) ensureVRGManifestWorkOnClusterDeleted(clusterName string)
     d.log.Info("Request not complete yet", "cluster", clusterName)
 
-    if d.instance.Spec.ProtectedNamespaces != nil && len(*d.instance.Spec.ProtectedNamespaces) > 0 {
-        d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
-    }
-
     // IF we get here, either the VRG has not transitioned to secondary (yet) or delete didn't succeed. In either cases,
     // we need to make sure that the VRG object is deleted. IOW, we still have to wait
 
     return !done, nil
@@ -2067,10 +2106,6 @@ func (d *DRPCInstance) ensureVRGIsSecondaryEverywhere(clusterToSkip string) bool
             continue
         }
 
-        if d.instance.Spec.ProtectedNamespaces != nil && len(*d.instance.Spec.ProtectedNamespaces) > 0 {
-            d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
-        }
-
         if !d.ensureVRGIsSecondaryOnCluster(clusterName) {
             d.log.Info("Still waiting for VRG to transition to secondary", "cluster", clusterName)
 
@@ -2081,6 +2116,37 @@ func (d *DRPCInstance) ensureVRGIsSecondaryEverywhere(clusterToSkip string) bool
     return true
 }
 
+// ensureWorkloadInactive returns true when the workload is not active on the cluster
+func (d *DRPCInstance) ensureWorkloadInactive(clusterName string) bool {
+    d.log.Info(fmt.Sprintf("Ensure workload is inactive on %s", clusterName))
+
+    d.mcvRequestInProgress = false
+
+    annotations := make(map[string]string)
+
+    annotations[DRPCNameAnnotation] = d.instance.Name
+    annotations[DRPCNamespaceAnnotation] = d.instance.Namespace
+
+    vrg, err := d.reconciler.MCVGetter.GetVRGFromManagedCluster(d.instance.Name,
+        d.vrgNamespace, clusterName, annotations)
+    if err != nil {
+        d.log.Info("Failed to get VRG", "errorValue", err)
+
+        d.mcvRequestInProgress = true
+
+        return false
+    }
+
+    if vrg.Status.WorkloadStatus != string(rmn.WorkloadStatusInactive) || vrg.Status.ObservedGeneration != vrg.Generation {
+        d.log.Info(fmt.Sprintf("VRG on %s has active workload.", clusterName))
+
+        return false
+    }
+
+    return true
+}
+
 // ensureVRGIsSecondaryOnCluster returns true when VRG is secondary or it does not exists on the cluster
 func (d *DRPCInstance) ensureVRGIsSecondaryOnCluster(clusterName string) bool {
     d.log.Info(fmt.Sprintf("Ensure VRG %s is secondary on cluster %s", d.instance.Name, clusterName))
diff --git a/internal/controller/volsync/vshandler.go b/internal/controller/volsync/vshandler.go
index 6457c9355..09f1ccb96 100644
--- a/internal/controller/volsync/vshandler.go
+++ b/internal/controller/volsync/vshandler.go
@@ -364,6 +364,9 @@ func (v *VSHandler) validatePVCBeforeRS(rsSpec ramendrv1alpha1.VolSyncReplicatio
         return false, nil
     }
 
+    // RTALUR TODO
+
+
     return true, nil // Good to proceed - PVC is not in use, not mounted to node (or does not exist-should not happen)
 }
 
@@ -1634,6 +1637,8 @@ func (v *VSHandler) IsRDDataProtected(pvcName, pvcNamespace string) (bool, error
 func (v *VSHandler) PrecreateDestPVCIfEnabled(rdSpec ramendrv1alpha1.VolSyncReplicationDestinationSpec,
 ) (*string, error) {
     if !v.IsCopyMethodDirect() {
+        // TODO:
+        // We need to check the workload status even in other cases.
         v.log.Info("Using default copyMethod of Snapshot")
 
         return nil, nil // use default copyMethod
@@ -1659,6 +1664,7 @@ func (v *VSHandler) PrecreateDestPVCIfEnabled(rdSpec ramendrv1alpha1.VolSyncRepl
             util.ProtectedPVCNamespacedName(rdSpec.ProtectedPVC))
     }
+
     v.log.Info(fmt.Sprintf("Using App PVC %s for syncing directly to it",
         util.ProtectedPVCNamespacedName(rdSpec.ProtectedPVC)))
 
     // Using the application PVC for syncing from source to destination and save a snapshot
diff --git a/internal/controller/vrg_volsync.go b/internal/controller/vrg_volsync.go
index d2059262f..d78c07283 100644
--- a/internal/controller/vrg_volsync.go
+++ b/internal/controller/vrg_volsync.go
@@ -225,6 +225,7 @@ func (v *VRGInstance) reconcileVolSyncAsSecondary() bool {
     // If we are secondary, and RDSpec is not set, then we don't want to have any PVC
     // flagged as a VolSync PVC.
     if v.instance.Spec.VolSync.RDSpec == nil {
+        // RTALUR TODO : return
         idx := 0
 
         for _, protectedPVC := range v.instance.Status.ProtectedPVCs {
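
Note on the consumer/producer split introduced above: the DRPC (hub) side only reads the new field — ensureWorkloadInactive fetches the VRG through the MCV getter and lets failover cleanup or the final sync proceed only once status.workloadStatus reads "Inactive" with a matching observedGeneration. The VRG-side code that sets the field is not part of this section. The sketch below is hypothetical: the updateWorkloadStatus method and the pvcsInUse helper are assumed names, not code from this diff.

// updateWorkloadStatus is a minimal sketch of a VRG-side producer for
// status.workloadStatus. pvcsInUse is an assumed helper that reports whether
// any protected PVC is still mounted by a running pod on this cluster.
func (v *VRGInstance) updateWorkloadStatus() {
    if v.pvcsInUse() {
        // While the workload still holds protected PVCs, report Unknown so the
        // hub keeps waiting (or keeps prompting the user, for discovered apps).
        v.instance.Status.WorkloadStatus = string(ramendrv1alpha1.WorkloadStatusUnknown)

        return
    }

    v.instance.Status.WorkloadStatus = string(ramendrv1alpha1.WorkloadStatusInactive)
}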
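
The discovered-apps check (Spec.ProtectedNamespaces set and non-empty) is now inlined in both ensureFailoverActionCompleted and quiesceAndRunFinalSync. A helper equivalent to that repeated condition would keep the two call sites consistent; the name isDiscoveredApp is an assumption and does not exist in this diff.

// isDiscoveredApp reports whether this DRPC protects discovered (non-ACM-managed)
// applications, i.e. it carries an explicit, non-empty list of protected
// namespaces. Hypothetical helper; the diff above repeats this condition inline.
func (d *DRPCInstance) isDiscoveredApp() bool {
    return d.instance.Spec.ProtectedNamespaces != nil &&
        len(*d.instance.Spec.ProtectedNamespaces) > 0
}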