diff --git a/peerpod-ctrl/controllers/peerpod_controller.go b/peerpod-ctrl/controllers/peerpod_controller.go index 27ecf0740..6c217c69c 100644 --- a/peerpod-ctrl/controllers/peerpod_controller.go +++ b/peerpod-ctrl/controllers/peerpod_controller.go @@ -82,6 +82,25 @@ func (r *PeerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } if pp.ObjectMeta.GetDeletionTimestamp() == nil { // TODO: consider filter events + // Create or Update events without DeletionTimestamp + // Check if existing old PeerPod owned by current Pod, + // and delete it to clean the dangling VM. + // It may caused by CAA update, unplanned down or crashes + ppList := confidentialcontainersorgv1alpha1.PeerPodList{} + if err := r.List(ctx, &ppList); err != nil { + logger.Info("Failed to get PeerPod list", "error", err) + return ctrl.Result{}, err + } + + for _, item := range ppList.Items { + if isOldPeerPod(item, pp) { + logger.Info("Found old PeerPod object owned by the same Pod", "old PeerPod", item) + if err := r.Delete(ctx, &item); err != nil { + logger.Info("Failed to delete old PeerPod", "error", err) + } + } + } + return ctrl.Result{}, nil } @@ -154,3 +173,9 @@ func SetProvider() (cloud.Provider, error) { return nil, fmt.Errorf("cloudmgr: %s cloud provider not supported", cloudName) } + +func isOldPeerPod(pp, cur confidentialcontainersorgv1alpha1.PeerPod) bool { + return pp.OwnerReferences[0].UID == cur.OwnerReferences[0].UID && // Same owner + pp.UID != cur.UID && // Not cur itself + pp.CreationTimestamp.Before(&cur.CreationTimestamp) // Created before cur +} diff --git a/test/e2e/rolling_update_test.go b/test/e2e/rolling_update_test.go index 7dc47383a..cc2b620de 100644 --- a/test/e2e/rolling_update_test.go +++ b/test/e2e/rolling_update_test.go @@ -27,7 +27,7 @@ import ( ) const WAIT_DEPLOYMENT_AVAILABLE_TIMEOUT = time.Second * 180 -const OLD_VM_DELETION_TIMEOUT = time.Second * 15 +const OLD_VM_DELETION_TIMEOUT = time.Second * 60 func doTestCaaDaemonsetRollingUpdate(t *testing.T, assert RollingUpdateAssert) { runtimeClassName := "kata-remote" @@ -229,6 +229,10 @@ func doTestCaaDaemonsetRollingUpdate(t *testing.T, assert RollingUpdateAssert) { t.Fatal(fmt.Errorf("verify pod is not running")) } + time.Sleep(OLD_VM_DELETION_TIMEOUT) + log.Info("Verify old VM instances have been deleted:") + assert.VerifyOldVmDeleted(t) + return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { @@ -252,10 +256,6 @@ func doTestCaaDaemonsetRollingUpdate(t *testing.T, assert RollingUpdateAssert) { t.Fatal(err) } - time.Sleep(OLD_VM_DELETION_TIMEOUT) - log.Info("Verify old VM instances have been deleted:") - assert.VerifyOldVmDeleted(t) - return ctx }).Feature()