Skip to content

Commit

Permalink
fix: remediator missing custom resource events
Browse files Browse the repository at this point in the history
Prior to this change, the remediator watches were only being started
for new custom resources after the apply attempt had fully completed.
This left a window after the object was applied during which the remediator
could miss events caused by third parties. Normally, this would be fine,
because the remediator would revert any change after the watch was
started. But if a DELETE event was missed, the object wouldn't be
recreated until the next apply attempt.

This change adds a CRD Controller to the remediator that watches CRDs
and executes any registered handlers when the CRD is established,
unestablished, or deleted. The remediator now registers CRD handlers
for each resource type it watches, starting watchers as soon as
possible, without waiting for the next apply attempt.
  • Loading branch information
karlkfi committed Oct 7, 2024
1 parent 9e80e8d commit fe7bb01
Show file tree
Hide file tree
Showing 17 changed files with 425 additions and 161 deletions.
31 changes: 21 additions & 10 deletions cmd/reconciler-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"os"

"github.com/go-logr/logr"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -33,6 +34,7 @@ import (
"kpt.dev/configsync/pkg/profiler"
"kpt.dev/configsync/pkg/reconcilermanager"
"kpt.dev/configsync/pkg/reconcilermanager/controllers"
"kpt.dev/configsync/pkg/util/customresource"
"kpt.dev/configsync/pkg/util/log"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -95,9 +97,10 @@ func main() {
}
watchFleetMembership := fleetMembershipCRDExists(dynamicClient, mgr.GetRESTMapper(), &setupLog)

crdController := controllers.NewCRDReconciler(
crdController := &controllers.CRDController{}
crdMetaController := controllers.NewCRDMetaController(crdController, mgr.GetCache(),
textlogger.NewLogger(textlogger.NewConfig()).WithName("controllers").WithName("CRD"))
if err := crdController.Register(mgr); err != nil {
if err := crdMetaController.Register(mgr); err != nil {
setupLog.Error(err, "failed to register controller", "controller", "CRD")
os.Exit(1)
}
Expand All @@ -108,11 +111,15 @@ func main() {
mgr.GetClient(), watcher, dynamicClient,
textlogger.NewLogger(textlogger.NewConfig()).WithName("controllers").WithName(configsync.RepoSyncKind),
mgr.GetScheme())
crdController.SetCRDHandler(configsync.RepoSyncCRDName, func() error {
if err := repoSyncController.Register(mgr, watchFleetMembership); err != nil {
return fmt.Errorf("registering %s controller: %w", configsync.RepoSyncKind, err)
crdController.SetReconciler(kinds.RepoSyncV1Beta1().GroupKind(), func(_ context.Context, crd *apiextensionsv1.CustomResourceDefinition) error {
if customresource.IsEstablished(crd) {
if err := repoSyncController.Register(mgr, watchFleetMembership); err != nil {
return fmt.Errorf("registering %s controller: %w", configsync.RepoSyncKind, err)
}
setupLog.Info("RepoSync controller registration successful")
}
setupLog.Info("RepoSync controller registration successful")
// Don't stop the RepoSync controller when its CRD is deleted,
// otherwise we may miss RepoSync object deletion events.
return nil
})
setupLog.Info("RepoSync controller registration scheduled")
Expand All @@ -122,11 +129,15 @@ func main() {
mgr.GetClient(), watcher, dynamicClient,
textlogger.NewLogger(textlogger.NewConfig()).WithName("controllers").WithName(configsync.RootSyncKind),
mgr.GetScheme())
crdController.SetCRDHandler(configsync.RootSyncCRDName, func() error {
if err := rootSyncController.Register(mgr, watchFleetMembership); err != nil {
return fmt.Errorf("registering %s controller: %w", configsync.RootSyncKind, err)
crdController.SetReconciler(kinds.RootSyncV1Beta1().GroupKind(), func(_ context.Context, crd *apiextensionsv1.CustomResourceDefinition) error {
if customresource.IsEstablished(crd) {
if err := rootSyncController.Register(mgr, watchFleetMembership); err != nil {
return fmt.Errorf("registering %s controller: %w", configsync.RootSyncKind, err)
}
setupLog.Info("RootSync controller registration successful")
}
setupLog.Info("RootSync controller registration successful")
// Don't stop the RootSync controller when its CRD is deleted,
// otherwise we may miss RootSync object deletion events.
return nil
})
setupLog.Info("RootSync controller registration scheduled")
Expand Down
17 changes: 10 additions & 7 deletions e2e/testcases/custom_resources_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,16 @@ func TestCRDDeleteBeforeRemoveCustomResourceV1(t *testing.T) {
nt.T.Fatal(err)
}

// Resource Conflict errors from the remediator are not exposed as errors
// in the RootSync status. Instead, the error is recorded as a metric and
// logged as a warning. Then the object is refreshed from the server and
// re-enqueued for remediation.
// Resource conflict errors are recorded as status errors when the
// remediator watches are updated after an apply succeeds, but not when
// watches are updated before the apply attempt or from watch events handled
// by the remediator. So we don't expect to see a resource conflict error
// in the RootSync status until after the next apply attempt fails, which
// won't happen until the next automatic re-sync (1hr default).
//
// Validate that deleting the CRD of a managed CR causes at least one of the
// following errors:
// However, we do expect the remediator to get a deletion event for each
// Anvil object after the Anvil CRD is deleted. This can be surfaced as one
// of the following errors:
// - NoResourceMatchError
// - NoKindMatchError
// - ObjectNotFound
Expand All @@ -117,7 +120,7 @@ func TestCRDDeleteBeforeRemoveCustomResourceV1(t *testing.T) {
// TODO: distinguish between management conflict (unexpected manager annotation) and resource conflict (resource version change)
nt.Must(nomostest.ValidateMetrics(nt,
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, firstCommitHash, metrics.ErrorSummary{
Conflicts: 1,
Conflicts: 1, // at least 1
})))

// Reset discovery client to invalidate the cached Anvil CRD
Expand Down
1 change: 1 addition & 0 deletions manifests/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ resources:
# Applying hierarchyconfig-crd.yaml allows client-side validation of the HierarchyConfig resources.
- ../hierarchyconfig-crd.yaml
- ../namespace-selector-crd.yaml
- ../ns-reconciler-cluster-scope-cluster-role.yaml
- ../ns-reconciler-base-cluster-role.yaml
- ../root-reconciler-base-cluster-role.yaml
- ../otel-agent-cm.yaml
Expand Down
27 changes: 27 additions & 0 deletions manifests/ns-reconciler-cluster-scope-cluster-role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This ClusterRole is used by ns-reconcilers (RepoSync reconcilers).
# It includes read access for cluster-scope resources
# (e.g. watching CustomResourceDefinitions).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: configsync.gke.io:ns-reconciler:cluster-scope
labels:
configmanagement.gke.io/system: "true"
configmanagement.gke.io/arch: "csmr"
rules:
- apiGroups: ["apiextensions.k8s.io"]
resources: ["customresourcedefinitions"]
verbs: ["get","list","watch"]
3 changes: 3 additions & 0 deletions manifests/root-reconciler-base-cluster-role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ rules:
- apiGroups: ["kpt.dev"]
resources: ["resourcegroups/status"]
verbs: ["*"]
- apiGroups: ["apiextensions.k8s.io"]
resources: ["customresourcedefinitions"]
verbs: ["get","list","watch"]
10 changes: 9 additions & 1 deletion pkg/reconciler/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
"kpt.dev/configsync/pkg/parse/events"
"kpt.dev/configsync/pkg/reconciler/finalizer"
"kpt.dev/configsync/pkg/reconciler/namespacecontroller"
"kpt.dev/configsync/pkg/reconcilermanager/controllers"
"kpt.dev/configsync/pkg/remediator"
"kpt.dev/configsync/pkg/remediator/conflict"
"kpt.dev/configsync/pkg/remediator/watch"
Expand Down Expand Up @@ -219,10 +220,11 @@ func Run(opts Options) {
klog.Fatalf("Error creating rest config for the remediator: %v", err)
}

crdController := &controllers.CRDController{}
conflictHandler := conflict.NewHandler()
fightHandler := fight.NewHandler()

rem, err := remediator.New(opts.ReconcilerScope, opts.SyncName, cfgForWatch, baseApplier, conflictHandler, fightHandler, decls, opts.NumWorkers)
rem, err := remediator.New(opts.ReconcilerScope, opts.SyncName, cfgForWatch, baseApplier, conflictHandler, fightHandler, crdController, decls, opts.NumWorkers)
if err != nil {
klog.Fatalf("Instantiating Remediator: %v", err)
}
Expand Down Expand Up @@ -331,6 +333,12 @@ func Run(opts Options) {
klog.Fatalf("Instantiating Controller Manager: %v", err)
}

crdMetaController := controllers.NewCRDMetaController(crdController, mgr.GetCache(),
textlogger.NewLogger(textlogger.NewConfig()).WithName("controllers").WithName("CRD"))
if err := crdMetaController.Register(mgr); err != nil {
klog.Fatalf("Instantiating CRD Controller: %v", err)
}

// This cancelFunc will be used by the Finalizer to stop all the other
// controllers (Parser & Remediator).
ctx, stopControllers := context.WithCancel(signalCtx)
Expand Down
14 changes: 12 additions & 2 deletions pkg/reconcilermanager/controllers/build_names.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,25 @@ import (
)

const (
// RepoSyncClusterScopeClusterRoleName is the name of the ClusterRole with
// cluster-scoped read permissions for the namespace reconciler.
// e.g. configsync.gke.io:ns-reconciler:cluster-scope
RepoSyncClusterScopeClusterRoleName = configsync.GroupName + ":" + core.NsReconcilerPrefix + ":cluster-scope"
// RepoSyncBaseClusterRoleName is the namespace reconciler permissions name.
// e.g. configsync.gke.io:ns-reconciler
RepoSyncBaseClusterRoleName = configsync.GroupName + ":" + core.NsReconcilerPrefix
// RootSyncBaseClusterRoleName is the root reconciler base ClusterRole name.
// e.g. configsync.gke.io:root-reconciler
RootSyncBaseClusterRoleName = configsync.GroupName + ":" + core.RootReconcilerPrefix
// RepoSyncClusterScopeClusterRoleBindingName is the name of the default
// ClusterRoleBinding created for RepoSync objects. This contains basic
// cluster-scoped permissions for RepoSync reconcilers
// (e.g. CustomResourceDefinition watch).
RepoSyncClusterScopeClusterRoleBindingName = RepoSyncClusterScopeClusterRoleName
// RepoSyncBaseRoleBindingName is the name of the default RoleBinding created
// for RepoSync objects. This contains basic permissions for RepoSync reconcilers
//(e.g. RepoSync status update).
// for RepoSync objects. This contains basic namespace-scoped permissions
// for RepoSync reconcilers
// (e.g. RepoSync status update).
RepoSyncBaseRoleBindingName = RepoSyncBaseClusterRoleName
// RootSyncLegacyClusterRoleBindingName is the name of the legacy ClusterRoleBinding created
// for RootSync objects. It is always bound to cluster-admin.
Expand Down
Loading

0 comments on commit fe7bb01

Please sign in to comment.