MULTIARCH-5181: pod-level circuit-breaking for image registry inspection errors #377

Merged: 1 commit, Dec 18, 2024
1 change: 1 addition & 0 deletions controllers/podplacement/events.go
@@ -20,4 +20,5 @@ const (
ImageArchitectureInspectionErrorMsg = "Failed to retrieve the supported architectures: "
NoSupportedArchitecturesFoundMsg = "Pod cannot be scheduled due to incompatible image architectures; container images have no supported architectures in common"
ArchitectureAwareGatedPodIgnoredMsg = "The gated pod has been modified and is no longer eligible for architecture-aware scheduling"
ImageInspectionErrorMaxRetriesMsg = "Failed to retrieve the supported architectures after multiple retries"
)
71 changes: 64 additions & 7 deletions controllers/podplacement/pod_model.go
@@ -19,6 +19,7 @@ package podplacement
import (
"context"
"fmt"
"strconv"
"strings"
"time"

@@ -37,6 +38,8 @@ var (
imageInspectionCache image.ICache = image.FacadeSingleton()
)

const MaxRetryCount = 5

type containerImage struct {
imageName string
skipCache bool
@@ -96,16 +99,12 @@ func (pod *Pod) RemoveSchedulingGate() {
// It verifies first that no nodeSelector field is set for the kubernetes.io/arch label.
// Then, it computes the intersection of the architectures supported by the images used by the pod via pod.getArchitecturePredicate.
// Finally, it initializes the nodeAffinity for the pod and set it to the computed requirement via the pod.setArchNodeAffinity method.
func (pod *Pod) SetNodeAffinityArchRequirement(pullSecretDataList [][]byte) {
log := ctrllog.FromContext(pod.ctx)
func (pod *Pod) SetNodeAffinityArchRequirement(pullSecretDataList [][]byte) (bool, error) {
requirement, err := pod.getArchitecturePredicate(pullSecretDataList)
if err != nil {
pod.ensureLabel(utils.ImageInspectionErrorLabel, "")
metrics.FailedInspectionCounter.Inc()
pod.publishEvent(corev1.EventTypeWarning, ImageArchitectureInspectionError, ImageArchitectureInspectionErrorMsg+err.Error())
log.Error(err, "Error getting the architecture predicate. The pod will not have the nodeAffinity set.")
return
return false, err
}
pod.ensureNoLabel(utils.ImageInspectionErrorLabel)
if len(requirement.Values) == 0 {
pod.publishEvent(corev1.EventTypeNormal, NoSupportedArchitecturesFound, NoSupportedArchitecturesFoundMsg)
}
@@ -124,6 +123,7 @@ func (pod *Pod) SetNodeAffinityArchRequirement(pullSecretDataList [][]byte) {
}

pod.setArchNodeAffinity(requirement)
return true, nil
}

// setArchNodeAffinity sets the node affinity for the pod to the given requirement based on the rules in
@@ -244,6 +244,50 @@ func (pod *Pod) ensureLabel(label string, value string) {
pod.Labels[label] = value
}

func (pod *Pod) ensureNoLabel(label string) {
if pod.Labels == nil {
return
}
delete(pod.Labels, label)
}

// ensureAnnotation ensures that the pod has the given annotation with the given value.
func (pod *Pod) ensureAnnotation(annotation string, value string) {
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
}
pod.Annotations[annotation] = value
}

// ensureAndIncrementLabel ensures that the pod has the given label, initializing it to "1" when absent or unparsable.
// If the label is already set to an integer value, it is incremented by one.
func (pod *Pod) ensureAndIncrementLabel(label string) {
if pod.Labels == nil {
pod.Labels = make(map[string]string)
}
if _, ok := pod.Labels[label]; !ok {
pod.Labels[label] = "1"
return
}
cur, err := strconv.ParseInt(pod.Labels[label], 10, 32)
if err != nil {
pod.Labels[label] = "1"
} else {
pod.Labels[label] = fmt.Sprintf("%d", cur+1)
}
}

func (pod *Pod) maxRetries() bool {
if pod.Labels == nil {
return false
}
v, err := strconv.ParseInt(pod.Labels[utils.ImageInspectionErrorCountLabel], 10, 32)
if err != nil {
return true
}
return v >= MaxRetryCount
}

// ensureArchitectureLabels adds labels for the given requirement to the pod. Labels are added to indicate
// the supported architectures and index pods by architecture or by whether they support more than one architecture.
// In this case, single-architecture is meant as a pod that supports only one architecture: all the images in the pod
@@ -359,3 +403,16 @@ func (pod *Pod) publishIgnorePod() {
pod.ensureLabel(utils.NodeAffinityLabel, utils.LabelValueNotSet)
pod.publishEvent(corev1.EventTypeNormal, ArchitecturePredicatesConflict, ArchitecturePredicatesConflictMsg)
}

func (pod *Pod) handleError(err error, s string) {
if err == nil {
return
}
log := ctrllog.FromContext(pod.ctx)
metrics.FailedInspectionCounter.Inc()
pod.ensureLabel(utils.ImageInspectionErrorLabel, "")
pod.ensureAnnotation(utils.ImageInspectionErrorLabel, err.Error())
pod.ensureAndIncrementLabel(utils.ImageInspectionErrorCountLabel)
pod.publishEvent(corev1.EventTypeWarning, ImageArchitectureInspectionError, ImageArchitectureInspectionErrorMsg+err.Error())
log.Error(err, s)
}
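
Not part of the diff: a minimal in-package test sketch of how the retry counter above trips the circuit breaker. It assumes the unexported helpers ensureAndIncrementLabel and maxRetries shown in this file; the test name is chosen only for illustration.

```go
package podplacement

import (
	"testing"

	corev1 "k8s.io/api/core/v1"

	"github.com/openshift/multiarch-tuning-operator/pkg/utils"
)

// TestRetryCounterSketch illustrates that each failed inspection increments the
// image-inspect-error-count label, and maxRetries only reports true once the
// counter reaches MaxRetryCount (5).
func TestRetryCounterSketch(t *testing.T) {
	pod := &Pod{Pod: corev1.Pod{}}
	for i := 1; i <= MaxRetryCount; i++ {
		pod.ensureAndIncrementLabel(utils.ImageInspectionErrorCountLabel)
		if i < MaxRetryCount && pod.maxRetries() {
			t.Fatalf("circuit breaker tripped too early at attempt %d", i)
		}
	}
	if !pod.maxRetries() {
		t.Fatal("expected the circuit breaker to trip after MaxRetryCount failed inspections")
	}
}
```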
15 changes: 11 additions & 4 deletions controllers/podplacement/pod_model_test.go
@@ -530,6 +530,7 @@ func TestPod_SetNodeAffinityArchRequirement(t *testing.T) {
pullSecretDataList [][]byte
pod *v1.Pod
want *v1.Pod
expectErr bool
}{
{
name: "pod with no node selector terms",
@@ -685,9 +686,10 @@ }).Build(),
}).Build(),
},
{
name: "should not modify the pod if unable to inspect the images",
pod: NewPod().WithContainersImages(fake.MultiArchImage, "non-readable-image").Build(),
want: NewPod().WithContainersImages(fake.MultiArchImage, "non-readable-image").Build(),
name: "should not modify the pod if unable to inspect the images",
pod: NewPod().WithContainersImages(fake.MultiArchImage, "non-readable-image").Build(),
want: NewPod().WithContainersImages(fake.MultiArchImage, "non-readable-image").Build(),
expectErr: true,
},
{
name: "should prevent the pod from being scheduled when no common architecture is found",
@@ -709,8 +711,13 @@ func TestPod_SetNodeAffinityArchRequirement(t *testing.T) {
Pod: *tt.pod,
ctx: ctx,
}
pod.SetNodeAffinityArchRequirement(tt.pullSecretDataList)
_, err := pod.SetNodeAffinityArchRequirement(tt.pullSecretDataList)
g := NewGomegaWithT(t)
if tt.expectErr {
g.Expect(err).Should(HaveOccurred())
} else {
g.Expect(err).ShouldNot(HaveOccurred())
}
g.Expect(pod.Spec.Affinity).Should(Equal(tt.want.Spec.Affinity))
imageInspectionCache = mmoimage.FacadeSingleton()
})
76 changes: 45 additions & 31 deletions controllers/podplacement/pod_reconciler.go
@@ -18,6 +18,7 @@ package podplacement

import (
"context"
"fmt"
runtime2 "runtime"
"time"

@@ -77,52 +78,65 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log.V(2).Info("Unable to fetch pod", "error", err)
return ctrl.Result{}, client.IgnoreNotFound(err)
}

// verify whether the pod has the scheduling gate
// Pods without the scheduling gate should be ignored.
if !pod.HasSchedulingGate() {
log.V(2).Info("Pod does not have the scheduling gate. Ignoring...")
// if not, return
return ctrl.Result{}, nil
}

// The scheduling gate is found.
metrics.ProcessedPodsCtrl.Inc()
defer utils.HistogramObserve(now, metrics.TimeToProcessGatedPod)
log.V(1).Info("Processing pod")
r.processPod(ctx, pod)
err := r.Client.Update(ctx, &pod.Pod)
if err != nil {
log.Error(err, "Unable to update the pod")
pod.publishEvent(corev1.EventTypeWarning, ArchitectureAwareSchedulingGateRemovalFailure, SchedulingGateRemovalFailureMsg)
return ctrl.Result{}, err
}
if !pod.HasSchedulingGate() {
// Only publish the event if the scheduling gate has been removed and the pod has been updated successfully.
pod.publishEvent(corev1.EventTypeNormal, ArchitectureAwareSchedulingGateRemovalSuccess, SchedulingGateRemovalSuccessMsg)
metrics.GatedPodsGauge.Dec()
}
return ctrl.Result{}, nil
}

if !pod.shouldIgnorePod() {
// Prepare the requirement for the node affinity.
psdl, err := r.pullSecretDataList(ctx, pod)
if err != nil {
log.Error(err, "Unable to retrieve the image pull secret data for the pod. "+
"The nodeAffinity for this pod will not be set.")
// we still need to remove the scheduling gate. Therefore, we do not return here.
} else {
pod.SetNodeAffinityArchRequirement(psdl)
}
} else {
func (r *PodReconciler) processPod(ctx context.Context, pod *Pod) {
log := ctrllog.FromContext(ctx)
log.V(1).Info("Processing pod")
if pod.shouldIgnorePod() {
log.V(3).Info("A pod with the scheduling gate should be ignored. Ignoring...")
// We can reach this branch when:
// - The pod has been gated but not processed before the operator changed configuration such that the pod should be ignored.
// - The pod has got some other changes in the admission chain from another webhook that makes it not suitable for processing anymore
// (for example another actor set the nodeAffinity already for the kubernetes.io/arch label).
// In both cases, we should just remove the scheduling gate.
r.Recorder.Event(&pod.Pod, corev1.EventTypeWarning, ArchitectureAwareGatedPodIgnored, ArchitectureAwareGatedPodIgnoredMsg)
log.V(1).Info("Removing the scheduling gate from pod.")
pod.RemoveSchedulingGate()
pod.publishEvent(corev1.EventTypeWarning, ArchitectureAwareGatedPodIgnored, ArchitectureAwareGatedPodIgnoredMsg)
return
}
// Remove the scheduling gate
log.V(1).Info("Removing the scheduling gate from pod.")
pod.RemoveSchedulingGate()

err := r.Client.Update(ctx, &pod.Pod)
if err != nil {
log.Error(err, "Unable to update the pod")
r.Recorder.Event(&pod.Pod, corev1.EventTypeWarning, ArchitectureAwareSchedulingGateRemovalFailure, SchedulingGateRemovalFailureMsg)
return ctrl.Result{}, err
// Prepare the requirement for the node affinity.
psdl, err := r.pullSecretDataList(ctx, pod)
pod.handleError(err, "Unable to retrieve the image pull secret data for the pod.")
// If no error occurred when retrieving the image pull secret data, set the node affinity.
if err == nil {
_, err = pod.SetNodeAffinityArchRequirement(psdl)
pod.handleError(err, "Unable to set the node affinity for the pod.")
}
if pod.maxRetries() && err != nil {
Review comment from Prashanth684 (Contributor):

One worry I have is that we might get rate-limited by the registry if it is slow. Maybe a backoff of sorts would be good to implement; i.e., if the same pod is queued again, we might want to give it some time.

Reply from the author (Member):

Hi @Prashanth684, the (exponential) backoff is already implemented through the controller-runtime facilities.
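
For illustration only: a minimal sketch of the controller-runtime facility referenced above. When a reconcile request fails, it is requeued through a rate-limited workqueue whose default per-item limiter backs off exponentially (5 ms base, 1000 s cap). The explicit wiring below is an assumption for clarity, not the operator's actual SetupWithManager, and newer controller-runtime releases expect the typed variants (workqueue.TypedRateLimiter / NewTypedItemExponentialFailureRateLimiter).

```go
package podplacement

import (
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/client-go/util/workqueue"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/controller"
)

// setupWithExplicitBackoff is a hypothetical variant of the reconciler setup that
// spells out the per-item exponential backoff: a pod whose reconciliation keeps
// failing is retried after 5ms, 10ms, 20ms, ... capped at 1000s, which matches
// what the default controller rate limiter already provides.
func setupWithExplicitBackoff(mgr ctrl.Manager, r *PodReconciler) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&corev1.Pod{}).
		WithOptions(controller.Options{
			// Same per-item curve as the default limiter, without its global token bucket.
			RateLimiter: workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 1000*time.Second),
		}).
		Complete(r)
}
```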

// the number of retries is incremented in the handleError function when the error is not nil.
// If we enter this branch, the retries counter has been incremented and reached the max retries.
// The counter starts at 1 when the first error occurs. Therefore, when the reconciler tries maxRetries times,
// the counter is equal to the maxRetries value and the pod should not be processed again.
// Publish this event and remove the scheduling gate.
log.Info("Max retries Reached. The pod will not have the nodeAffinity set.")
pod.publishEvent(corev1.EventTypeWarning, ImageArchitectureInspectionError, fmt.Sprintf("%s: %s", ImageInspectionErrorMaxRetriesMsg, err.Error()))
}
// If the pod has been processed successfully or the max retries have been reached, remove the scheduling gate.
if err == nil || pod.maxRetries() {
log.V(1).Info("Removing the scheduling gate from pod.")
pod.RemoveSchedulingGate()
}
r.Recorder.Event(&pod.Pod, corev1.EventTypeNormal, ArchitectureAwareSchedulingGateRemovalSuccess, SchedulingGateRemovalSuccessMsg)
metrics.GatedPodsGauge.Dec()

return ctrl.Result{}, nil
}

// pullSecretDataList returns the list of secrets data for the given pod given its imagePullSecrets field
34 changes: 33 additions & 1 deletion controllers/podplacement/pod_reconciler_test.go
@@ -2,6 +2,10 @@ package podplacement

import (
"fmt"
"strconv"
"time"

"github.com/openshift/multiarch-tuning-operator/pkg/e2e"

"k8s.io/apimachinery/pkg/util/sets"

@@ -70,7 +74,35 @@ var _ = Describe("Controllers/Podplacement/PodReconciler", func() {
Entry("Docker images", imgspecv1.MediaTypeImageManifest, utils.ArchitecturePpc64le),
)
})

Context("with an image that cannot be inspected", func() {
It("remove the scheduling gate after the max retries count", func() {
pod := NewPod().
WithContainersImages("quay.io/non-existing/image:latest").
WithGenerateName("test-pod-").
WithNamespace("test-namespace").
Build()
err := k8sClient.Create(ctx, pod)
Expect(err).NotTo(HaveOccurred(), "failed to create pod", err)
// Test the removal of the scheduling gate. However, since the pod is mutated and the reconciler
// removes the scheduling gate concurrently, we cannot ensure that the scheduling gate is added
// and that the following Eventually works on a pod with the scheduling gate.
// Waiting for the pod to be mutated is not enough, as the pod could be mutated and the reconciler
// could have removed the scheduling gate before our check.
Eventually(func(g Gomega) {
// Get pod from the API server
err := k8sClient.Get(ctx, crclient.ObjectKeyFromObject(pod), pod)
g.Expect(err).NotTo(HaveOccurred(), "failed to get pod", err)
By(fmt.Sprintf("Error count is set to '%s'", pod.Labels[utils.ImageInspectionErrorCountLabel]))
g.Expect(pod.Spec.SchedulingGates).NotTo(ContainElement(corev1.PodSchedulingGate{
Name: utils.SchedulingGateName,
}), "scheduling gate not removed")
g.Expect(pod.Labels).To(HaveKeyWithValue(utils.SchedulingGateLabel, utils.SchedulingGateLabelValueRemoved),
"scheduling gate annotation not found")
g.Expect(pod.Labels).To(HaveKeyWithValue(utils.ImageInspectionErrorCountLabel, strconv.Itoa(MaxRetryCount)), "image inspection error count not found")
}).WithTimeout(e2e.WaitShort).WithPolling(time.Millisecond*250).Should(Succeed(), "failed to remove scheduling gate from pod")
// Polling set to 250ms such that the error count is shown in the logs at each update.
})
})
Context("with different pull secrets", func() {
It("handles images with global pull secrets correctly", func() {
// TODO: Test logic for handling a Pod with one container and image using global pull secret
1 change: 1 addition & 0 deletions pkg/utils/const.go
@@ -29,6 +29,7 @@ const (
MultiArchLabel = "multiarch.openshift.io/multi-arch"
NoSupportedArchLabel = "multiarch.openshift.io/no-supported-arch"
ImageInspectionErrorLabel = "multiarch.openshift.io/image-inspect-error"
ImageInspectionErrorCountLabel = "multiarch.openshift.io/image-inspect-error-count"
LabelGroup = "multiarch.openshift.io"
)

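
Beyond the diff, a hedged sketch of how the new label could be consumed: listing the pods that are accumulating inspection failures with a controller-runtime client. listInspectionFailures is a hypothetical helper, not part of the operator.

```go
package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/openshift/multiarch-tuning-operator/pkg/utils"
)

// listInspectionFailures prints every pod carrying the new
// image-inspect-error-count label, together with its retry count and the last
// inspection error that handleError recorded as an annotation.
func listInspectionFailures(ctx context.Context, c client.Client) error {
	pods := &corev1.PodList{}
	if err := c.List(ctx, pods, client.HasLabels{utils.ImageInspectionErrorCountLabel}); err != nil {
		return err
	}
	for _, p := range pods.Items {
		fmt.Printf("%s/%s: %s failed inspections, last error: %q\n",
			p.Namespace, p.Name,
			p.Labels[utils.ImageInspectionErrorCountLabel],
			p.Annotations[utils.ImageInspectionErrorLabel])
	}
	return nil
}
```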