From b8de3899e956a2cba7b947147c119445c6d3d37e Mon Sep 17 00:00:00 2001 From: Damien Ciabrini Date: Fri, 9 Aug 2024 17:21:52 +0200 Subject: [PATCH] Fix cleanup of old attributes when containers are not restarting When a pod crashes while in the process of starting, the operator cleans up outdated attributes in the galera CR status. The operator wrongly assumes that it can probe a container's state as soon as it gets a pod object from the API server, which is not always true (e.g. when the pod is in "Pending" state). Fix the attribute cleanup by always checking the state of the pod's container before inspecting its container ID. Jira: OSPRH-9411 --- controllers/galera_controller.go | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/controllers/galera_controller.go b/controllers/galera_controller.go index c740ed93..4fd8b6b6 100644 --- a/controllers/galera_controller.go +++ b/controllers/galera_controller.go @@ -197,6 +197,16 @@ func getRunningPodsMissingGcomm(ctx context.Context, pods []corev1.Pod, instance return } +// getGaleraContainerID retrieves the ContainerID of the galera container running in a pod +func getGaleraContainerID(pod *corev1.Pod) (found bool, CID string) { + for _, container := range pod.Status.ContainerStatuses { + if container.Name == "galera" { + return true, container.ContainerID + } + } + return false, "" +} + // isGaleraContainerStartedAndWaiting checks whether the galera container is waiting for a gcomm_uri file func isGaleraContainerStartedAndWaiting(ctx context.Context, pod *corev1.Pod, instance *mariadbv1.Galera, h *helper.Helper, config *rest.Config) bool { waiting := false @@ -282,14 +292,14 @@ func assertPodsAttributesValidity(helper *helper.Helper, instance *mariadbv1.Gal // A node can have various attributes depending on its known state. // A ContainerID attribute is only present if the node is being started. 
attrCID := instance.Status.Attributes[pod.Name].ContainerID - podCID := pod.Status.ContainerStatuses[0].ContainerID - if attrCID != "" && attrCID != podCID { + containerFound, podCID := getGaleraContainerID(&pod) + if !containerFound || (attrCID != "" && attrCID != podCID) { // This gcomm URI was pushed in a pod which was restarted // before the attribute got cleared, which means the pod // failed to start galera. Clear the attribute here, and // reprobe the pod's state in the next reconcile loop clearPodAttributes(instance, pod.Name) - util.LogForObject(helper, "Pod restarted while galera was starting", instance, "pod", pod.Name, "current pod ID", podCID, "recorded ID", attrCID) + util.LogForObject(helper, "Pod restarted while galera was starting", instance, "pod", pod.Name, "recorded ID", attrCID) } } }