diff --git a/pkg/controller/component/graphd_cluster.go b/pkg/controller/component/graphd_cluster.go index cc824608..f72a1b4a 100644 --- a/pkg/controller/component/graphd_cluster.go +++ b/pkg/controller/component/graphd_cluster.go @@ -292,12 +292,16 @@ func (c *graphdCluster) syncNebulaClusterStatus( return err } thriftPort := nc.GraphdComponent().GetPort(v1alpha1.GraphdPortNameThrift) - for i := range hostItems { - host := hostItems[i] + klog.Infof("Current graphd state: %v. Current number of replicas: %v", nc.Status.Graphd.Phase, pointer.Int32Deref(newReplicas, 0)) + for _, host := range hostItems { + klog.Infof("Currently looking at host: %v with status %v", strings.Split(host.HostAddr.Host, ".")[0], host.Status) if host.Status == meta.HostStatus_OFFLINE && host.HostAddr.Port == thriftPort { podName := strings.Split(host.HostAddr.Host, ".")[0] ordinal := getPodOrdinal(podName) if int32(ordinal) >= pointer.Int32Deref(nc.Spec.Graphd.Replicas, 0) { + klog.Infof("graphd pod [%s/%s] has already been terminated by the sts. 
Skipping failover and/or removing from auto failover list", nc.Namespace, podName) + // delete is a no-op if FailureHosts is nil or has no entry for podName + delete(nc.Status.Graphd.FailureHosts, podName) continue } if nc.Status.Graphd.FailureHosts == nil { diff --git a/pkg/controller/component/metad_cluster.go b/pkg/controller/component/metad_cluster.go index 0dbc1a5c..4a76a1f5 100644 --- a/pkg/controller/component/metad_cluster.go +++ b/pkg/controller/component/metad_cluster.go @@ -257,10 +257,16 @@ func (c *metadCluster) syncNebulaClusterStatus(nc *v1alpha1.NebulaCluster, oldWo return err } thriftPort := nc.MetadComponent().GetPort(v1alpha1.MetadPortNameThrift) - for i := range hostItems { - host := hostItems[i] + for _, host := range hostItems { if host.Status == meta.HostStatus_OFFLINE && host.HostAddr.Port == thriftPort { podName := strings.Split(host.HostAddr.Host, ".")[0] + ordinal := getPodOrdinal(podName) + if int32(ordinal) >= pointer.Int32Deref(nc.Spec.Metad.Replicas, 0) { + klog.Infof("metad pod [%s/%s] has already been terminated by the sts. 
Skipping failover and/or removing from auto failover list", nc.Namespace, podName) + // delete is a no-op if FailureHosts is nil or has no entry for podName + delete(nc.Status.Metad.FailureHosts, podName) + continue + } if nc.Status.Metad.FailureHosts == nil { nc.Status.Metad.FailureHosts = make(map[string]v1alpha1.FailureHost) } diff --git a/pkg/controller/component/storaged_cluster.go b/pkg/controller/component/storaged_cluster.go index 177d636e..d836f433 100644 --- a/pkg/controller/component/storaged_cluster.go +++ b/pkg/controller/component/storaged_cluster.go @@ -335,12 +335,14 @@ func (c *storagedCluster) syncNebulaClusterStatus( return err } thriftPort := nc.StoragedComponent().GetPort(v1alpha1.StoragedPortNameThrift) - for i := range hostItems { - host := hostItems[i] + for _, host := range hostItems { if host.Status == meta.HostStatus_OFFLINE && host.HostAddr.Port == thriftPort { podName := strings.Split(host.HostAddr.Host, ".")[0] ordinal := getPodOrdinal(podName) if int32(ordinal) >= pointer.Int32Deref(nc.Spec.Storaged.Replicas, 0) { + klog.Infof("storaged pod [%s/%s] has already been terminated by the sts. Skipping failover and/or removing from auto failover list", nc.Namespace, podName) + // delete is a no-op if FailureHosts is nil or has no entry for podName + delete(nc.Status.Storaged.FailureHosts, podName) continue } if nc.Status.Storaged.FailureHosts == nil {