From b1de748a3d154bc28a024bdc80222ea0d49d82f5 Mon Sep 17 00:00:00 2001 From: MegaByte875 Date: Mon, 22 Jul 2024 10:09:57 +0800 Subject: [PATCH] fix local pv failover bug (#514) --- apis/apps/v1alpha1/nebulacluster_types.go | 1 - apis/apps/v1alpha1/zz_generated.deepcopy.go | 9 +- .../nebula-operator/crds/nebulaclusters.yaml | 6 -- .../apps.nebula-graph.io_nebulaclusters.yaml | 6 -- pkg/controller/component/storaged_cluster.go | 2 +- pkg/controller/component/storaged_failover.go | 95 ++----------------- 6 files changed, 15 insertions(+), 104 deletions(-) diff --git a/apis/apps/v1alpha1/nebulacluster_types.go b/apis/apps/v1alpha1/nebulacluster_types.go index 03e38a23..22a420a5 100644 --- a/apis/apps/v1alpha1/nebulacluster_types.go +++ b/apis/apps/v1alpha1/nebulacluster_types.go @@ -185,7 +185,6 @@ type EmptyStruct struct{} type FailureHost struct { Host string `json:"host,omitempty"` PVCSet map[types.UID]EmptyStruct `json:"pvcSet,omitempty"` - HostDeleted *bool `json:"hostDeleted,omitempty"` DataBalanced *bool `json:"dataBalanced,omitempty"` PodRestarted bool `json:"podRestarted,omitempty"` PodRebuilt bool `json:"podRebuilt,omitempty"` diff --git a/apis/apps/v1alpha1/zz_generated.deepcopy.go b/apis/apps/v1alpha1/zz_generated.deepcopy.go index b4bdb4d0..7096e456 100644 --- a/apis/apps/v1alpha1/zz_generated.deepcopy.go +++ b/apis/apps/v1alpha1/zz_generated.deepcopy.go @@ -498,11 +498,6 @@ func (in *FailureHost) DeepCopyInto(out *FailureHost) { (*out)[key] = val } } - if in.HostDeleted != nil { - in, out := &in.HostDeleted, &out.HostDeleted - *out = new(bool) - **out = **in - } if in.DataBalanced != nil { in, out := &in.DataBalanced, &out.DataBalanced *out = new(bool) @@ -1455,6 +1450,10 @@ func (in *StoragedStatus) DeepCopyInto(out *StoragedStatus) { *out = make([]int32, len(*in)) copy(*out, *in) } + if in.LastBalancedTime != nil { + in, out := &in.LastBalancedTime, &out.LastBalancedTime + *out = (*in).DeepCopy() + } if in.LastBalanceJob != nil { in, out := &in.LastBalanceJob, &out.LastBalanceJob *out = new(BalanceJob) diff --git a/charts/nebula-operator/crds/nebulaclusters.yaml b/charts/nebula-operator/crds/nebulaclusters.yaml index 90931db1..703f3cf1 100644 --- a/charts/nebula-operator/crds/nebulaclusters.yaml +++ b/charts/nebula-operator/crds/nebulaclusters.yaml @@ -11799,8 +11799,6 @@ spec: type: string host: type: string - hostDeleted: - type: boolean nodeDown: type: boolean podRebuilt: @@ -11890,8 +11888,6 @@ spec: type: string host: type: string - hostDeleted: - type: boolean nodeDown: type: boolean podRebuilt: @@ -11989,8 +11985,6 @@ spec: type: string host: type: string - hostDeleted: - type: boolean nodeDown: type: boolean podRebuilt: diff --git a/config/crd/bases/apps.nebula-graph.io_nebulaclusters.yaml b/config/crd/bases/apps.nebula-graph.io_nebulaclusters.yaml index 90931db1..703f3cf1 100644 --- a/config/crd/bases/apps.nebula-graph.io_nebulaclusters.yaml +++ b/config/crd/bases/apps.nebula-graph.io_nebulaclusters.yaml @@ -11799,8 +11799,6 @@ spec: type: string host: type: string - hostDeleted: - type: boolean nodeDown: type: boolean podRebuilt: @@ -11890,8 +11888,6 @@ spec: type: string host: type: string - hostDeleted: - type: boolean nodeDown: type: boolean podRebuilt: @@ -11989,8 +11985,6 @@ spec: type: string host: type: string - hostDeleted: - type: boolean nodeDown: type: boolean podRebuilt: diff --git a/pkg/controller/component/storaged_cluster.go b/pkg/controller/component/storaged_cluster.go index 87d07354..e79e0f67 100644 --- a/pkg/controller/component/storaged_cluster.go +++ b/pkg/controller/component/storaged_cluster.go @@ -589,7 +589,7 @@ func (c *storagedCluster) shouldRecover(nc *v1alpha1.NebulaCluster) (bool, []str for _, host := range hostItems { podName, ok := m[host.HostAddr.Host] fh, exists := nc.Status.Storaged.FailureHosts[podName] - balanced := pointer.BoolDeref(fh.DataBalanced, true) + balanced := pointer.BoolDeref(fh.DataBalanced, false) if ok && host.Status == meta.HostStatus_ONLINE && host.HostAddr.Port == thriftPort { if exists && len(spaces) > 0 && !balanced { continue diff --git a/pkg/controller/component/storaged_failover.go b/pkg/controller/component/storaged_failover.go index 35954e0a..4fbec4ca 100644 --- a/pkg/controller/component/storaged_failover.go +++ b/pkg/controller/component/storaged_failover.go @@ -21,7 +21,6 @@ import ( "fmt" "time" - nebulago "github.com/vesoft-inc/nebula-go/v3/nebula" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/klog/v2" @@ -74,12 +73,16 @@ func (s *storagedFailover) Failover(nc *v1alpha1.NebulaCluster) error { if err != nil { return err } - if len(spaces) == 0 { - return utilerrors.ReconcileErrorf("storaged pods [%v] are ready after restarted", readyPods) + if len(spaces) > 0 { + for _, podName := range readyPods { + fh, ok := nc.Status.Storaged.FailureHosts[podName] + if ok { + fh.DataBalanced = pointer.Bool(true) + nc.Status.Storaged.FailureHosts[podName] = fh + } + } } - } - if err := s.deleteFailureHost(nc); err != nil { - return err + return utilerrors.ReconcileErrorf("storaged pods [%v] are ready after restarted", readyPods) } if err := s.deleteFailurePodAndPVC(nc); err != nil { return err @@ -156,84 +159,6 @@ func (s *storagedFailover) toleratePods(nc *v1alpha1.NebulaCluster) ([]string, e return readyPods, nil } -func (s *storagedFailover) deleteFailureHost(nc *v1alpha1.NebulaCluster) error { - ns := nc.GetNamespace() - componentName := nc.StoragedComponent().GetName() - options, err := nebula.ClientOptions(nc, nebula.SetIsMeta(true)) - if err != nil { - return err - } - endpoints := []string{nc.GetMetadThriftConnAddress()} - metaClient, err := nebula.NewMetaClient(endpoints, options...) - if err != nil { - return err - } - defer func() { - err := metaClient.Disconnect() - if err != nil { - klog.Errorf("meta client disconnect failed: %v", err) - } - }() - - hosts := make([]*nebulago.HostAddr, 0) - for _, fh := range nc.Status.Storaged.FailureHosts { - if pointer.BoolDeref(fh.HostDeleted, false) { - continue - } - count, err := metaClient.GetLeaderCount(fh.Host) - if err != nil { - klog.Errorf("storaged host %s get leader count failed: %v", fh.Host, err) - return err - } - if count > 0 { - return utilerrors.ReconcileErrorf("waiting for storaged host %s peers leader election done", fh.Host) - } - hosts = append(hosts, &nebulago.HostAddr{ - Host: fh.Host, - Port: nc.StoragedComponent().GetPort(v1alpha1.StoragedPortNameThrift), - }) - } - if len(hosts) == 0 { - return nil - } - - spaces, err := metaClient.ListSpaces() - if err != nil { - return err - } - if len(spaces) == 0 { - for podName, fh := range nc.Status.Storaged.FailureHosts { - fh.HostDeleted = pointer.Bool(true) - nc.Status.Storaged.FailureHosts[podName] = fh - } - return utilerrors.ReconcileErrorf("try to remove storaged cluster [%s/%s] failure host for recovery", ns, componentName) - } - - if nc.Status.Storaged.RemovedSpaces == nil { - nc.Status.Storaged.RemovedSpaces = make([]int32, 0, len(spaces)) - } - for _, space := range spaces { - if contains(nc.Status.Storaged.RemovedSpaces, *space.Id.SpaceID) { - continue - } - if err := removeHost(s.clientSet, metaClient, nc, *space.Id.SpaceID, hosts); err != nil { - klog.Errorf("storaged cluster [%s/%s] remove failure hosts %v failed: %v", ns, componentName, hosts, err) - return err - } - klog.Infof("storaged cluster [%s/%s] remove failure hosts %v in the space %s successfully", ns, componentName, hosts, space.Name) - } - - for podName, fh := range nc.Status.Storaged.FailureHosts { - fh.HostDeleted = pointer.Bool(true) - fh.DataBalanced = pointer.Bool(false) - nc.Status.Storaged.FailureHosts[podName] = fh - } - - nc.Status.Storaged.RemovedSpaces = nil - nc.Status.Storaged.LastBalanceJob = nil - return utilerrors.ReconcileErrorf("try to remove storaged cluster [%s/%s] failure host for recovery", ns, componentName) -} - func (s *storagedFailover) deleteFailurePodAndPVC(nc *v1alpha1.NebulaCluster) error { cl := label.New().Cluster(nc.GetClusterName()).Storaged() for podName, fh := range nc.Status.Storaged.FailureHosts { @@ -314,7 +239,7 @@ func (s *storagedFailover) checkPendingPod(nc *v1alpha1.NebulaCluster) error { func (s *storagedFailover) balanceData(nc *v1alpha1.NebulaCluster) error { podNames := make([]string, 0) for podName, fh := range nc.Status.Storaged.FailureHosts { - if pointer.BoolDeref(fh.DataBalanced, true) { + if pointer.BoolDeref(fh.DataBalanced, false) { continue } pod, err := s.clientSet.Pod().GetPod(nc.Namespace, podName)