fix local pv failover bug (vesoft-inc#514)
MegaByte875 authored Jul 22, 2024
1 parent 71ce304 commit b1de748
Showing 6 changed files with 15 additions and 104 deletions.
1 change: 0 additions & 1 deletion apis/apps/v1alpha1/nebulacluster_types.go
@@ -185,7 +185,6 @@ type EmptyStruct struct{}
 type FailureHost struct {
 	Host         string                    `json:"host,omitempty"`
 	PVCSet       map[types.UID]EmptyStruct `json:"pvcSet,omitempty"`
-	HostDeleted  *bool                     `json:"hostDeleted,omitempty"`
 	DataBalanced *bool                     `json:"dataBalanced,omitempty"`
 	PodRestarted bool                      `json:"podRestarted,omitempty"`
 	PodRebuilt   bool                      `json:"podRebuilt,omitempty"`
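
For reference, a compilable sketch of the trimmed status type (stand-in package and placeholder UID/EmptyStruct types are assumptions; only the fields visible in this hunk are shown). With HostDeleted gone, DataBalanced is the one remaining tri-state flag: a nil pointer means the balance state has not been decided yet.

package v1alpha1sketch

// Placeholder stand-ins for k8s.io/apimachinery's types.UID and the
// package-local EmptyStruct referenced in the hunk above.
type UID string
type EmptyStruct struct{}

// FailureHost after this commit: HostDeleted is removed, so DataBalanced
// alone records whether a failed host's data has been rebalanced; nil
// means "unknown", which the controller now treats as not balanced.
type FailureHost struct {
	Host         string              `json:"host,omitempty"`
	PVCSet       map[UID]EmptyStruct `json:"pvcSet,omitempty"`
	DataBalanced *bool               `json:"dataBalanced,omitempty"`
	PodRestarted bool                `json:"podRestarted,omitempty"`
	PodRebuilt   bool                `json:"podRebuilt,omitempty"`
}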
9 changes: 4 additions & 5 deletions apis/apps/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

6 changes: 0 additions & 6 deletions charts/nebula-operator/crds/nebulaclusters.yaml
@@ -11799,8 +11799,6 @@ spec:
                       type: string
                     host:
                       type: string
-                    hostDeleted:
-                      type: boolean
                     nodeDown:
                       type: boolean
                     podRebuilt:
@@ -11890,8 +11888,6 @@ spec:
                       type: string
                     host:
                       type: string
-                    hostDeleted:
-                      type: boolean
                     nodeDown:
                       type: boolean
                     podRebuilt:
@@ -11989,8 +11985,6 @@ spec:
                       type: string
                     host:
                       type: string
-                    hostDeleted:
-                      type: boolean
                     nodeDown:
                       type: boolean
                     podRebuilt:
6 changes: 0 additions & 6 deletions config/crd/bases/apps.nebula-graph.io_nebulaclusters.yaml
@@ -11799,8 +11799,6 @@ spec:
                       type: string
                     host:
                       type: string
-                    hostDeleted:
-                      type: boolean
                     nodeDown:
                       type: boolean
                     podRebuilt:
@@ -11890,8 +11888,6 @@ spec:
                       type: string
                     host:
                       type: string
-                    hostDeleted:
-                      type: boolean
                     nodeDown:
                       type: boolean
                     podRebuilt:
@@ -11989,8 +11985,6 @@ spec:
                       type: string
                     host:
                       type: string
-                    hostDeleted:
-                      type: boolean
                     nodeDown:
                       type: boolean
                     podRebuilt:
2 changes: 1 addition & 1 deletion pkg/controller/component/storaged_cluster.go
@@ -589,7 +589,7 @@ func (c *storagedCluster) shouldRecover(nc *v1alpha1.NebulaCluster) (bool, []str
 	for _, host := range hostItems {
 		podName, ok := m[host.HostAddr.Host]
 		fh, exists := nc.Status.Storaged.FailureHosts[podName]
-		balanced := pointer.BoolDeref(fh.DataBalanced, true)
+		balanced := pointer.BoolDeref(fh.DataBalanced, false)
 		if ok && host.Status == meta.HostStatus_ONLINE && host.HostAddr.Port == thriftPort {
 			if exists && len(spaces) > 0 && !balanced {
 				continue
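
The one-word change above flips the default applied to an unset flag. A minimal runnable sketch of the semantics (boolDeref mirrors k8s.io/utils/pointer.BoolDeref, which returns the pointee when non-nil and the default otherwise):

package main

import "fmt"

// boolDeref mirrors k8s.io/utils/pointer.BoolDeref: the default applies
// only when the pointer is nil.
func boolDeref(ptr *bool, def bool) bool {
	if ptr != nil {
		return *ptr
	}
	return def
}

func main() {
	var dataBalanced *bool // unset on a freshly recorded failure host

	// Before this commit: unset defaulted to true, so the host looked
	// balanced and recovery could proceed before data was moved.
	fmt.Println(boolDeref(dataBalanced, true)) // true (old behavior)

	// After: unset defaults to false, so recovery waits for the balance.
	fmt.Println(boolDeref(dataBalanced, false)) // false (new behavior)
}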
95 changes: 10 additions & 85 deletions pkg/controller/component/storaged_failover.go
@@ -21,7 +21,6 @@ import (
 	"fmt"
 	"time"
 
-	nebulago "github.com/vesoft-inc/nebula-go/v3/nebula"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/klog/v2"
@@ -74,12 +73,16 @@ func (s *storagedFailover) Failover(nc *v1alpha1.NebulaCluster) error {
 		if err != nil {
 			return err
 		}
-		if len(spaces) == 0 {
-			return utilerrors.ReconcileErrorf("storaged pods [%v] are ready after restarted", readyPods)
+		if len(spaces) > 0 {
+			for _, podName := range readyPods {
+				fh, ok := nc.Status.Storaged.FailureHosts[podName]
+				if ok {
+					fh.DataBalanced = pointer.Bool(true)
+					nc.Status.Storaged.FailureHosts[podName] = fh
+				}
+			}
 		}
-	}
-	if err := s.deleteFailureHost(nc); err != nil {
-		return err
+		return utilerrors.ReconcileErrorf("storaged pods [%v] are ready after restarted", readyPods)
 	}
 	if err := s.deleteFailurePodAndPVC(nc); err != nil {
 		return err
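
A condensed sketch of the reworked flow (simplified types; the fmt error below stands in for the repo's utilerrors.ReconcileErrorf retriable-error helper): pods that restarted and came back ready still hold their data, so when spaces exist they are marked balanced and the reconcile requeues, while the explicit deleteFailureHost step is dropped entirely.

package failoversketch

import "fmt"

type failureHost struct{ DataBalanced *bool }

// failoverSketch condenses the new control flow: restarted pods that
// became ready again keep their data, so they are marked balanced and the
// reconcile is retried; host removal from spaces is no longer performed.
func failoverSketch(readyPods []string, spaceCount int,
	hosts map[string]failureHost) error {
	if len(readyPods) > 0 {
		if spaceCount > 0 {
			balanced := true
			for _, pod := range readyPods {
				if fh, ok := hosts[pod]; ok {
					fh.DataBalanced = &balanced
					hosts[pod] = fh
				}
			}
		}
		// Mirrors utilerrors.ReconcileErrorf: surface a retriable error
		// so the controller requeues instead of continuing the failover.
		return fmt.Errorf("storaged pods %v are ready after restarted", readyPods)
	}
	return nil // continue: delete failed pod/PVC, reschedule, balance data
}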
@@ -156,84 +159,6 @@ func (s *storagedFailover) toleratePods(nc *v1alpha1.NebulaCluster) ([]string, e
 	return readyPods, nil
 }
 
-func (s *storagedFailover) deleteFailureHost(nc *v1alpha1.NebulaCluster) error {
-	ns := nc.GetNamespace()
-	componentName := nc.StoragedComponent().GetName()
-	options, err := nebula.ClientOptions(nc, nebula.SetIsMeta(true))
-	if err != nil {
-		return err
-	}
-	endpoints := []string{nc.GetMetadThriftConnAddress()}
-	metaClient, err := nebula.NewMetaClient(endpoints, options...)
-	if err != nil {
-		return err
-	}
-	defer func() {
-		err := metaClient.Disconnect()
-		if err != nil {
-			klog.Errorf("meta client disconnect failed: %v", err)
-		}
-	}()
-
-	hosts := make([]*nebulago.HostAddr, 0)
-	for _, fh := range nc.Status.Storaged.FailureHosts {
-		if pointer.BoolDeref(fh.HostDeleted, false) {
-			continue
-		}
-		count, err := metaClient.GetLeaderCount(fh.Host)
-		if err != nil {
-			klog.Errorf("storaged host %s get leader count failed: %v", fh.Host, err)
-			return err
-		}
-		if count > 0 {
-			return utilerrors.ReconcileErrorf("waiting for storaged host %s peers leader election done", fh.Host)
-		}
-		hosts = append(hosts, &nebulago.HostAddr{
-			Host: fh.Host,
-			Port: nc.StoragedComponent().GetPort(v1alpha1.StoragedPortNameThrift),
-		})
-	}
-	if len(hosts) == 0 {
-		return nil
-	}
-
-	spaces, err := metaClient.ListSpaces()
-	if err != nil {
-		return err
-	}
-	if len(spaces) == 0 {
-		for podName, fh := range nc.Status.Storaged.FailureHosts {
-			fh.HostDeleted = pointer.Bool(true)
-			nc.Status.Storaged.FailureHosts[podName] = fh
-		}
-		return utilerrors.ReconcileErrorf("try to remove storaged cluster [%s/%s] failure host for recovery", ns, componentName)
-	}
-
-	if nc.Status.Storaged.RemovedSpaces == nil {
-		nc.Status.Storaged.RemovedSpaces = make([]int32, 0, len(spaces))
-	}
-	for _, space := range spaces {
-		if contains(nc.Status.Storaged.RemovedSpaces, *space.Id.SpaceID) {
-			continue
-		}
-		if err := removeHost(s.clientSet, metaClient, nc, *space.Id.SpaceID, hosts); err != nil {
-			klog.Errorf("storaged cluster [%s/%s] remove failure hosts %v failed: %v", ns, componentName, hosts, err)
-			return err
-		}
-		klog.Infof("storaged cluster [%s/%s] remove failure hosts %v in the space %s successfully", ns, componentName, hosts, space.Name)
-	}
-
-	for podName, fh := range nc.Status.Storaged.FailureHosts {
-		fh.HostDeleted = pointer.Bool(true)
-		fh.DataBalanced = pointer.Bool(false)
-		nc.Status.Storaged.FailureHosts[podName] = fh
-	}
-
-	nc.Status.Storaged.RemovedSpaces = nil
-	nc.Status.Storaged.LastBalanceJob = nil
-	return utilerrors.ReconcileErrorf("try to remove storaged cluster [%s/%s] failure host for recovery", ns, componentName)
-}
-
 func (s *storagedFailover) deleteFailurePodAndPVC(nc *v1alpha1.NebulaCluster) error {
 	cl := label.New().Cluster(nc.GetClusterName()).Storaged()
 	for podName, fh := range nc.Status.Storaged.FailureHosts {
@@ -314,7 +239,7 @@ func (s *storagedFailover) checkPendingPod(nc *v1alpha1.NebulaCluster) error {
 func (s *storagedFailover) balanceData(nc *v1alpha1.NebulaCluster) error {
 	podNames := make([]string, 0)
 	for podName, fh := range nc.Status.Storaged.FailureHosts {
-		if pointer.BoolDeref(fh.DataBalanced, true) {
+		if pointer.BoolDeref(fh.DataBalanced, false) {
 			continue
 		}
 		pod, err := s.clientSet.Pod().GetPod(nc.Namespace, podName)
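
The same default flip guards balanceData's skip condition. A quick truth-table sketch of which failure hosts get skipped: under the old default an unset DataBalanced was skipped (so a rebuilt pod's data might never be rebalanced), while under the new default only hosts explicitly marked balanced are skipped.

package main

import "fmt"

// Which failure hosts does balanceData skip? Only those whose DataBalanced
// dereferences to true. The old call BoolDeref(v, true) also skipped the
// unset (nil) case; the new call BoolDeref(v, false) does not.
func main() {
	t, f := true, false
	cases := map[string]*bool{"unset": nil, "true": &t, "false": &f}

	for name, v := range cases {
		skipOld := v == nil || *v // BoolDeref(v, true)
		skipNew := v != nil && *v // BoolDeref(v, false)
		fmt.Printf("DataBalanced=%-5s old skip=%-5v new skip=%v\n",
			name, skipOld, skipNew)
	}
}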
