Skip to content

Commit

Permalink
Merge pull request #775 from lyzuiui/dev-node-agent-heartbeat
Browse files Browse the repository at this point in the history
globalnode-server-heartbeat
  • Loading branch information
duanmengkk authored Dec 25, 2024
2 parents 5ff0bd6 + 1d63c27 commit 7daed75
Show file tree
Hide file tree
Showing 5 changed files with 431 additions and 0 deletions.
8 changes: 8 additions & 0 deletions cmd/kubenest/operator/app/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,14 @@ func run(ctx context.Context, config *config.Config) error {
return fmt.Errorf("error starting %s: %v", constants.GlobalNodeControllerName, err)
}

GlobalNodeConditionController := glnodecontroller.NewGlobalNodeStatusController(
mgr.GetClient(),
kosmosClient,
)
if err := mgr.Add(GlobalNodeConditionController); err != nil {
return fmt.Errorf("error starting %s: %v", glnodecontroller.GlobalNodeStatusControllerName, err)
}

if err := startEndPointsControllers(mgr); err != nil {
return err
}
Expand Down
3 changes: 3 additions & 0 deletions deploy/crds/kosmos.io_globalnodes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ spec:
- jsonPath: .spec.nodeIP
name: NODE_IP
type: string
- jsonPath: .status.conditions[0].type
name: TYPE
type: string
- jsonPath: .spec.state
name: STATE
type: string
Expand Down
1 change: 1 addition & 0 deletions pkg/apis/kosmos/v1alpha1/global_node_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
// +kubebuilder:subresource:status
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// +kubebuilder:printcolumn:name="NODE_IP",type=string,JSONPath=`.spec.nodeIP`
// +kubebuilder:printcolumn:name="Type",type=string,JSONPath=`.status.conditions[0].type`
// +kubebuilder:printcolumn:name="STATE",type=string,JSONPath=`.spec.state`
// +kubebuilder:printcolumn:name="VIRTUAL_CLUSTER",type=string,JSONPath=`.status.virtualCluster`

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
package globalnodecontroller

import (
"context"
"time"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/kosmos.io/kosmos/pkg/apis/kosmos/v1alpha1"
"github.com/kosmos.io/kosmos/pkg/generated/clientset/versioned"
)

const (
GlobalNodeStatusControllerName = "global-node-status-controller"
DefaultStatusUpdateInterval = 15 * time.Second
ClientHeartbeatThreshold = 10 * time.Second
)

type GlobalNodeStatusController struct {
root client.Client
statusInterval time.Duration

kosmosClient versioned.Interface
}

func NewGlobalNodeStatusController(
root client.Client,
kosmosClient versioned.Interface,
) *GlobalNodeStatusController {
return &GlobalNodeStatusController{
root: root,
statusInterval: DefaultStatusUpdateInterval,
kosmosClient: kosmosClient,
}
}
func (c *GlobalNodeStatusController) Start(ctx context.Context) error {
go wait.UntilWithContext(ctx, c.syncGlobalNodeStatus, c.statusInterval)

<-ctx.Done()
return nil
}
func (c *GlobalNodeStatusController) syncGlobalNodeStatus(ctx context.Context) {
globalNodes := make([]*v1alpha1.GlobalNode, 0)
//c.globalNodeLock.Lock()
//defer c.globalNodeLock.Unlock()

nodeList, err := c.kosmosClient.KosmosV1alpha1().GlobalNodes().List(ctx, metav1.ListOptions{})
if err != nil {
klog.Errorf("Failed to fetch GlobalNodes: %v", err)
return
}
for _, node := range nodeList.Items {
nodeCopy := node.DeepCopy()
globalNodes = append(globalNodes, nodeCopy)
}

err = c.updateGlobalNodeStatus(ctx, globalNodes)
if err != nil {
klog.Errorf("Failed to sync global node status: %v", err)
}
}

func (c *GlobalNodeStatusController) updateGlobalNodeStatus(
ctx context.Context,
globalNodes []*v1alpha1.GlobalNode,
) error {
for _, globalNode := range globalNodes {
err := c.updateStatusForGlobalNode(ctx, globalNode)
if err != nil {
klog.Errorf("Failed to update status for global node %s: %v", globalNode.Name, err)
return err
}
}
return nil
}

func (c *GlobalNodeStatusController) updateStatusForGlobalNode(
ctx context.Context,
globalNode *v1alpha1.GlobalNode,
) error {
return retry.RetryOnConflict(retry.DefaultRetry, func() error {
currentNode, err := c.kosmosClient.KosmosV1alpha1().GlobalNodes().Get(ctx, globalNode.Name, metav1.GetOptions{})
if err != nil {
klog.Errorf("Failed to fetch the latest GlobalNode %s: %v", globalNode.Name, err)
return err
}

if len(currentNode.Status.Conditions) == 0 {
klog.Warningf("GlobalNode %s has no conditions, skipping status update", currentNode.Name)
return nil
}

condition := currentNode.Status.Conditions[0]
lastHeartbeatTime := condition.LastHeartbeatTime
timeDiff := time.Since(lastHeartbeatTime.Time)

statusType := "Ready"
if timeDiff > ClientHeartbeatThreshold {
statusType = "NotReady"
}

if string(condition.Type) != statusType {
condition.Type = v1.NodeConditionType(statusType)
condition.LastTransitionTime = metav1.NewTime(time.Now())

currentNode.Status.Conditions[0] = condition

_, err = c.kosmosClient.KosmosV1alpha1().GlobalNodes().UpdateStatus(ctx, currentNode, metav1.UpdateOptions{})
if err != nil {
if errors.IsConflict(err) {
klog.Warningf("Conflict detected while updating status for GlobalNode %s, retrying...", globalNode.Name)
} else {
klog.Errorf("Failed to update status for GlobalNode %s: %v", globalNode.Name, err)
}
return err
}

klog.Infof("Successfully updated status for GlobalNode %s to %s", globalNode.Name, statusType)
} else {
klog.Infof("No status update required for GlobalNode %s, current status: %s", globalNode.Name, condition.Type)
}

return nil
})
}
Loading

0 comments on commit 7daed75

Please sign in to comment.