Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nfd-topology-updater: Add liveness probe #1643

Merged
merged 1 commit into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/nfd-topology-updater/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
// ProgramName is the canonical name of this program
ProgramName = "nfd-topology-updater"
kubeletSecurePort = 10250
GrpcHealthPort = 8082
)

var DefaultKubeletStateDir = path.Join(string(hostpath.VarDir), "lib", "kubelet")
Expand All @@ -54,6 +55,7 @@
utils.ConfigureGrpcKlog()

// Get new TopologyUpdater instance
args.GrpcHealthPort = GrpcHealthPort

Check warning on line 58 in cmd/nfd-topology-updater/main.go

View check run for this annotation

Codecov / codecov/patch

cmd/nfd-topology-updater/main.go#L58

Added line #L58 was not covered by tests
instance, err := topology.NewTopologyUpdater(*args, *resourcemonitorArgs)
if err != nil {
klog.ErrorS(err, "failed to initialize topology updater instance")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@ spec:
- name: nfd-topology-updater
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
imagePullPolicy: Always
livenessProbe:
grpc:
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
command:
- "nfd-topology-updater"
args: []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,17 @@ spec:
- name: topology-updater
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
livenessProbe:
grpc:
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
env:
- name: NODE_NAME
valueFrom:
Expand Down
14 changes: 14 additions & 0 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,20 @@ topologyUpdater:
readOnlyRootFilesystem: true
runAsUser: 0

# livenessProbe: {}
## NOTE: Currently not configurable; the defaults are shown here for documentation purposes only.
# grpc:
# port: 8082
# initialDelaySeconds: 10
# periodSeconds: 10
# readinessProbe: {}
## NOTE: Currently not configurable; the defaults are shown here for documentation purposes only.
# grpc:
# port: 8082
# initialDelaySeconds: 5
# periodSeconds: 10
# failureThreshold: 10

resources:
limits:
cpu: 100m
Expand Down
44 changes: 44 additions & 0 deletions pkg/nfd-topology-updater/nfd-topology-updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,16 @@

import (
"fmt"
"net"
"net/url"
"os"
"path/filepath"

"golang.org/x/net/context"

"google.golang.org/grpc"
"google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -58,6 +62,7 @@
KubeConfigFile string
ConfigFile string
KubeletStateDir string
GrpcHealthPort int

Klog map[string]*utils.KlogFlagVal
}
Expand Down Expand Up @@ -85,6 +90,7 @@
ownerRefs []metav1.OwnerReference
k8sClient k8sclient.Interface
kubeletConfigFunc func() (*kubeletconfigv1beta1.KubeletConfiguration, error)
healthServer *grpc.Server
}

// NewTopologyUpdater creates a new NfdTopologyUpdater instance.
Expand Down Expand Up @@ -128,6 +134,29 @@
return klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope, nil
}

// startGrpcHealthServer starts a gRPC server that exposes the gRPC health
// service on the TCP port given by w.args.GrpcHealthPort.
//
// The listener is opened synchronously, so a failure to bind the port is
// returned to the caller directly. Serving happens in a background goroutine:
// if Serve returns a non-nil error, it is wrapped and sent on errChan. On
// success the server is stored in w.healthServer and nil is returned.
func (w *nfdTopologyUpdater) startGrpcHealthServer(errChan chan<- error) error {
// Listen on all interfaces on the configured health port.
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort))
if err != nil {
return fmt.Errorf("failed to listen: %w", err)
}

Check warning on line 141 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L137-L141

Added lines #L137 - L141 were not covered by tests

// Create a dedicated gRPC server and register the health service on it.
s := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)

// Serve in the background so the caller can continue with its own work.
go func() {
// Close the listener once Serve returns; the error from Close is ignored.
defer func() {
lis.Close()
}()
// Report a serving failure to the caller through errChan.
if err := s.Serve(lis); err != nil {
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
}
// Logged whenever Serve returns, with or without an error.
klog.InfoS("gRPC health server stopped")

Check warning on line 154 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L143-L154

Added lines #L143 - L154 were not covered by tests
}()
// Keep a reference to the server on the receiver for later use.
w.healthServer = s
return nil

Check warning on line 157 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L156-L157

Added lines #L156 - L157 were not covered by tests
}

// Run nfdTopologyUpdater. Returns if a fatal error is encountered, or, after
// one request if OneShot is set to 'true' in the updater args.
func (w *nfdTopologyUpdater) Run() error {
Expand Down Expand Up @@ -187,8 +216,20 @@
return fmt.Errorf("failed to obtain node resource information: %w", err)
}

grpcErr := make(chan error, 1)

// Start gRPC server for liveness probe (at this point we're "live")
if w.args.GrpcHealthPort != 0 {
if err := w.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}

Check warning on line 225 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L219-L225

Added lines #L219 - L225 were not covered by tests
}

for {
select {
case err := <-grpcErr:
return fmt.Errorf("error in serving gRPC: %w", err)

Check warning on line 231 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L230-L231

Added lines #L230 - L231 were not covered by tests

case info := <-w.eventSource:
klog.V(4).InfoS("event received, scanning...", "event", info.Event)
scanResponse, err := resScan.Scan()
Expand Down Expand Up @@ -217,6 +258,9 @@

case <-w.stop:
klog.InfoS("shutting down nfd-topology-updater")
if w.healthServer != nil {
w.healthServer.GracefulStop()
}

Check warning on line 263 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L261-L263

Added lines #L261 - L263 were not covered by tests
return nil
}
}
Expand Down