Skip to content

Commit

Permalink
Merge pull request kubernetes-sigs#1535 from marquiz/devel/grpc-probe
Browse files Browse the repository at this point in the history
nfd-master: run a separate gRPC health server
  • Loading branch information
k8s-ci-robot authored Jan 5, 2024
2 parents 64eba87 + a053efd commit 4501bed
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 15 deletions.
4 changes: 3 additions & 1 deletion cmd/nfd-master/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ import (

const (
// ProgramName is the canonical name of this program
ProgramName = "nfd-master"
ProgramName = "nfd-master"
GrpcHealthPort = 8082
)

func main() {
Expand Down Expand Up @@ -100,6 +101,7 @@ func main() {
utils.ConfigureGrpcKlog()

// Get new NfdMaster instance
args.GrpcHealthPort = GrpcHealthPort
instance, err := master.NewNfdMaster(args)
if err != nil {
klog.ErrorS(err, "failed to initialize NfdMaster instance")
Expand Down
6 changes: 2 additions & 4 deletions deployment/base/master/master-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ spec:
imagePullPolicy: Always
livenessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
Expand All @@ -37,5 +37,3 @@ spec:
ports:
- name: metrics
containerPort: 8081
- name: grpc
containerPort: 8080
4 changes: 2 additions & 2 deletions deployment/helm/node-feature-discovery/templates/master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ spec:
imagePullPolicy: {{ .Values.image.pullPolicy }}
livenessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
Expand Down
57 changes: 49 additions & 8 deletions pkg/nfd-master/nfd-master.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ type Args struct {
CrdController bool
EnableNodeFeatureApi bool
Port int
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
// Could be removed when gRPC labler service is dropped (when nfd-worker tests stop running nfd-master).
GrpcHealthPort int
Prune bool
VerifyNodeName bool
Options string
Expand Down Expand Up @@ -144,6 +147,7 @@ type nfdMaster struct {
nodeName string
configFilePath string
server *grpc.Server
healthServer *grpc.Server
stop chan struct{}
ready chan bool
apihelper apihelper.APIHelpers
Expand Down Expand Up @@ -270,7 +274,11 @@ func (m *nfdMaster) Run() error {

// Run gRPC server
grpcErr := make(chan error, 1)
go m.runGrpcServer(grpcErr)
// If the NodeFeature API is enabled, don'tregister the labeler API
// server. Otherwise, register the labeler server.
if !m.args.EnableNodeFeatureApi {
go m.runGrpcServer(grpcErr)
}

// Run updater that handles events from the nfd CRD API.
if m.nfdController != nil {
Expand All @@ -281,6 +289,13 @@ func (m *nfdMaster) Run() error {
}
}

// Start gRPC server for liveness probe (at this point we're "live")
if m.args.GrpcHealthPort != 0 {
if err := m.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}
}

// Notify that we're ready to accept connections
m.ready <- true
close(m.ready)
Expand Down Expand Up @@ -323,6 +338,32 @@ func (m *nfdMaster) Run() error {
}
}

// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes.
// TODO: improve status checking e.g. with watchdog in the main event loop and
// cheking that node updater pool is alive.
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort))
if err != nil {
return fmt.Errorf("failed to listen: %w", err)
}

s := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort)

go func() {
defer func() {
lis.Close()
}()
if err := s.Serve(lis); err != nil {
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
}
klog.InfoS("gRPC health server stopped")
}()
m.healthServer = s
return nil
}

func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
// Create server listening for TCP connections
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.Port))
Expand Down Expand Up @@ -352,13 +393,8 @@ func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
}
m.server = grpc.NewServer(serverOpts...)

// If the NodeFeature API is enabled, don'tregister the labeler API
// server. Otherwise, register the labeler server.
if !m.args.EnableNodeFeatureApi {
pb.RegisterLabelerServer(m.server, m)
}
pb.RegisterLabelerServer(m.server, m)

grpc_health_v1.RegisterHealthServer(m.server, health.NewServer())
klog.InfoS("gRPC server serving", "port", m.args.Port)

// Run gRPC server
Expand Down Expand Up @@ -421,7 +457,12 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {

// Stop NfdMaster
func (m *nfdMaster) Stop() {
m.server.GracefulStop()
if m.server != nil {
m.server.GracefulStop()
}
if m.healthServer != nil {
m.healthServer.GracefulStop()
}

if m.nfdController != nil {
m.nfdController.stop()
Expand Down

0 comments on commit 4501bed

Please sign in to comment.