Skip to content

Commit

Permalink
Add probes for liveness and readiness
Browse files Browse the repository at this point in the history
Signed-off-by: Claudia <[email protected]>
  • Loading branch information
cmisale committed Aug 23, 2024
1 parent 94757de commit b25afca
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 12 deletions.
2 changes: 1 addition & 1 deletion autopilot-daemon/helm-charts/autopilot/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
apiVersion: v2
name: autopilot-daemon
name: autopilot
description: A Helm chart for Kubernetes

# A chart can be either an 'application' or a 'library' chart.
Expand Down
26 changes: 24 additions & 2 deletions autopilot-daemon/helm-charts/autopilot/templates/autopilot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: DaemonSet
metadata:
labels:
app: autopilot
name: {{ printf "%s-%s" .Chart.Name .Release.Name }}
name: {{ printf "%s" .Chart.Name }}
namespace: {{ .Values.namespace.name }}
spec:
selector:
Expand Down Expand Up @@ -56,7 +56,7 @@ spec:
- -c
- |
iperf3 -s -p 6310 -D
/usr/local/bin/autopilot --port {{ .Values.service.port }} --loglevel={{ .Values.loglevel }} --bw {{ .Values.PCIeBW }} --w {{ .Values.repeat }} --intrusive-check-timer {{ .Values.intrusive }}
/usr/local/bin/autopilot --port {{ .Values.service.port }} --loglevel={{ .Values.loglevel }} --bw {{ .Values.PCIeBW }} --w {{ .Values.repeat }} --invasive-check-timer {{ .Values.invasive }}
imagePullPolicy: {{ .Values.image.pullPolicy }}
name: autopilot
securityContext:
Expand Down Expand Up @@ -87,6 +87,28 @@ spec:
name: healthcheck
- containerPort: 8081
name: http
- containerPort: 8080
name: readinessprobe
readinessProbe:
httpGet:
path: /readinessprobe
port: 8080
initialDelaySeconds: 15
periodSeconds: 120
timeoutSeconds: 10
livenessProbe:
initialDelaySeconds: 15
periodSeconds: 120
timeoutSeconds: 15
{{- if .Values.onlyOnGPUNodes }}
exec:
command:
- nvidia-smi
{{- else }}
httpGet:
path: /readinessprobe
port: 8080
{{- end}}
resources:
{{- toYaml .Values.resources | nindent 12 }}
volumeMounts:
Expand Down
16 changes: 15 additions & 1 deletion autopilot-daemon/helm-charts/autopilot/templates/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,24 @@ metadata:
annotations:
{{- toYaml .Values.serviceAnnotations | nindent 4 }}
spec:
# type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
protocol: TCP
name: healthcheck
selector:
app: autopilot
---
apiVersion: v1
kind: Service
metadata:
labels:
app: autopilot
name: autopilot-readinessprobe
namespace: {{ .Values.namespace.name }}
spec:
ports:
- port: 8080
protocol: TCP
name: readinessprobe
selector:
app: autopilot
2 changes: 1 addition & 1 deletion autopilot-daemon/helm-charts/autopilot/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ PCIeBW: 4
repeat: 1

# Timer for periodic invasive checks, in hours (e.g., dcgmi diag -r 3). Set to 0 to disable them (e.g., on systems without NVIDIA GPUs).
intrusive: 4
invasive: 4

# Image pull secret if the image is in a private repository
pullSecrets:
Expand Down
24 changes: 18 additions & 6 deletions autopilot-daemon/pkg/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ func main() {
logFile := flag.String("logfile", "", "File where to save all the events")
v := flag.String("loglevel", "2", "Log level")
repeat := flag.Int("w", 24, "Run all tests periodically on each node. Time set in hours. Defaults to 24h")
intrusive := flag.Int("intrusive-check-timer", 4, "Run intrusive checks (e.g., dcgmi level 3) on each node when GPUs are free. Time set in hours. Defaults to 4h. Set to 0 to avoid intrusive checks")
invasive := flag.Int("invasive-check-timer", 4, "Run invasive checks (e.g., dcgmi level 3) on each node when GPUs are free. Time set in hours. Defaults to 4h. Set to 0 to avoid invasive checks")

flag.Parse()

Expand Down Expand Up @@ -58,6 +58,18 @@ func main() {
}
}()

readinessMux := http.NewServeMux()
readinessMux.Handle("/readinessprobe", handlers.ReadinessProbeHandler())

go func() {
klog.Info("Serving Readiness Probe on :8080")
err := http.ListenAndServe(":8080", readinessMux)
if err != nil {
klog.Error(err.Error())
os.Exit(1)
}
}()

hcMux := http.NewServeMux()

hcMux.Handle("/pciebw", handlers.PCIeBWHandler(utils.UserConfig.BWThreshold))
Expand Down Expand Up @@ -108,15 +120,15 @@ func main() {

periodicChecksTicker := time.NewTicker(time.Duration(*repeat) * time.Hour)
defer periodicChecksTicker.Stop()
intrusiveChecksTicker := time.NewTicker(time.Duration(*intrusive) * time.Hour)
defer intrusiveChecksTicker.Stop()
invasiveChecksTicker := time.NewTicker(time.Duration(*invasive) * time.Hour)
defer invasiveChecksTicker.Stop()
for {
select {
case <-periodicChecksTicker.C:
handlers.PeriodicCheckTimer()
case <-intrusiveChecksTicker.C:
if *intrusive > 0 {
handlers.IntrusiveCheckTimer()
case <-invasiveChecksTicker.C:
if *invasive > 0 {
handlers.InvasiveCheckTimer()
}
}
}
Expand Down
11 changes: 11 additions & 0 deletions autopilot-daemon/pkg/handlers/handler.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package handlers

import (
"encoding/json"
"net/http"
"os"
"strings"
Expand Down Expand Up @@ -262,3 +263,13 @@ func PVCHandler() http.Handler {
}
return http.HandlerFunc(fn)
}

// ReadinessProbeHandler returns the HTTP handler that answers probe requests.
//
// Every request is answered with a JSON-encoded HealthResult built from the
// values "readinessProbe" and "ready", sent with a Content-Type of
// application/json and a 200 OK status. If the payload cannot be encoded the
// handler replies with 500 Internal Server Error instead.
func ReadinessProbeHandler() http.Handler {
	fn := func(w http.ResponseWriter, r *http.Request) {
		data := HealthResult{"readinessProbe", "ready"}

		// Encode before touching the response: once the status line has been
		// written an encoding failure could no longer be reported to the client.
		payload, err := json.Marshal(data)
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		w.Header().Set("Content-Type", "application/json")
		// A probe creates no resource, so 200 OK is the accurate status
		// (201 Created is reserved for requests that create something).
		w.WriteHeader(http.StatusOK)

		// The trailing newline keeps the body identical to what
		// json.Encoder.Encode produces. A failed write means the client has
		// gone away, so there is nobody left to report the error to.
		_, _ = w.Write(append(payload, '\n'))
	}
	return http.HandlerFunc(fn)
}
2 changes: 1 addition & 1 deletion autopilot-daemon/pkg/handlers/healthchecks.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ func PeriodicCheckTimer() {
runAllTestsLocal("all", checks, "1", "None", "None", nil)
}

func IntrusiveCheckTimer() {
func InvasiveCheckTimer() {
klog.Info("Trying to run an intrusive check")
utils.HealthcheckLock.Lock()
defer utils.HealthcheckLock.Unlock()
Expand Down
6 changes: 6 additions & 0 deletions autopilot-daemon/pkg/handlers/messagestruct.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package handlers

// HealthResult is a two-field message: a name and a free-form body.
//
// The fields carry no struct tags, so encoding/json emits them under their
// Go names ("Name" and "Body"). Field order matters to callers that build
// the value with an unkeyed literal: Name comes first, then Body.
type HealthResult struct {
	Name, Body string
}

0 comments on commit b25afca

Please sign in to comment.