Skip to content

Commit

Permalink
Adds TESTING node label during invasive checks
Browse files Browse the repository at this point in the history
Signed-off-by: Jim Cadden <[email protected]>
  • Loading branch information
jimcadden committed Sep 12, 2024
1 parent b25afca commit 24c0d06
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 14 deletions.
19 changes: 10 additions & 9 deletions autopilot-daemon/pkg/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,17 @@ func main() {

hcMux := http.NewServeMux()

hcMux.Handle("/pciebw", handlers.PCIeBWHandler(utils.UserConfig.BWThreshold))
hcMux.Handle("/remapped", handlers.RemappedRowsHandler())
hcMux.Handle("/status", handlers.SystemStatusHandler())
hcMux.Handle("/dcgm", handlers.DCGMHandler())
hcMux.Handle("/gpumem", handlers.GpuMemHandler())
hcMux.Handle("/gpupower", handlers.GpuPowerHandler())
hcMux.Handle("/iperf", handlers.IperfHandler())
hcMux.Handle("/iperfservers", handlers.StartIperfServersHandler())
hcMux.Handle("/dcgm", handlers.DCGMHandler())
hcMux.Handle("/invasive", handlers.InvasiveCheckHandler())
hcMux.Handle("/pciebw", handlers.PCIeBWHandler(utils.UserConfig.BWThreshold))
hcMux.Handle("/ping", handlers.PingHandler())
hcMux.Handle("/gpupower", handlers.GpuPowerHandler())
hcMux.Handle("/gpumem", handlers.GpuMemHandler())
hcMux.Handle("/pvc", handlers.PVCHandler())
hcMux.Handle("/remapped", handlers.RemappedRowsHandler())
hcMux.Handle("/status", handlers.SystemStatusHandler())

s := &http.Server{
Addr: ":" + *port,
Expand Down Expand Up @@ -116,7 +117,7 @@ func main() {
go utils.WatchNode()

// Run the health checks at startup, then start the timer
handlers.PeriodicCheckTimer()
handlers.PeriodicCheck()

periodicChecksTicker := time.NewTicker(time.Duration(*repeat) * time.Hour)
defer periodicChecksTicker.Stop()
Expand All @@ -125,10 +126,10 @@ func main() {
for {
select {
case <-periodicChecksTicker.C:
handlers.PeriodicCheckTimer()
handlers.PeriodicCheck()
case <-invasiveChecksTicker.C:
if *invasive > 0 {
handlers.InvasiveCheckTimer()
handlers.InvasiveCheck()
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions autopilot-daemon/pkg/handlers/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,14 @@ func PingHandler() http.Handler {
return http.HandlerFunc(fn)
}

// InvasiveCheckHandler returns an http.Handler that writes a short notice to
// the response and then calls InvasiveCheck. The call is synchronous, so the
// handler does not return until InvasiveCheck has returned.
func InvasiveCheckHandler() http.Handler {
	fn := func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("Launching invasive health checks. Results will be added to 'autopilot.ibm.com/gpuhealth' node label"))
		InvasiveCheck()
	}
	return http.HandlerFunc(fn)
}

func IperfHandler() http.Handler {
fn := func(w http.ResponseWriter, r *http.Request) {
w.Write([]byte("Iperf3 test"))
Expand Down
32 changes: 28 additions & 4 deletions autopilot-daemon/pkg/handlers/healthchecks.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,44 @@ import (
"k8s.io/klog/v2"
)

func PeriodicCheckTimer() {
// PeriodicCheck logs that a periodic check is starting, then runs the checks
// returned by utils.GetPeriodicChecks through runAllTestsLocal.
// utils.HealthcheckLock is held from before the checks are fetched until the
// function returns.
func PeriodicCheck() {
	klog.Info("Running a periodic check")

	utils.HealthcheckLock.Lock()
	defer utils.HealthcheckLock.Unlock()

	runAllTestsLocal("all", utils.GetPeriodicChecks(), "1", "None", "None", nil)
}

func InvasiveCheckTimer() {
klog.Info("Trying to run an intrusive check")
// gpuHealthLabelPatch builds the JSON document, passed to utils.PatchNode,
// that sets the "autopilot.ibm.com/gpuhealth" node label to value.
func gpuHealthLabelPatch(value string) string {
	return `{"metadata":{"labels":{"autopilot.ibm.com/gpuhealth":"` + value + `"}}}`
}

// InvasiveCheck attempts to launch the "dcgm" job on the node named by the
// NODE_NAME environment variable.
//
// utils.HealthcheckLock is held for the whole call. If utils.GPUsAvailability
// reports false, nothing else happens. Otherwise the node's
// "autopilot.ibm.com/gpuhealth" label is set to "TESTING" before the job is
// created; if utils.CreateJob returns an error, the label is set to the empty
// string.
func InvasiveCheck() {
	klog.Info("Trying to run an invasive check")
	utils.HealthcheckLock.Lock()
	defer utils.HealthcheckLock.Unlock()

	if !utils.GPUsAvailability() {
		return
	}

	nodeName := os.Getenv("NODE_NAME")
	klog.Info("Beginning invasive health checks, updating node label =TESTING for node ", nodeName)
	utils.PatchNode(gpuHealthLabelPatch("TESTING"), nodeName)

	// The job is created exactly once, after the TESTING label is in place,
	// so the label is only cleared when that single attempt fails.
	if err := utils.CreateJob("dcgm"); err != nil {
		klog.Info("Invasive health checks failed, updating node label for node ", nodeName)
		utils.PatchNode(gpuHealthLabelPatch(""), nodeName)
	}
}

Expand Down
6 changes: 5 additions & 1 deletion autopilot-daemon/pkg/utils/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func GPUsAvailability() bool {
return true
}

func CreateJob(healthcheck string) {
func CreateJob(healthcheck string) error {
var args []string
var cmd []string
switch healthcheck {
Expand All @@ -114,12 +114,14 @@ func CreateJob(healthcheck string) {
fieldselector, err := fields.ParseSelector("metadata.name=" + os.Getenv("POD_NAME"))
if err != nil {
klog.Info("Error in creating the field selector", err.Error())
return err
}
pods, err := cset.Cset.CoreV1().Pods("autopilot").List(context.TODO(), metav1.ListOptions{
FieldSelector: fieldselector.String(),
})
if err != nil {
klog.Info("Cannot get pod:", err.Error())
return err
}
autopilotPod := pods.Items[0]
ttlsec := int32(30) // setting TTL to 30 sec
Expand Down Expand Up @@ -178,8 +180,10 @@ func CreateJob(healthcheck string) {
metav1.CreateOptions{})
if err != nil {
klog.Info("Couldn't create Job ", err.Error())
return err
}
klog.Info("Created")
return nil
}

func CreatePVC() error {
Expand Down

0 comments on commit 24c0d06

Please sign in to comment.