diff --git a/autopilot-daemon/pkg/cmd/main.go b/autopilot-daemon/pkg/cmd/main.go
index 0d0ceec..5205155 100644
--- a/autopilot-daemon/pkg/cmd/main.go
+++ b/autopilot-daemon/pkg/cmd/main.go
@@ -72,16 +72,17 @@ func main() {
 
 	hcMux := http.NewServeMux()
 
-	hcMux.Handle("/pciebw", handlers.PCIeBWHandler(utils.UserConfig.BWThreshold))
-	hcMux.Handle("/remapped", handlers.RemappedRowsHandler())
-	hcMux.Handle("/status", handlers.SystemStatusHandler())
+	hcMux.Handle("/dcgm", handlers.DCGMHandler())
+	hcMux.Handle("/gpumem", handlers.GpuMemHandler())
+	hcMux.Handle("/gpupower", handlers.GpuPowerHandler())
 	hcMux.Handle("/iperf", handlers.IperfHandler())
 	hcMux.Handle("/iperfservers", handlers.StartIperfServersHandler())
-	hcMux.Handle("/dcgm", handlers.DCGMHandler())
+	hcMux.Handle("/invasive", handlers.InvasiveCheckHandler())
+	hcMux.Handle("/pciebw", handlers.PCIeBWHandler(utils.UserConfig.BWThreshold))
 	hcMux.Handle("/ping", handlers.PingHandler())
-	hcMux.Handle("/gpupower", handlers.GpuPowerHandler())
-	hcMux.Handle("/gpumem", handlers.GpuMemHandler())
 	hcMux.Handle("/pvc", handlers.PVCHandler())
+	hcMux.Handle("/remapped", handlers.RemappedRowsHandler())
+	hcMux.Handle("/status", handlers.SystemStatusHandler())
 
 	s := &http.Server{
 		Addr: ":" + *port,
@@ -116,7 +117,7 @@ func main() {
 	go utils.WatchNode()
 
 	// Run the health checks at startup, then start the timer
-	handlers.PeriodicCheckTimer()
+	handlers.PeriodicCheck()
 
 	periodicChecksTicker := time.NewTicker(time.Duration(*repeat) * time.Hour)
 	defer periodicChecksTicker.Stop()
@@ -125,10 +126,10 @@ func main() {
 	for {
 		select {
 		case <-periodicChecksTicker.C:
-			handlers.PeriodicCheckTimer()
+			handlers.PeriodicCheck()
 		case <-invasiveChecksTicker.C:
 			if *invasive > 0 {
-				handlers.InvasiveCheckTimer()
+				handlers.InvasiveCheck()
 			}
 		}
 	}
diff --git a/autopilot-daemon/pkg/handlers/handler.go b/autopilot-daemon/pkg/handlers/handler.go
index 40a7346..15db4b1 100644
--- a/autopilot-daemon/pkg/handlers/handler.go
+++ b/autopilot-daemon/pkg/handlers/handler.go
@@ -144,6 +144,17 @@ func PingHandler() http.Handler {
 	return http.HandlerFunc(fn)
 }
 
+// InvasiveCheckHandler returns an http.Handler that writes a short
+// acknowledgement to the response and then calls InvasiveCheck before
+// returning.
+func InvasiveCheckHandler() http.Handler {
+	fn := func(w http.ResponseWriter, r *http.Request) {
+		w.Write([]byte("Launching invasive health checks. Results will be added to 'autopilot.ibm.com/gpuhealth' node label"))
+		InvasiveCheck()
+	}
+	return http.HandlerFunc(fn)
+}
+
 func IperfHandler() http.Handler {
 	fn := func(w http.ResponseWriter, r *http.Request) {
 		w.Write([]byte("Iperf3 test"))
diff --git a/autopilot-daemon/pkg/handlers/healthchecks.go b/autopilot-daemon/pkg/handlers/healthchecks.go
index 5db5dbb..9d5e415 100644
--- a/autopilot-daemon/pkg/handlers/healthchecks.go
+++ b/autopilot-daemon/pkg/handlers/healthchecks.go
@@ -13,7 +13,7 @@ import (
 	"k8s.io/klog/v2"
 )
 
-func PeriodicCheckTimer() {
+func PeriodicCheck() {
 	klog.Info("Running a periodic check")
 	utils.HealthcheckLock.Lock()
 	defer utils.HealthcheckLock.Unlock()
@@ -21,12 +21,39 @@ func PeriodicCheckTimer() {
 	runAllTestsLocal("all", checks, "1", "None", "None", nil)
 }
 
-func InvasiveCheckTimer() {
-	klog.Info("Trying to run an intrusive check")
+// gpuHealthLabelPatch builds the JSON patch body that sets the
+// "autopilot.ibm.com/gpuhealth" node label to value. value is inserted
+// verbatim, so it must not contain characters that need JSON escaping.
+func gpuHealthLabelPatch(value string) string {
+	return `
+	{
+		"metadata": {
+			"labels": {
+				"autopilot.ibm.com/gpuhealth": "` + value + `"
+			}
+		}
+	}
+	`
+}
+
+// InvasiveCheck runs the invasive health check while holding
+// utils.HealthcheckLock. When utils.GPUsAvailability reports true it sets the
+// gpuhealth node label to TESTING and calls utils.CreateJob("dcgm"); if that
+// call returns an error the label is reset to the empty string.
+// NOTE(review): whatever utils.PatchNode returns is not checked here; confirm
+// whether it reports failures that should be handled.
+func InvasiveCheck() {
+	klog.Info("Trying to run an invasive check")
 	utils.HealthcheckLock.Lock()
 	defer utils.HealthcheckLock.Unlock()
 	if utils.GPUsAvailability() {
-		utils.CreateJob("dcgm")
+		nodeName := os.Getenv("NODE_NAME")
+		klog.Info("Beginning invasive health checks, updating node label =TESTING for node ", nodeName)
+		utils.PatchNode(gpuHealthLabelPatch("TESTING"), nodeName)
+		if err := utils.CreateJob("dcgm"); err != nil {
+			klog.Info("Invasive health checks failed, updating node label for node ", nodeName)
+			utils.PatchNode(gpuHealthLabelPatch(""), nodeName)
+		}
 	}
 }
 
diff --git a/autopilot-daemon/pkg/utils/functions.go b/autopilot-daemon/pkg/utils/functions.go
index cd31024..88137eb 100644
--- a/autopilot-daemon/pkg/utils/functions.go
+++ b/autopilot-daemon/pkg/utils/functions.go
@@ -101,7 +101,7 @@ func GPUsAvailability() bool {
 	return true
 }
 
-func CreateJob(healthcheck string) {
+func CreateJob(healthcheck string) error {
 	var args []string
 	var cmd []string
 	switch healthcheck {
@@ -114,12 +114,14 @@ func CreateJob(healthcheck string) {
 	fieldselector, err := fields.ParseSelector("metadata.name=" + os.Getenv("POD_NAME"))
 	if err != nil {
 		klog.Info("Error in creating the field selector", err.Error())
+		return err
 	}
 	pods, err := cset.Cset.CoreV1().Pods("autopilot").List(context.TODO(), metav1.ListOptions{
 		FieldSelector: fieldselector.String(),
 	})
 	if err != nil {
 		klog.Info("Cannot get pod:", err.Error())
+		return err
 	}
 	autopilotPod := pods.Items[0]
 	ttlsec := int32(30) // setting TTL to 30 sec
@@ -178,8 +180,10 @@ func CreateJob(healthcheck string) {
 		metav1.CreateOptions{})
 	if err != nil {
 		klog.Info("Couldn't create Job ", err.Error())
+		return err
 	}
 	klog.Info("Created")
+	return nil
 }
 
 func CreatePVC() error {