Skip to content

Commit

Permalink
Adds TESTING node label during invasive checks
Browse files Browse the repository at this point in the history
Signed-off-by: Jim Cadden <[email protected]>
  • Loading branch information
jimcadden committed Aug 30, 2024
1 parent b25afca commit 16a7287
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 9 deletions.
13 changes: 7 additions & 6 deletions autopilot-daemon/pkg/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,17 @@ func main() {

hcMux := http.NewServeMux()

hcMux.Handle("/pciebw", handlers.PCIeBWHandler(utils.UserConfig.BWThreshold))
hcMux.Handle("/remapped", handlers.RemappedRowsHandler())
hcMux.Handle("/status", handlers.SystemStatusHandler())
hcMux.Handle("/dcgm", handlers.DCGMHandler())
hcMux.Handle("/gpumem", handlers.GpuMemHandler())
hcMux.Handle("/gpupower", handlers.GpuPowerHandler())
hcMux.Handle("/invasive", handlers.InvasiveCheckTimer())
hcMux.Handle("/iperf", handlers.IperfHandler())
hcMux.Handle("/iperfservers", handlers.StartIperfServersHandler())
hcMux.Handle("/dcgm", handlers.DCGMHandler())
hcMux.Handle("/pciebw", handlers.PCIeBWHandler(utils.UserConfig.BWThreshold))
hcMux.Handle("/ping", handlers.PingHandler())
hcMux.Handle("/gpupower", handlers.GpuPowerHandler())
hcMux.Handle("/gpumem", handlers.GpuMemHandler())
hcMux.Handle("/pvc", handlers.PVCHandler())
hcMux.Handle("/remapped", handlers.RemappedRowsHandler())
hcMux.Handle("/status", handlers.SystemStatusHandler())

s := &http.Server{
Addr: ":" + *port,
Expand Down
28 changes: 26 additions & 2 deletions autopilot-daemon/pkg/handlers/healthchecks.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,35 @@ func PeriodicCheckTimer() {
}

// InvasiveCheckTimer attempts to run the invasive "dcgm" health check on the
// node named by the NODE_NAME environment variable.
//
// It holds utils.HealthcheckLock for the whole call and does nothing further
// when utils.GPUsAvailability() reports false. Otherwise it labels the node
// with autopilot.ibm.com/gpuhealth=TESTING and then asks utils.CreateJob to
// create the "dcgm" job. If job creation fails, the label value is reset to
// the empty string.
//
// The job is submitted exactly once, and only after the TESTING label patch
// has been issued.
func InvasiveCheckTimer() {
	klog.Info("Trying to run an invasive check")
	utils.HealthcheckLock.Lock()
	defer utils.HealthcheckLock.Unlock()

	if !utils.GPUsAvailability() {
		return
	}

	// Read the node name once so every log line and patch targets the same node.
	nodeName := os.Getenv("NODE_NAME")

	// labelPatch builds the patch document that sets the gpuhealth label.
	// Only the fixed values "TESTING" and "" are passed in, so no JSON
	// escaping of value is required.
	labelPatch := func(value string) string {
		return `
{
	"metadata": {
		"labels": {
			"autopilot.ibm.com/gpuhealth": "` + value + `"
		}
	}
}
`
	}

	klog.Info("Begining invasive health checks, updating node label =TESTING for node ", nodeName)
	utils.PatchNode(labelPatch("TESTING"), nodeName)

	if err := utils.CreateJob("dcgm"); err != nil {
		// The job never started, so clear the TESTING marker set above.
		klog.Info("Invasive health checks failed, updating node label for node ", nodeName)
		utils.PatchNode(labelPatch(""), nodeName)
	}
}

Expand Down
6 changes: 5 additions & 1 deletion autopilot-daemon/pkg/utils/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func GPUsAvailability() bool {
return true
}

func CreateJob(healthcheck string) {
func CreateJob(healthcheck string) error {
var args []string
var cmd []string
switch healthcheck {
Expand All @@ -114,12 +114,14 @@ func CreateJob(healthcheck string) {
fieldselector, err := fields.ParseSelector("metadata.name=" + os.Getenv("POD_NAME"))
if err != nil {
klog.Info("Error in creating the field selector", err.Error())
return err
}
pods, err := cset.Cset.CoreV1().Pods("autopilot").List(context.TODO(), metav1.ListOptions{
FieldSelector: fieldselector.String(),
})
if err != nil {
klog.Info("Cannot get pod:", err.Error())
return err
}
autopilotPod := pods.Items[0]
ttlsec := int32(30) // setting TTL to 30 sec
Expand Down Expand Up @@ -178,8 +180,10 @@ func CreateJob(healthcheck string) {
metav1.CreateOptions{})
if err != nil {
klog.Info("Couldn't create Job ", err.Error())
return err
}
klog.Info("Created")
return nil
}

func CreatePVC() error {
Expand Down

0 comments on commit 16a7287

Please sign in to comment.