Skip to content

Commit

Permalink
Adds TESTING node label during invasive checks
Browse files Browse the repository at this point in the history
Signed-off-by: Jim Cadden <[email protected]>
  • Loading branch information
jimcadden committed Aug 30, 2024
1 parent b25afca commit 68bc14a
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
28 changes: 26 additions & 2 deletions autopilot-daemon/pkg/handlers/healthchecks.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,35 @@ func PeriodicCheckTimer() {
}

// InvasiveCheckTimer attempts to launch the invasive "dcgm" health check for
// the node named by the NODE_NAME environment variable.
//
// It holds utils.HealthcheckLock for the whole call. Only when
// utils.GPUsAvailability() returns true does it patch the node with the label
// autopilot.ibm.com/gpuhealth=TESTING and then call utils.CreateJob("dcgm").
// If CreateJob returns an error, the same label is patched to the empty string.
//
// NOTE(review): this text is a rendered commit diff with the +/- markers
// stripped, so removed and added lines are interleaved. The "intrusive" log
// line and the bare utils.CreateJob("dcgm") call appear to be the two removed
// lines — confirm against the repository before treating this as compilable.
func InvasiveCheckTimer() {
// Presumably the pre-commit wording of the log line below (see note above).
klog.Info("Trying to run an intrusive check")
klog.Info("Trying to run an invasive check")
// Take the health-check lock and release it when this function returns.
utils.HealthcheckLock.Lock()
defer utils.HealthcheckLock.Unlock()
// Nothing is done (no label, no job) unless GPUsAvailability reports true.
if utils.GPUsAvailability() {
// Presumably the pre-commit call whose result was discarded (see note above).
utils.CreateJob("dcgm")
klog.Info("Begining invasive health checks, updating node label =TESTING for node ", os.Getenv("NODE_NAME"))
// JSON patch body that sets the gpuhealth label to TESTING.
label := `
{
"metadata": {
"labels": {
"autopilot.ibm.com/gpuhealth": "TESTING"
}
}
}
`
// Apply the TESTING label; whatever PatchNode returns is not inspected here.
utils.PatchNode(label, os.Getenv("NODE_NAME"))
err := utils.CreateJob("dcgm")
if err != nil {
// NOTE(review): this branch runs when CreateJob returns an error, yet the
// message says the checks failed and err itself is never logged.
klog.Info("Invasive health checks failed, updating node label for node ", os.Getenv("NODE_NAME"))
// This label shadows the outer one; it sets the gpuhealth label to "".
label := `
{
"metadata": {
"labels": {
"autopilot.ibm.com/gpuhealth": ""
}
}
}
`
// Overwrite the TESTING value; the PatchNode result is again not inspected.
utils.PatchNode(label, os.Getenv("NODE_NAME"))
}
}
}

Expand Down
6 changes: 5 additions & 1 deletion autopilot-daemon/pkg/utils/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func GPUsAvailability() bool {
return true
}

func CreateJob(healthcheck string) {
func CreateJob(healthcheck string) error {
var args []string
var cmd []string
switch healthcheck {
Expand All @@ -114,12 +114,14 @@ func CreateJob(healthcheck string) {
fieldselector, err := fields.ParseSelector("metadata.name=" + os.Getenv("POD_NAME"))
if err != nil {
klog.Info("Error in creating the field selector", err.Error())
return err
}
pods, err := cset.Cset.CoreV1().Pods("autopilot").List(context.TODO(), metav1.ListOptions{
FieldSelector: fieldselector.String(),
})
if err != nil {
klog.Info("Cannot get pod:", err.Error())
return err
}
autopilotPod := pods.Items[0]
ttlsec := int32(30) // setting TTL to 30 sec
Expand Down Expand Up @@ -178,8 +180,10 @@ func CreateJob(healthcheck string) {
metav1.CreateOptions{})
if err != nil {
klog.Info("Couldn't create Job ", err.Error())
return err
}
klog.Info("Created")
return nil
}

func CreatePVC() error {
Expand Down

0 comments on commit 68bc14a

Please sign in to comment.