Skip to content

Commit

Permalink
CPU and GPU schedule type jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
jwierzbo committed Dec 15, 2023
1 parent f64afee commit fac86f9
Show file tree
Hide file tree
Showing 9 changed files with 2,297 additions and 987 deletions.
350 changes: 322 additions & 28 deletions docs/api/ArangoMLExtension.V1Alpha1.md

Large diffs are not rendered by default.

58 changes: 0 additions & 58 deletions pkg/apis/ml/v1alpha1/batchjob.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,9 @@
package v1alpha1

import (
"strings"

meta "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/arangodb/kube-arangodb/pkg/apis/ml"
mlShared "github.com/arangodb/kube-arangodb/pkg/handlers/enterprise/ml/shared"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
)

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down Expand Up @@ -71,57 +67,3 @@ func (a *ArangoMLBatchJob) GetStatus() ArangoMLBatchJobStatus {
func (a *ArangoMLBatchJob) SetStatus(status ArangoMLBatchJobStatus) {
a.Status = status
}

// GetJobType returns the lower-cased value stored under the
// mlShared.MLJobTypeLabel label, or an empty string when the job
// does not carry that label.
func (a *ArangoMLBatchJob) GetJobType() string {
	if jobType, found := a.Labels[mlShared.MLJobTypeLabel]; found {
		return strings.ToLower(jobType)
	}
	return ""
}

// GetScheduleType returns the lower-cased value stored under the
// mlShared.MLJobScheduleLabel label. A missing label yields an empty
// string, because a lookup of an absent map key returns "" and
// lower-casing "" is still "".
func (a *ArangoMLBatchJob) GetScheduleType() string {
	return strings.ToLower(a.Labels[mlShared.MLJobScheduleLabel])
}

// GetMLDeploymentName returns the value stored under the
// mlShared.MLDeploymentLabel label exactly as written (no case
// normalization), or an empty string when the label is not set.
func (a *ArangoMLBatchJob) GetMLDeploymentName() string {
	// The deployment name lives under the deployment label. Reading the
	// schedule label here (as the code previously did) would return the
	// schedule type instead of the deployment name.
	val, ok := a.Labels[mlShared.MLDeploymentLabel]
	if !ok {
		return ""
	}
	return val
}

// ValidateLabels checks that the job carries the three labels required to
// schedule it and that each has an acceptable value:
//
//   - mlShared.MLDeploymentLabel must be present and non-empty;
//   - mlShared.MLJobTypeLabel must be present and, compared
//     case-insensitively, equal to the training or prediction type;
//   - mlShared.MLJobScheduleLabel must be present and, compared
//     case-insensitively, equal to the CPU or GPU schedule type.
//
// It returns an error describing the first failed check, or nil when all
// labels are valid.
func (a *ArangoMLBatchJob) ValidateLabels() error {
	depl, ok := a.Labels[mlShared.MLDeploymentLabel]
	if !ok {
		return errors.Newf("Job missing label: %s", mlShared.MLDeploymentLabel)
	}
	if depl == "" {
		return errors.Newf("Job empty value for label: %s", mlShared.MLDeploymentLabel)
	}

	t, ok := a.Labels[mlShared.MLJobTypeLabel]
	if !ok {
		return errors.Newf("Job missing label: %s", mlShared.MLJobTypeLabel)
	}
	jobType := strings.ToLower(t)
	if jobType != mlShared.MLJobTrainingType && jobType != mlShared.MLJobPredictionType {
		// Report the value as the user wrote it, not the lower-cased form.
		return errors.Newf("Job label (%s) has unexpected value: %s", mlShared.MLJobTypeLabel, t)
	}

	s, ok := a.Labels[mlShared.MLJobScheduleLabel]
	if !ok {
		// Name the schedule label here: this branch fires when the
		// schedule label is absent, not the job type label.
		return errors.Newf("Job missing label: %s", mlShared.MLJobScheduleLabel)
	}
	scheduleType := strings.ToLower(s)
	if scheduleType != mlShared.MLJobScheduleCPU && scheduleType != mlShared.MLJobScheduleGPU {
		return errors.Newf("Job label (%s) has unexpected value: %s", mlShared.MLJobScheduleLabel, s)
	}

	return nil
}
2 changes: 1 addition & 1 deletion pkg/apis/ml/v1alpha1/batchjob_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func (a *ArangoMLBatchJobSpec) Validate() error {

var err []error
if a.JobSpec == nil {
err = append(err, shared.PrefixResourceErrors("spec", errors.Newf("JobSpec is not defined")))
return shared.PrefixResourceErrors("spec", errors.Newf("JobSpec is not defined"))
}

if len(a.JobSpec.Template.Spec.Containers) != 1 {
Expand Down
2 changes: 1 addition & 1 deletion pkg/apis/ml/v1alpha1/cronjob_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func (a *ArangoMLCronJobSpec) Validate() error {

var err []error
if a.CronJobSpec == nil {
err = append(err, shared.PrefixResourceErrors("spec", errors.Newf("CronJobSpec is not defined")))
return shared.PrefixResourceErrors("spec", errors.Newf("CronJobSpec is not defined"))
}

if len(a.CronJobSpec.JobTemplate.Spec.Template.Spec.Containers) != 1 {
Expand Down
1 change: 1 addition & 0 deletions pkg/apis/ml/v1alpha1/extension_conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ package v1alpha1
import api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"

const (
MLExtensionFoundCondition api.ConditionType = "MLExtensionFound"
ExtensionStorageFoundCondition api.ConditionType = "StorageFound"
ExtensionDeploymentFoundCondition api.ConditionType = "DeploymentFound"
ExtensionBootstrapCompletedCondition api.ConditionType = "BootstrapCompleted"
Expand Down
69 changes: 59 additions & 10 deletions pkg/apis/ml/v1alpha1/extension_spec_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,78 @@ import (
sharedApi "github.com/arangodb/kube-arangodb/pkg/apis/shared/v1"
)

// JobType names the kind of ML job a template applies to.
type JobType string

// Known JobType values.
const (
// MLJobTrainingType identifies a training job.
MLJobTrainingType JobType = "training"
// MLJobPredictionType identifies a prediction job.
MLJobPredictionType JobType = "prediction"
)

// ArangoMLJobsTemplates holds the job templates for each job type:
// one entry for prediction jobs and one for training jobs.
type ArangoMLJobsTemplates struct {
// Prediction defines template for the prediction job
Prediction map[string]*ArangoMLExtensionTemplateSpec `json:"prediction,omitempty"`
Prediction *ArangoMLJobTemplates `json:"prediction,omitempty"`

// Training defines template for the training job
Training map[string]*ArangoMLExtensionTemplateSpec `json:"training,omitempty"`
Training *ArangoMLJobTemplates `json:"training,omitempty"`
}

// GetJobTemplates returns the templates configured for the given job type:
// Training for MLJobTrainingType and Prediction for MLJobPredictionType.
// It returns nil for any other job type, when the matching field is unset,
// or when the receiver itself is nil.
func (a *ArangoMLJobsTemplates) GetJobTemplates(jobType JobType) *ArangoMLJobTemplates {
	// Tolerate a nil receiver, the same way the Validate methods of these
	// types do, so callers can chain lookups on an optional spec section.
	if a == nil {
		return nil
	}

	switch jobType {
	case MLJobTrainingType:
		return a.Training
	case MLJobPredictionType:
		return a.Prediction
	default:
		return nil
	}
}

func (j *ArangoMLJobsTemplates) Validate() error {
if j == nil {
func (a *ArangoMLJobsTemplates) Validate() error {
if a == nil {
return nil
}

var errs []error
for _, template := range j.Prediction {
errs = append(errs, shared.PrefixResourceErrors("prediction", template.Validate()))
return shared.WithErrors(
shared.PrefixResourceErrors("prediction", a.Prediction.Validate()),
shared.PrefixResourceErrors("training", a.Training.Validate()),
)
}

// JobScheduleType names the kind of compute resource a job is scheduled on.
type JobScheduleType string

// Known JobScheduleType values.
const (
// MLJobScheduleCPU identifies jobs scheduled on CPU.
MLJobScheduleCPU JobScheduleType = "cpu"
// MLJobScheduleGPU identifies jobs scheduled on GPU.
MLJobScheduleGPU JobScheduleType = "gpu"
)

// ArangoMLJobTemplates holds the template specs for a single job type,
// one per schedule type (CPU and GPU). Both fields are optional.
type ArangoMLJobTemplates struct {
// CPU defines templates for CPU jobs
CPU *ArangoMLExtensionTemplateSpec `json:"cpu,omitempty"`

// GPU defines templates for GPU jobs
GPU *ArangoMLExtensionTemplateSpec `json:"gpu,omitempty"`
}

// GetJobTemplateSpec returns the template spec configured for the given
// schedule type: CPU for MLJobScheduleCPU and GPU for MLJobScheduleGPU.
// It returns nil for any other schedule type, when the matching field is
// unset, or when the receiver itself is nil.
func (a *ArangoMLJobTemplates) GetJobTemplateSpec(scheduleType JobScheduleType) *ArangoMLExtensionTemplateSpec {
	// A nil receiver is a normal input here: GetJobTemplates returns nil
	// for unset or unknown job types, and callers may chain the two calls.
	if a == nil {
		return nil
	}

	switch scheduleType {
	case MLJobScheduleCPU:
		return a.CPU
	case MLJobScheduleGPU:
		return a.GPU
	default:
		return nil
	}
}

for _, template := range j.Training {
errs = append(errs, shared.PrefixResourceErrors("training", template.Validate()))
// Validate checks the CPU and GPU template specs and combines their
// validation errors, prefixed with "cpu" and "gpu" respectively.
// A nil receiver is treated as valid and yields nil.
func (a *ArangoMLJobTemplates) Validate() error {
if a == nil {
return nil
}

return shared.WithErrors(errs...)
return shared.WithErrors(
shared.PrefixResourceErrors("cpu", a.CPU.Validate()),
shared.PrefixResourceErrors("gpu", a.GPU.Validate()),
)
}

type ArangoMLExtensionTemplateSpec struct {
Expand Down
54 changes: 30 additions & 24 deletions pkg/apis/ml/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit fac86f9

Please sign in to comment.