From f5163e93f2dabbb2a2ebe32f644388e61456a769 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Mon, 4 Nov 2024 19:04:11 +0000 Subject: [PATCH] Add offline support for LMEvalJobs (#351) --- api/lmes/v1alpha1/lmevaljob_types.go | 17 + api/lmes/v1alpha1/zz_generated.deepcopy.go | 36 ++ .../trustyai.opendatahub.io_lmevaljobs.yaml | 16 + controllers/lmes/constants.go | 1 + controllers/lmes/lmevaljob_controller.go | 35 ++ controllers/lmes/lmevaljob_controller_test.go | 312 ++++++++++++++++++ 6 files changed, 417 insertions(+) diff --git a/api/lmes/v1alpha1/lmevaljob_types.go b/api/lmes/v1alpha1/lmevaljob_types.go index a7138a0c..e54a4937 100644 --- a/api/lmes/v1alpha1/lmevaljob_types.go +++ b/api/lmes/v1alpha1/lmevaljob_types.go @@ -218,6 +218,16 @@ func (p *LMEvalPodSpec) GetSideCards() []corev1.Container { return p.SideCars } +// OfflineStorageSpec defines the storage configuration for LMEvalJob's offline mode +type OfflineStorageSpec struct { + PersistentVolumeClaimName string `json:"pvcName"` +} + +// OfflineSpec defines the configuration for LMEvalJob's offline mode +type OfflineSpec struct { + StorageSpec OfflineStorageSpec `json:"storage"` +} + // LMEvalJobSpec defines the desired state of LMEvalJob type LMEvalJobSpec struct { // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster @@ -257,6 +267,13 @@ type LMEvalJobSpec struct { // Outputs specifies storage for evaluation results // +optional Outputs *Outputs `json:"outputs,omitempty"` + // Offline specifies settings for running LMEvalJobs in a offline mode + Offline *OfflineSpec `json:"offline,omitempty"` +} + +// IsOffline returns whether this LMEvalJob is configured to run offline +func (s *LMEvalJobSpec) IsOffline() bool { + return s.Offline != nil } // HasCustomOutput returns whether an LMEvalJobSpec defines custom outputs or not diff --git a/api/lmes/v1alpha1/zz_generated.deepcopy.go b/api/lmes/v1alpha1/zz_generated.deepcopy.go index ffc83f10..92f43ee8 100644 --- 
a/api/lmes/v1alpha1/zz_generated.deepcopy.go +++ b/api/lmes/v1alpha1/zz_generated.deepcopy.go @@ -187,6 +187,11 @@ func (in *LMEvalJobSpec) DeepCopyInto(out *LMEvalJobSpec) { *out = new(Outputs) (*in).DeepCopyInto(*out) } + if in.Offline != nil { + in, out := &in.Offline, &out.Offline + *out = new(OfflineSpec) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LMEvalJobSpec. @@ -256,6 +261,37 @@ func (in *LMEvalPodSpec) DeepCopy() *LMEvalPodSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *OfflineSpec) DeepCopyInto(out *OfflineSpec) { + *out = *in + out.StorageSpec = in.StorageSpec +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OfflineSpec. +func (in *OfflineSpec) DeepCopy() *OfflineSpec { + if in == nil { + return nil + } + out := new(OfflineSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *OfflineStorageSpec) DeepCopyInto(out *OfflineStorageSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OfflineStorageSpec. +func (in *OfflineStorageSpec) DeepCopy() *OfflineStorageSpec { + if in == nil { + return nil + } + out := new(OfflineStorageSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *Outputs) DeepCopyInto(out *Outputs) { *out = *in diff --git a/config/crd/bases/trustyai.opendatahub.io_lmevaljobs.yaml b/config/crd/bases/trustyai.opendatahub.io_lmevaljobs.yaml index 8d232b84..4081b100 100644 --- a/config/crd/bases/trustyai.opendatahub.io_lmevaljobs.yaml +++ b/config/crd/bases/trustyai.opendatahub.io_lmevaljobs.yaml @@ -89,6 +89,22 @@ spec: numFewShot: description: Sets the number of few-shot examples to place in context type: integer + offline: + description: Offline specifies settings for running LMEvalJobs in + a offline mode + properties: + storage: + description: OfflineStorageSpec defines the storage configuration + for LMEvalJob's offline mode + properties: + pvcName: + type: string + required: + - pvcName + type: object + required: + - storage + type: object outputs: description: Outputs specifies storage for evaluation results properties: diff --git a/controllers/lmes/constants.go b/controllers/lmes/constants.go index 252523cf..598a7287 100644 --- a/controllers/lmes/constants.go +++ b/controllers/lmes/constants.go @@ -26,6 +26,7 @@ const ( DriverPath = "/bin/driver" DestDriverPath = "/opt/app-root/src/bin/driver" OutputPath = "/opt/app-root/src/output" + HuggingFaceHomePath = "/opt/app-root/src/hf_home" PodImageKey = "lmes-pod-image" DriverImageKey = "lmes-driver-image" PodCheckingIntervalKey = "lmes-pod-checking-interval" diff --git a/controllers/lmes/lmevaljob_controller.go b/controllers/lmes/lmevaljob_controller.go index e610c750..76bce36e 100644 --- a/controllers/lmes/lmevaljob_controller.go +++ b/controllers/lmes/lmevaljob_controller.go @@ -692,6 +692,41 @@ func createPod(svcOpts *serviceOptions, job *lmesv1alpha1.LMEvalJob, log logr.Lo volumes = append(volumes, outputPVC) } + // If the job is supposed to run offline, set the appropriate HuggingFace offline flags + if job.Spec.IsOffline() { + + offlineHuggingFaceEnvVars := []corev1.EnvVar{ + { + Name: "HF_DATASETS_OFFLINE", + Value: "1", + }, + { + Name: "HF_HUB_OFFLINE", + 
Value: "1", + }, + } + envVars = append(envVars, offlineHuggingFaceEnvVars...) + + // If the job is offline, a storage must be set. PVC is the only supported storage backend at the moment. + offlinePVCMount := corev1.VolumeMount{ + Name: "offline", + MountPath: HuggingFaceHomePath, + } + volumeMounts = append(volumeMounts, offlinePVCMount) + + offlinePVC := corev1.Volume{ + Name: "offline", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: job.Spec.Offline.StorageSpec.PersistentVolumeClaimName, + ReadOnly: false, + }, + }, + } + volumes = append(volumes, offlinePVC) + + } + volumes = append(volumes, job.Spec.Pod.GetVolumes()...) volumeMounts = append(volumeMounts, job.Spec.Pod.GetContainer().GetVolumMounts()...) labels := getPodLabels(job.Labels, log) diff --git a/controllers/lmes/lmevaljob_controller_test.go b/controllers/lmes/lmevaljob_controller_test.go index 33ab08dd..d6a3d07f 100644 --- a/controllers/lmes/lmevaljob_controller_test.go +++ b/controllers/lmes/lmevaljob_controller_test.go @@ -1392,3 +1392,315 @@ func Test_ValidateBatchSize(t *testing.T) { } } } + +// Test_OfflineMode tests that if the offline mode is set the configuration is correct +func Test_OfflineMode(t *testing.T) { + log := log.FromContext(context.Background()) + svcOpts := &serviceOptions{ + PodImage: "podimage:latest", + DriverImage: "driver:latest", + ImagePullPolicy: corev1.PullAlways, + } + + jobName := "test" + pvcName := "my-pvc" + var job = &lmesv1alpha1.LMEvalJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: jobName, + Namespace: "default", + UID: "for-testing", + }, + TypeMeta: metav1.TypeMeta{ + Kind: lmesv1alpha1.KindName, + APIVersion: lmesv1alpha1.Version, + }, + Spec: lmesv1alpha1.LMEvalJobSpec{ + Model: "test", + ModelArgs: []lmesv1alpha1.Arg{ + {Name: "arg1", Value: "value1"}, + }, + TaskList: lmesv1alpha1.TaskList{ + TaskNames: []string{"task1", "task2"}, + }, + Offline: &lmesv1alpha1.OfflineSpec{ + 
StorageSpec: lmesv1alpha1.OfflineStorageSpec{ + PersistentVolumeClaimName: pvcName, + }, + }, + }, + } + + expect := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "default", + Labels: map[string]string{ + "app.kubernetes.io/name": "ta-lmes", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: lmesv1alpha1.Version, + Kind: lmesv1alpha1.KindName, + Name: "test", + Controller: &isController, + UID: "for-testing", + }, + }, + }, + TypeMeta: metav1.TypeMeta{ + Kind: "Pod", + APIVersion: "v1", + }, + Spec: corev1.PodSpec{ + InitContainers: []corev1.Container{ + { + Name: "driver", + Image: svcOpts.DriverImage, + ImagePullPolicy: svcOpts.ImagePullPolicy, + Command: []string{DriverPath, "--copy", DestDriverPath}, + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: &allowPrivilegeEscalation, + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{ + "ALL", + }, + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "shared", + MountPath: "/opt/app-root/src/bin", + }, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: "main", + Image: svcOpts.PodImage, + ImagePullPolicy: svcOpts.ImagePullPolicy, + Command: generateCmd(svcOpts, job), + Args: generateArgs(svcOpts, job, log), + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: &allowPrivilegeEscalation, + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{ + "ALL", + }, + }, + }, + Env: []corev1.EnvVar{ + { + Name: "HF_DATASETS_OFFLINE", + Value: "1", + }, + { + Name: "HF_HUB_OFFLINE", + Value: "1", + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "shared", + MountPath: "/opt/app-root/src/bin", + }, + { + Name: "offline", + MountPath: "/opt/app-root/src/hf_home", + }, + }, + }, + }, + SecurityContext: &corev1.PodSecurityContext{ + RunAsNonRoot: &runAsNonRootUser, + SeccompProfile: &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + }, + }, + Volumes: []corev1.Volume{ + { + 
Name: "shared", VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + { + Name: "offline", VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: pvcName, + ReadOnly: false, + }, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + }, + } + + newPod := createPod(svcOpts, job, log) + + assert.Equal(t, expect, newPod) +} + +// Test_OfflineModeWithOutput tests that if the offline mode is set the configuration is correct, even when custom output is set +func Test_OfflineModeWithOutput(t *testing.T) { + log := log.FromContext(context.Background()) + svcOpts := &serviceOptions{ + PodImage: "podimage:latest", + DriverImage: "driver:latest", + ImagePullPolicy: corev1.PullAlways, + } + + jobName := "test" + offlinePvcName := "offline-pvc" + outputPvcName := "output-pvc" + var job = &lmesv1alpha1.LMEvalJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: jobName, + Namespace: "default", + UID: "for-testing", + }, + TypeMeta: metav1.TypeMeta{ + Kind: lmesv1alpha1.KindName, + APIVersion: lmesv1alpha1.Version, + }, + Spec: lmesv1alpha1.LMEvalJobSpec{ + Model: "test", + ModelArgs: []lmesv1alpha1.Arg{ + {Name: "arg1", Value: "value1"}, + }, + TaskList: lmesv1alpha1.TaskList{ + TaskNames: []string{"task1", "task2"}, + }, + Offline: &lmesv1alpha1.OfflineSpec{ + StorageSpec: lmesv1alpha1.OfflineStorageSpec{ + PersistentVolumeClaimName: offlinePvcName, + }, + }, + Outputs: &lmesv1alpha1.Outputs{ + PersistentVolumeClaimName: &outputPvcName, + }, + }, + } + + expect := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "default", + Labels: map[string]string{ + "app.kubernetes.io/name": "ta-lmes", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: lmesv1alpha1.Version, + Kind: lmesv1alpha1.KindName, + Name: "test", + Controller: &isController, + UID: "for-testing", + }, + }, + }, + TypeMeta: metav1.TypeMeta{ + Kind: "Pod", + APIVersion: "v1", + }, + 
Spec: corev1.PodSpec{ + InitContainers: []corev1.Container{ + { + Name: "driver", + Image: svcOpts.DriverImage, + ImagePullPolicy: svcOpts.ImagePullPolicy, + Command: []string{DriverPath, "--copy", DestDriverPath}, + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: &allowPrivilegeEscalation, + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{ + "ALL", + }, + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "shared", + MountPath: "/opt/app-root/src/bin", + }, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: "main", + Image: svcOpts.PodImage, + ImagePullPolicy: svcOpts.ImagePullPolicy, + Command: generateCmd(svcOpts, job), + Args: generateArgs(svcOpts, job, log), + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: &allowPrivilegeEscalation, + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{ + "ALL", + }, + }, + }, + Env: []corev1.EnvVar{ + { + Name: "HF_DATASETS_OFFLINE", + Value: "1", + }, + { + Name: "HF_HUB_OFFLINE", + Value: "1", + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "shared", + MountPath: "/opt/app-root/src/bin", + }, + { + Name: "outputs", + MountPath: "/opt/app-root/src/output", + }, + { + Name: "offline", + MountPath: "/opt/app-root/src/hf_home", + }, + }, + }, + }, + SecurityContext: &corev1.PodSecurityContext{ + RunAsNonRoot: &runAsNonRootUser, + SeccompProfile: &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "shared", VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + { + Name: "outputs", VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: outputPvcName, + ReadOnly: false, + }, + }, + }, + { + Name: "offline", VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: offlinePvcName, + ReadOnly: false, + }, + }, 
+ }, + }, + RestartPolicy: corev1.RestartPolicyNever, + }, + } + + newPod := createPod(svcOpts, job, log) + + assert.Equal(t, expect, newPod) +}