diff --git a/docs/reference/api.md b/docs/reference/api.md
index 9a0f75e18e..7f8c668212 100644
--- a/docs/reference/api.md
+++ b/docs/reference/api.md
@@ -207,12 +207,26 @@ _Appears in:_
 | `serviceUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
 | `deploymentUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
 | `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | |
+| `upgradeStrategy` _[RayServiceUpgradeStrategy](#rayserviceupgradestrategy)_ | UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None` | NewCluster | |
 | `serveConfigV2` _string_ | Important: Run "make" to regenerate code after modifying this file Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. | | |
 | `rayClusterConfig` _[RayClusterSpec](#rayclusterspec)_ | | | |

+#### RayServiceUpgradeStrategy
+
+_Underlying type:_ _string_
+
+
+
+
+
+_Appears in:_
+- [RayServiceSpec](#rayservicespec)
+
+
+
 #### ScaleStrategy
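For readers of the API reference change above, a minimal RayService manifest using the new field might look like the sketch below. The resource name, image tag, and Serve application are illustrative placeholders and are not part of this change:

```yaml
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: example-rayservice
spec:
  # New field added by this change. Defaults to NewCluster; set to None to
  # opt out of creating a second RayCluster during upgrades.
  upgradeStrategy: NewCluster
  serveConfigV2: |
    applications:
      - name: example_app
        import_path: example_module:app
  rayClusterConfig:
    headGroupSpec:
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray:2.9.0
```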
diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
index 4a4545280a..6bf06b771b 100644
--- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
+++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
@@ -7930,6 +7930,9 @@ spec:
             serviceUnhealthySecondThreshold:
               format: int32
               type: integer
+            upgradeStrategy:
+              default: NewCluster
+              type: string
           type: object
         status:
           properties:
diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go
index ca8e2a3a0e..e56c751c76 100644
--- a/ray-operator/apis/ray/v1/rayservice_types.go
+++ b/ray-operator/apis/ray/v1/rayservice_types.go
@@ -20,6 +20,15 @@ const (
     FailedToUpdateService ServiceStatus = "FailedToUpdateService"
 )
 
+type RayServiceUpgradeStrategy string
+
+const (
+    // During an upgrade, the NewCluster strategy creates a new upgraded cluster and switches to it when it becomes ready.
+    NewCluster RayServiceUpgradeStrategy = "NewCluster"
+    // Having None as the strategy means downtime while waiting for the new cluster to be ready when performing an upgrade.
+    None RayServiceUpgradeStrategy = "None"
+)
+
 // These statuses should match Ray Serve's application statuses
 // See `enum ApplicationStatus` in https://sourcegraph.com/github.com/ray-project/ray/-/blob/src/ray/protobuf/serve.proto for more details.
 var ApplicationStatusEnum = struct {
@@ -57,6 +66,9 @@ type RayServiceSpec struct {
     DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"`
     // ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics.
     ServeService *corev1.Service `json:"serveService,omitempty"`
+    // UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`
+    // +kubebuilder:default:=NewCluster
+    UpgradeStrategy RayServiceUpgradeStrategy `json:"upgradeStrategy,omitempty"`
     // Important: Run "make" to regenerate code after modifying this file
     // Defines the applications and deployments to deploy, should be a YAML multi-line scalar string.
     ServeConfigV2 string `json:"serveConfigV2,omitempty"`
diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml
index 4a4545280a..6bf06b771b 100644
--- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml
+++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml
@@ -7930,6 +7930,9 @@ spec:
             serviceUnhealthySecondThreshold:
               format: int32
               type: integer
+            upgradeStrategy:
+              default: NewCluster
+              type: string
           type: object
         status:
           properties:
diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go
index 032d005b1b..e001b4ee27 100644
--- a/ray-operator/controllers/ray/rayservice_controller.go
+++ b/ray-operator/controllers/ray/rayservice_controller.go
@@ -248,6 +248,12 @@ func validateRayServiceSpec(rayService *rayv1.RayService) error {
     if headSvc := rayService.Spec.RayClusterSpec.HeadGroupSpec.HeadService; headSvc != nil && headSvc.Name != "" {
         return fmt.Errorf("spec.rayClusterConfig.headGroupSpec.headService.metadata.name should not be set")
     }
+
+    if upgradeStrategy := rayService.Spec.UpgradeStrategy; upgradeStrategy != "" {
+        if upgradeStrategy != rayv1.NewCluster && upgradeStrategy != rayv1.None {
+            return fmt.Errorf("spec.UpgradeStrategy is invalid, valid options are %s and %s", rayv1.NewCluster, rayv1.None)
+        }
+    }
     return nil
 }
@@ -424,10 +430,19 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi
     if clusterAction == RolloutNew {
         // For LLM serving, some users might not have sufficient GPU resources to run two RayClusters simultaneously.
         // Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades.
+        zeroDowntimeEnvVar := os.Getenv(ENABLE_ZERO_DOWNTIME)
+        rayServiceSpecUpgradeStrategy := rayServiceInstance.Spec.UpgradeStrategy
+        // There are two ways to enable zero downtime upgrade: through the ENABLE_ZERO_DOWNTIME env var or by setting Spec.UpgradeStrategy.
+        // If neither is set, zero downtime upgrade is enabled by default.
+        // Spec.UpgradeStrategy takes precedence over ENABLE_ZERO_DOWNTIME.
         enableZeroDowntime := true
-        if s := os.Getenv(ENABLE_ZERO_DOWNTIME); strings.ToLower(s) == "false" {
-            enableZeroDowntime = false
+        if zeroDowntimeEnvVar != "" {
+            enableZeroDowntime = strings.ToLower(zeroDowntimeEnvVar) != "false"
         }
+        if rayServiceSpecUpgradeStrategy != "" {
+            enableZeroDowntime = rayServiceSpecUpgradeStrategy == rayv1.NewCluster
+        }
+
         if enableZeroDowntime || !enableZeroDowntime && activeRayCluster == nil {
             // Add a pending cluster name. In the next reconcile loop, shouldPrepareNewRayCluster will return DoNothing and we will
             // actually create the pending RayCluster instance.
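To illustrate the precedence encoded in the controller change above: `Spec.UpgradeStrategy`, when set, wins over the `ENABLE_ZERO_DOWNTIME` env var. The sketch below assumes the env var literal matches the constant name, and the operator image tag and Serve config are illustrative placeholders. An operator started with the env var set to `"false"` would still perform a zero-downtime upgrade for a RayService that explicitly requests `NewCluster`:

```yaml
# KubeRay operator Deployment (excerpt): env var disables zero-downtime upgrades globally.
containers:
  - name: kuberay-operator
    image: quay.io/kuberay/operator:nightly   # illustrative tag
    env:
      - name: ENABLE_ZERO_DOWNTIME
        value: "false"
---
# This RayService still gets a new upgraded cluster, because the spec field
# takes precedence over the env var.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: example-rayservice
spec:
  upgradeStrategy: NewCluster
  serveConfigV2: |
    applications: []
```

Conversely, setting `upgradeStrategy: None` suppresses the new-cluster rollout even when the env var is unset or `"true"`, which is what the unit tests below exercise.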
diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go
index 7c3691fd2d..f7c7ba9a44 100644
--- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go
+++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go
@@ -44,6 +44,13 @@ func TestValidateRayServiceSpec(t *testing.T) {
         Spec: rayv1.RayServiceSpec{},
     })
     assert.NoError(t, err, "The RayService spec is valid.")
+
+    err = validateRayServiceSpec(&rayv1.RayService{
+        Spec: rayv1.RayServiceSpec{
+            UpgradeStrategy: "invalidStrategy",
+        },
+    })
+    assert.Error(t, err, "spec.UpgradeStrategy is invalid")
 }
 
 func TestGenerateHashWithoutReplicasAndWorkersToDelete(t *testing.T) {
@@ -762,12 +769,13 @@ func TestReconcileRayCluster(t *testing.T) {
     }
 
     tests := map[string]struct {
-        activeCluster           *rayv1.RayCluster
-        kubeRayVersion          string
-        updateRayClusterSpec    bool
-        enableZeroDowntime      bool
-        shouldPrepareNewCluster bool
-        updateKubeRayVersion    bool
+        activeCluster             *rayv1.RayCluster
+        rayServiceUpgradeStrategy rayv1.RayServiceUpgradeStrategy
+        kubeRayVersion            string
+        updateRayClusterSpec      bool
+        enableZeroDowntime        bool
+        shouldPrepareNewCluster   bool
+        updateKubeRayVersion      bool
     }{
         // Test 1: Neither active nor pending clusters exist. The `markRestart` function will be called, so the `PendingServiceStatus.RayClusterName` should be set.
         "Zero-downtime upgrade is enabled. Neither active nor pending clusters exist.": {
@@ -790,8 +798,8 @@
             enableZeroDowntime:      true,
             shouldPrepareNewCluster: true,
         },
-        // Test 4: The active cluster exists. Trigger the zero-downtime upgrade.
-        "Zero-downtime upgrade is disabled. The active cluster exists. Trigger the zero-downtime upgrade.": {
+        // Test 4: The active cluster exists. Zero-downtime upgrade is disabled, so it should not trigger a zero-downtime upgrade.
+        "Zero-downtime upgrade is disabled. The active cluster exists. Does not trigger the zero-downtime upgrade.": {
             activeCluster:        activeCluster.DeepCopy(),
             updateRayClusterSpec: true,
             enableZeroDowntime:   false,
@@ -815,6 +823,37 @@
             updateKubeRayVersion:    true,
             kubeRayVersion:          "new-version",
         },
+        // Test 7: Zero downtime upgrade is enabled, but it is enabled through the RayServiceSpec.
+        "Zero-downtime upgrade is enabled. The active cluster exists. Zero-downtime upgrade is triggered through RayServiceSpec.": {
+            activeCluster:             activeCluster.DeepCopy(),
+            updateRayClusterSpec:      true,
+            enableZeroDowntime:        true,
+            shouldPrepareNewCluster:   true,
+            rayServiceUpgradeStrategy: rayv1.NewCluster,
+        },
+        // Test 8: Zero downtime upgrade is enabled. The env var is set to false but RayServiceSpec is set to NewCluster. Trigger the zero-downtime upgrade.
+        "Zero-downtime upgrade is enabled through RayServiceSpec and not through the env var. The active cluster exists. Trigger the zero-downtime upgrade.": {
+            activeCluster:             activeCluster.DeepCopy(),
+            updateRayClusterSpec:      true,
+            enableZeroDowntime:        false,
+            shouldPrepareNewCluster:   true,
+            rayServiceUpgradeStrategy: rayv1.NewCluster,
+        },
+        // Test 9: Zero downtime upgrade is disabled. The env var is set to true but RayServiceSpec is set to None.
+        "Zero-downtime upgrade is disabled. Env var is set to true but RayServiceSpec is set to None.": {
+            activeCluster:             activeCluster.DeepCopy(),
+            updateRayClusterSpec:      true,
+            enableZeroDowntime:        true,
+            shouldPrepareNewCluster:   false,
+            rayServiceUpgradeStrategy: rayv1.None,
+        },
+        // Test 10: Zero downtime upgrade is enabled. Neither the env var nor the RayServiceSpec is set. Trigger the zero-downtime upgrade.
+        "Zero-downtime upgrade is enabled. Neither the env var nor the RayServiceSpec is set.": {
+            activeCluster:             nil,
+            updateRayClusterSpec:      true,
+            shouldPrepareNewCluster:   true,
+            rayServiceUpgradeStrategy: "",
+        },
     }
 
     for name, tc := range tests {
@@ -824,6 +863,9 @@
         if !tc.enableZeroDowntime {
             os.Setenv(ENABLE_ZERO_DOWNTIME, "false")
         }
+        if tc.rayServiceUpgradeStrategy != "" {
+            rayService.Spec.UpgradeStrategy = tc.rayServiceUpgradeStrategy
+        }
         runtimeObjects := []runtime.Object{}
         if tc.activeCluster != nil {
             // Update 'ray.io/kuberay-version' to a new version if kubeRayVersion is set.
diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go
index e128e604e2..f92c8c543d 100644
--- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go
+++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go
@@ -3,6 +3,7 @@
 package v1
 
 import (
+    rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
     v1 "k8s.io/api/core/v1"
 )
@@ -12,6 +13,7 @@ type RayServiceSpecApplyConfiguration struct {
     ServiceUnhealthySecondThreshold    *int32                            `json:"serviceUnhealthySecondThreshold,omitempty"`
     DeploymentUnhealthySecondThreshold *int32                            `json:"deploymentUnhealthySecondThreshold,omitempty"`
     ServeService                       *v1.Service                       `json:"serveService,omitempty"`
+    UpgradeStrategy                    *rayv1.RayServiceUpgradeStrategy  `json:"upgradeStrategy,omitempty"`
     ServeConfigV2                      *string                           `json:"serveConfigV2,omitempty"`
     RayClusterSpec                     *RayClusterSpecApplyConfiguration `json:"rayClusterConfig,omitempty"`
 }
@@ -46,6 +48,14 @@ func (b *RayServiceSpecApplyConfiguration) WithServeService(value v1.Service) *R
     return b
 }
 
+// WithUpgradeStrategy sets the UpgradeStrategy field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the UpgradeStrategy field is set to the value of the last call.
+func (b *RayServiceSpecApplyConfiguration) WithUpgradeStrategy(value rayv1.RayServiceUpgradeStrategy) *RayServiceSpecApplyConfiguration {
+    b.UpgradeStrategy = &value
+    return b
+}
+
 // WithServeConfigV2 sets the ServeConfigV2 field in the declarative configuration to the given value
 // and returns the receiver, so that objects can be built by chaining "With" function invocations.
 // If called multiple times, the ServeConfigV2 field is set to the value of the last call.
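A short sketch of how the new apply-configuration setter might be used from Go. It assumes the generated `RayServiceSpec()` constructor exists in this package (standard applyconfiguration-gen output); that constructor is not part of this diff, and the Serve config string is a placeholder:

```go
package main

import (
	"fmt"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
)

func main() {
	// Build a RayServiceSpec apply configuration that pins the upgrade strategy,
	// e.g. for use with a server-side apply call on a RayService resource.
	// RayServiceSpec() is the assumed generated constructor for this package.
	spec := rayv1ac.RayServiceSpec().
		WithServeConfigV2("applications: []").
		WithUpgradeStrategy(rayv1.NewCluster)

	fmt.Println(*spec.UpgradeStrategy) // prints: NewCluster
}
```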