From 21fdf3ffebbf9a4801dc1f26281b650a3dc875e8 Mon Sep 17 00:00:00 2001
From: Aaron Liang
Date: Wed, 16 Oct 2024 17:01:13 -0700
Subject: [PATCH] Add 'upgradeStrategy' field to RayServiceSpec, allowing users to enable or disable zero-downtime upgrade.

---
 docs/reference/api.md | 14 ++++++++++++++
 .../crds/ray.io_rayservices.yaml | 2 ++
 ray-operator/apis/ray/v1/rayservice_types.go | 9 +++++++++
 .../config/crd/bases/ray.io_rayservices.yaml | 2 ++
 .../controllers/ray/rayservice_controller.go | 9 +++++++--
 .../ray/rayservice_controller_unit_test.go | 16 ++++++++++++++++
 .../applyconfiguration/ray/v1/rayservicespec.go | 10 ++++++++++
 7 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/docs/reference/api.md b/docs/reference/api.md
index 9a0f75e18e..2ded41ba5a 100644
--- a/docs/reference/api.md
+++ b/docs/reference/api.md
@@ -207,6 +207,7 @@ _Appears in:_
 | `serviceUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
 | `deploymentUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
 | `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | |
+| `upgradeStrategy` _[UpgradeStrategy](#upgradestrategy)_ | UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `BlueGreenUpgrade` and `NoZeroDowntimeUpgrade` | | |
 | `serveConfigV2` _string_ | Important: Run "make" to regenerate code after modifying this file
 Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. | | |
 | `rayClusterConfig` _[RayClusterSpec](#rayclusterspec)_ | | | |
@@ -245,6 +246,19 @@ _Appears in:_
 | `backoffLimit` _integer_ | BackoffLimit of the submitter k8s job. | | |
 
 
+#### UpgradeStrategy
+
+_Underlying type:_ _string_
+
+
+
+
+
+_Appears in:_
+- [RayServiceSpec](#rayservicespec)
+
+
+
 #### UpscalingMode
 
 _Underlying type:_ _string_
diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
index 4a4545280a..cd718750e6 100644
--- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
+++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
@@ -7930,6 +7930,8 @@ spec:
               serviceUnhealthySecondThreshold:
                 format: int32
                 type: integer
+              upgradeStrategy:
+                type: string
             type: object
           status:
             properties:
diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go
index ca8e2a3a0e..0c174f8347 100644
--- a/ray-operator/apis/ray/v1/rayservice_types.go
+++ b/ray-operator/apis/ray/v1/rayservice_types.go
@@ -20,6 +20,13 @@ const (
 	FailedToUpdateService ServiceStatus = "FailedToUpdateService"
 )
 
+type UpgradeStrategy string
+
+const (
+	BlueGreenUpgrade      UpgradeStrategy = "BlueGreenUpgrade"
+	NoZeroDowntimeUpgrade UpgradeStrategy = "NoZeroDowntimeUpgrade"
+)
+
 // These statuses should match Ray Serve's application statuses
 // See `enum ApplicationStatus` in https://sourcegraph.com/github.com/ray-project/ray/-/blob/src/ray/protobuf/serve.proto for more details.
 var ApplicationStatusEnum = struct {
@@ -57,6 +64,8 @@ type RayServiceSpec struct {
 	DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"`
 	// ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics.
 	ServeService *corev1.Service `json:"serveService,omitempty"`
+	// UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `BlueGreenUpgrade` and `NoZeroDowntimeUpgrade`
+	UpgradeStrategy UpgradeStrategy `json:"upgradeStrategy,omitempty"`
 	// Important: Run "make" to regenerate code after modifying this file
 	// Defines the applications and deployments to deploy, should be a YAML multi-line scalar string.
 	ServeConfigV2 string `json:"serveConfigV2,omitempty"`
diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml
index 4a4545280a..cd718750e6 100644
--- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml
+++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml
@@ -7930,6 +7930,8 @@ spec:
               serviceUnhealthySecondThreshold:
                 format: int32
                 type: integer
+              upgradeStrategy:
+                type: string
             type: object
           status:
             properties:
diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go
index 032d005b1b..229023cdcd 100644
--- a/ray-operator/controllers/ray/rayservice_controller.go
+++ b/ray-operator/controllers/ray/rayservice_controller.go
@@ -424,10 +424,15 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi
 	if clusterAction == RolloutNew {
 		// For LLM serving, some users might not have sufficient GPU resources to run two RayClusters simultaneously.
 		// Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades.
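+		// Note: when set, the new Spec.UpgradeStrategy field takes precedence over the
+		// ENABLE_ZERO_DOWNTIME environment variable; when neither is set, zero-downtime
+		// upgrade remains enabled by default.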
+		zeroDowntimeEnvVar := os.Getenv(ENABLE_ZERO_DOWNTIME)
+		rayServiceSpecUpgradeStrategy := rayServiceInstance.Spec.UpgradeStrategy
 		enableZeroDowntime := true
-		if s := os.Getenv(ENABLE_ZERO_DOWNTIME); strings.ToLower(s) == "false" {
-			enableZeroDowntime = false
+		if rayServiceSpecUpgradeStrategy != "" {
+			enableZeroDowntime = rayServiceSpecUpgradeStrategy == rayv1.BlueGreenUpgrade
+		} else if zeroDowntimeEnvVar != "" {
+			enableZeroDowntime = strings.ToLower(zeroDowntimeEnvVar) == "true"
 		}
+
 		if enableZeroDowntime || !enableZeroDowntime && activeRayCluster == nil {
 			// Add a pending cluster name. In the next reconcile loop, shouldPrepareNewRayCluster will return DoNothing and we will
 			// actually create the pending RayCluster instance.
diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go
index 7c3691fd2d..a002c2056e 100644
--- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go
+++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go
@@ -768,6 +768,7 @@ func TestReconcileRayCluster(t *testing.T) {
 		enableZeroDowntime      bool
 		shouldPrepareNewCluster bool
 		updateKubeRayVersion    bool
+		zeroDowntimeSpecTrigger bool
 	}{
 		// Test 1: Neither active nor pending clusters exist. The `markRestart` function will be called, so the `PendingServiceStatus.RayClusterName` should be set.
 		"Zero-downtime upgrade is enabled. Neither active nor pending clusters exist.": {
@@ -815,12 +816,27 @@ func TestReconcileRayCluster(t *testing.T) {
 			updateKubeRayVersion: true,
 			kubeRayVersion: "new-version",
 		},
+		// Test 7: Zero-downtime upgrade is enabled through the RayServiceSpec rather than the ENABLE_ZERO_DOWNTIME environment variable.
+		"Zero-downtime upgrade is enabled. The active cluster exists. Zero-downtime upgrade is triggered through RayServiceSpec.": {
+			activeCluster:           activeCluster.DeepCopy(),
+			updateRayClusterSpec:    true,
+			enableZeroDowntime:      true,
+			shouldPrepareNewCluster: true,
+			zeroDowntimeSpecTrigger: true,
+		},
 	}

 	for name, tc := range tests {
 		t.Run(name, func(t *testing.T) {
 			// Enable or disable zero-downtime upgrade.
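+			// When zeroDowntimeSpecTrigger is set, the upgrade is enabled through
+			// Spec.UpgradeStrategy instead of the ENABLE_ZERO_DOWNTIME environment variable.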
 			defer os.Unsetenv(ENABLE_ZERO_DOWNTIME)
+			if tc.enableZeroDowntime {
+				if tc.zeroDowntimeSpecTrigger {
+					rayService.Spec.UpgradeStrategy = rayv1.BlueGreenUpgrade
+				} else {
+					os.Setenv(ENABLE_ZERO_DOWNTIME, "true")
+				}
+			}
 			if !tc.enableZeroDowntime {
 				os.Setenv(ENABLE_ZERO_DOWNTIME, "false")
 			}
diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go
index e128e604e2..b6f60d7698 100644
--- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go
+++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go
@@ -3,6 +3,7 @@
 package v1

 import (
+	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
 	v1 "k8s.io/api/core/v1"
 )

@@ -12,6 +13,7 @@ type RayServiceSpecApplyConfiguration struct {
 	ServiceUnhealthySecondThreshold *int32 `json:"serviceUnhealthySecondThreshold,omitempty"`
 	DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"`
 	ServeService *v1.Service `json:"serveService,omitempty"`
+	UpgradeStrategy *rayv1.UpgradeStrategy `json:"upgradeStrategy,omitempty"`
 	ServeConfigV2 *string `json:"serveConfigV2,omitempty"`
 	RayClusterSpec *RayClusterSpecApplyConfiguration `json:"rayClusterConfig,omitempty"`
 }
@@ -46,6 +48,14 @@ func (b *RayServiceSpecApplyConfiguration) WithServeService(value v1.Service) *R
 	return b
 }

+// WithUpgradeStrategy sets the UpgradeStrategy field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the UpgradeStrategy field is set to the value of the last call.
+func (b *RayServiceSpecApplyConfiguration) WithUpgradeStrategy(value rayv1.UpgradeStrategy) *RayServiceSpecApplyConfiguration {
+	b.UpgradeStrategy = &value
+	return b
+}
+
 // WithServeConfigV2 sets the ServeConfigV2 field in the declarative configuration to the given value
 // and returns the receiver, so that objects can be built by chaining "With" function invocations.
 // If called multiple times, the ServeConfigV2 field is set to the value of the last call.
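
The precedence introduced in reconcileRayCluster can also be read in isolation. Below is a minimal, self-contained Go sketch (not part of the patch; the zeroDowntimeEnabled helper and the example values are illustrative only) that mirrors the same rules: Spec.UpgradeStrategy wins when set, the ENABLE_ZERO_DOWNTIME environment variable is the fallback, and the default keeps zero-downtime upgrade enabled.

package main

import (
	"fmt"
	"strings"
)

// UpgradeStrategy mirrors the new field type added to RayServiceSpec.
type UpgradeStrategy string

const (
	BlueGreenUpgrade      UpgradeStrategy = "BlueGreenUpgrade"
	NoZeroDowntimeUpgrade UpgradeStrategy = "NoZeroDowntimeUpgrade"
)

// zeroDowntimeEnabled reproduces the controller's decision in isolation:
// the spec field takes precedence when set, the ENABLE_ZERO_DOWNTIME
// environment variable is the fallback, and the default is enabled.
func zeroDowntimeEnabled(strategy UpgradeStrategy, envValue string) bool {
	if strategy != "" {
		return strategy == BlueGreenUpgrade
	}
	if envValue != "" {
		return strings.ToLower(envValue) == "true"
	}
	return true
}

func main() {
	fmt.Println(zeroDowntimeEnabled(NoZeroDowntimeUpgrade, "true")) // false: spec overrides the env var
	fmt.Println(zeroDowntimeEnabled("", "false"))                   // false: env var honored when spec is empty
	fmt.Println(zeroDowntimeEnabled("", ""))                        // true: zero-downtime upgrade enabled by default
}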
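For completeness, the generated WithUpgradeStrategy setter composes with the other apply-configuration setters shown above. A rough usage sketch, assuming the import aliases below; the zero-value literal stands in for the RayServiceSpec() constructor that applyconfiguration-gen normally emits but that is not shown in this patch:

package main

import (
	"fmt"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
)

func main() {
	// Build a RayServiceSpec apply configuration that opts out of zero-downtime
	// upgrades. The serve config value is a placeholder.
	spec := (&rayv1ac.RayServiceSpecApplyConfiguration{}).
		WithServeConfigV2("applications: []").
		WithUpgradeStrategy(rayv1.NoZeroDowntimeUpgrade)

	fmt.Println(*spec.UpgradeStrategy) // NoZeroDowntimeUpgrade
}

Because the apply configuration stores a *rayv1.UpgradeStrategy, leaving the field unset stays distinguishable from choosing a strategy explicitly, which matches the controller treating an empty spec value as "fall back to the environment variable".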