From 8110ea89091e839b5a8a988f2877cda973da89b1 Mon Sep 17 00:00:00 2001 From: Yicheng-Lu-llll Date: Mon, 5 Feb 2024 02:37:10 +0000 Subject: [PATCH] add status ActiveServingRayPods Signed-off-by: Yicheng-Lu-llll --- .../crds/ray.io_rayservices.yaml | 3 +++ .../templates/multiple_namespaces_role.yaml | 7 +++++ .../kuberay-operator/templates/role.yaml | 7 +++++ ray-operator/apis/ray/v1/rayservice_types.go | 3 +++ .../config/crd/bases/ray.io_rayservices.yaml | 3 +++ ray-operator/config/rbac/role.yaml | 7 +++++ .../controllers/ray/rayservice_controller.go | 26 ++++++++++++++++++- .../ray/v1/rayservicestatuses.go | 9 +++++++ 8 files changed, 64 insertions(+), 1 deletion(-) diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index 3a548918e12..ebb92d98093 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -7227,6 +7227,9 @@ spec: type: object status: properties: + ActiveServingRayPods: + format: int32 + type: integer activeServiceStatus: properties: applicationStatuses: diff --git a/helm-chart/kuberay-operator/templates/multiple_namespaces_role.yaml b/helm-chart/kuberay-operator/templates/multiple_namespaces_role.yaml index 488889d508a..4925893bce6 100644 --- a/helm-chart/kuberay-operator/templates/multiple_namespaces_role.yaml +++ b/helm-chart/kuberay-operator/templates/multiple_namespaces_role.yaml @@ -32,6 +32,13 @@ rules: - get - list - update +- apiGroups: + - "" + resources: + - endpoints + verbs: + - get + - list - apiGroups: - "" resources: diff --git a/helm-chart/kuberay-operator/templates/role.yaml b/helm-chart/kuberay-operator/templates/role.yaml index 8c5e0a27d75..cc1e3bf3752 100644 --- a/helm-chart/kuberay-operator/templates/role.yaml +++ b/helm-chart/kuberay-operator/templates/role.yaml @@ -28,6 +28,13 @@ rules: - get - list - update +- apiGroups: + - "" + resources: + - endpoints + verbs: + - get + - list - apiGroups: - "" resources: diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go index 5ace7a92c5c..87b5f29f365 100644 --- a/ray-operator/apis/ray/v1/rayservice_types.go +++ b/ray-operator/apis/ray/v1/rayservice_types.go @@ -71,6 +71,9 @@ type RayServiceStatuses struct { PendingServiceStatus RayServiceStatus `json:"pendingServiceStatus,omitempty"` // ServiceStatus indicates the current RayService status. ServiceStatus ServiceStatus `json:"serviceStatus,omitempty"` + // ActiveServingRayPods indicates the number of Ray Pods that are actively serving or have been selected by the serve service. + // Ray Pods without a proxy actor or those that are unhealthy will not be counted. + ActiveServingRayPods int32 `json:"ActiveServingRayPods,omitempty"` // observedGeneration is the most recent generation observed for this RayService. It corresponds to the // RayService's generation, which is updated on mutation by the API Server. // +optional diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index 3a548918e12..ebb92d98093 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -7227,6 +7227,9 @@ spec: type: object status: properties: + ActiveServingRayPods: + format: int32 + type: integer activeServiceStatus: properties: applicationStatuses: diff --git a/ray-operator/config/rbac/role.yaml b/ray-operator/config/rbac/role.yaml index 5877cf9edb6..c15fdd9bfcc 100644 --- a/ray-operator/config/rbac/role.yaml +++ b/ray-operator/config/rbac/role.yaml @@ -25,6 +25,13 @@ rules: - get - list - update +- apiGroups: + - "" + resources: + - endpoints + verbs: + - get + - list - apiGroups: - "" resources: diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 4fd509adffd..9feb0f18123 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -82,6 +82,7 @@ func NewRayServiceReconciler(mgr manager.Manager, dashboardClientFunc func() uti // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=core,resources=endpoints,verbs=get;list // +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=core,resources=services/status,verbs=get;update;patch // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;create;update @@ -285,6 +286,11 @@ func (r *RayServiceReconciler) inconsistentRayServiceStatuses(oldStatus rayv1.Ra return true } + if oldStatus.ActiveServingRayPods != newStatus.ActiveServingRayPods { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService ActiveServingRayPods changed from %d to %d", oldStatus.ActiveServingRayPods, newStatus.ActiveServingRayPods)) + return true + } + if r.inconsistentRayServiceStatus(oldStatus.ActiveServiceStatus, newStatus.ActiveServiceStatus) { r.Log.Info("inconsistentRayServiceStatus RayService ActiveServiceStatus changed") return true @@ -908,6 +914,9 @@ func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayService newSvc, err = common.BuildHeadServiceForRayService(ctx, *rayServiceInstance, *rayClusterInstance) case utils.ServingService: newSvc, err = common.BuildServeServiceForRayService(ctx, *rayServiceInstance, *rayClusterInstance) + if updateStatusErr := r.updateStatusForActiveServingRayPods(ctx, newSvc.Name, newSvc.Namespace, rayServiceInstance); updateStatusErr != nil { + return updateStatusErr + } default: return fmt.Errorf("unknown service type %v", serviceType) } @@ -919,7 +928,7 @@ func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayService // Retrieve the Service from the Kubernetes cluster with the name and namespace. oldSvc := &corev1.Service{} - err = r.Get(ctx, client.ObjectKey{Name: newSvc.Name, Namespace: rayServiceInstance.Namespace}, oldSvc) + err = r.Get(ctx, client.ObjectKey{Name: newSvc.Name, Namespace: newSvc.Namespace}, oldSvc) if err == nil { // Only update the service if the RayCluster switches. @@ -963,6 +972,21 @@ func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayService return nil } +func (r *RayServiceReconciler) updateStatusForActiveServingRayPods(ctx context.Context, endpointsName string, endpointsNamespace string, rayServiceInstance *rayv1.RayService) error { + numEndpoints := 0 + endpoints := &corev1.Endpoints{} + if err := r.Get(ctx, client.ObjectKey{Name: endpointsName, Namespace: endpointsNamespace}, endpoints); err == nil { + for _, subset := range endpoints.Subsets { + numEndpoints += len(subset.Addresses) + } + } else if !errors.IsNotFound(err) { + r.Log.Error(err, "Fail to retrieve the Kubernetes Endpoints from the cluster!") + return err + } + rayServiceInstance.Status.ActiveServingRayPods = int32(numEndpoints) + return nil +} + func (r *RayServiceReconciler) updateStatusForActiveCluster(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, logger logr.Logger) error { rayServiceInstance.Status.ActiveServiceStatus.RayClusterStatus = rayClusterInstance.Status diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go index cab88874e48..06d41058dd9 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go @@ -13,6 +13,7 @@ type RayServiceStatusesApplyConfiguration struct { ActiveServiceStatus *RayServiceStatusApplyConfiguration `json:"activeServiceStatus,omitempty"` PendingServiceStatus *RayServiceStatusApplyConfiguration `json:"pendingServiceStatus,omitempty"` ServiceStatus *rayv1.ServiceStatus `json:"serviceStatus,omitempty"` + ActiveServingRayPods *int32 `json:"ActiveServingRayPods,omitempty"` ObservedGeneration *int64 `json:"observedGeneration,omitempty"` LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"` } @@ -47,6 +48,14 @@ func (b *RayServiceStatusesApplyConfiguration) WithServiceStatus(value rayv1.Ser return b } +// WithActiveServingRayPods sets the ActiveServingRayPods field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ActiveServingRayPods field is set to the value of the last call. +func (b *RayServiceStatusesApplyConfiguration) WithActiveServingRayPods(value int32) *RayServiceStatusesApplyConfiguration { + b.ActiveServingRayPods = &value + return b +} + // WithObservedGeneration sets the ObservedGeneration field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the ObservedGeneration field is set to the value of the last call.