Use HTTP probes for Ray readiness and liveness probes #2360

Open · wants to merge 1 commit into base: master
51 changes: 10 additions & 41 deletions ray-operator/controllers/ray/common/pod.go
@@ -18,6 +18,7 @@ import (

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
ctrl "sigs.k8s.io/controller-runtime"
)

@@ -248,72 +249,40 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo
 }
 
 func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType rayv1.RayNodeType, creatorCRDType utils.CRDType) {
-	rayAgentRayletHealthCommand := fmt.Sprintf(
-		utils.BaseWgetHealthCommand,
-		utils.DefaultReadinessProbeTimeoutSeconds,
-		utils.DefaultDashboardAgentListenPort,
-		utils.RayAgentRayletHealthPath,
-	)
-	rayDashboardGCSHealthCommand := fmt.Sprintf(
-		utils.BaseWgetHealthCommand,
-		utils.DefaultReadinessProbeFailureThreshold,
-		utils.DefaultDashboardPort,
-		utils.RayDashboardGCSHealthPath,
-	)
-
 	// Generally, the liveness and readiness probes perform the same checks.
 	// For head node => Check GCS and Raylet status.
 	// For worker node => Check Raylet status.
-	commands := []string{}
+	healthCheckPath := utils.RayAgentRayletHealthPath
+	healthCheckPort := intstr.FromInt(utils.DefaultDashboardAgentListenPort)
 	if rayNodeType == rayv1.HeadNode {
-		commands = append(commands, rayAgentRayletHealthCommand, rayDashboardGCSHealthCommand)
-	} else {
-		commands = append(commands, rayAgentRayletHealthCommand)
+		healthCheckPath = utils.RayDashboardGCSHealthPath
+		healthCheckPort = intstr.FromInt(utils.DefaultDashboardPort)
 	}
 
 	if rayContainer.LivenessProbe == nil {
-		probeTimeout := int32(utils.DefaultLivenessProbeTimeoutSeconds)
-		if rayNodeType == rayv1.HeadNode {
-			probeTimeout = int32(utils.DefaultHeadLivenessProbeTimeoutSeconds)
-		}
-
 		rayContainer.LivenessProbe = &corev1.Probe{
 			InitialDelaySeconds: utils.DefaultLivenessProbeInitialDelaySeconds,
-			TimeoutSeconds:      probeTimeout,
+			TimeoutSeconds:      utils.DefaultLivenessProbeTimeoutSeconds,
 			PeriodSeconds:       utils.DefaultLivenessProbePeriodSeconds,
 			SuccessThreshold:    utils.DefaultLivenessProbeSuccessThreshold,
 			FailureThreshold:    utils.DefaultLivenessProbeFailureThreshold,
 		}
-		rayContainer.LivenessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}}
+		rayContainer.LivenessProbe.HTTPGet = &corev1.HTTPGetAction{Path: healthCheckPath, Port: healthCheckPort}

[Review comment from the PR author on the HTTPGet line above]
Using HTTP probes means we can only query one endpoint per probe now. For the head pod this would be /api/gcs_healthz, and for a worker pod it would be /api/local_raylet_healthz. I'm not sure whether not health-checking /api/local_raylet_healthz in the head pod is problematic; it would depend on whether /api/gcs_healthz incorporates raylet health in some way as well.

 	}
 
 	if rayContainer.ReadinessProbe == nil {
-		probeTimeout := int32(utils.DefaultReadinessProbeTimeoutSeconds)
-		if rayNodeType == rayv1.HeadNode {
-			probeTimeout = int32(utils.DefaultHeadReadinessProbeTimeoutSeconds)
-		}
 		rayContainer.ReadinessProbe = &corev1.Probe{
 			InitialDelaySeconds: utils.DefaultReadinessProbeInitialDelaySeconds,
-			TimeoutSeconds:      probeTimeout,
+			TimeoutSeconds:      utils.DefaultReadinessProbeTimeoutSeconds,
 			PeriodSeconds:       utils.DefaultReadinessProbePeriodSeconds,
 			SuccessThreshold:    utils.DefaultReadinessProbeSuccessThreshold,
 			FailureThreshold:    utils.DefaultReadinessProbeFailureThreshold,
 		}
-		rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}}
+		rayContainer.ReadinessProbe.HTTPGet = &corev1.HTTPGetAction{Path: healthCheckPath, Port: healthCheckPort}
 
 		// For worker Pods serving traffic, we need to add an additional HTTP proxy health check for the readiness probe.
 		// Note: head Pod checks the HTTP proxy's health at every rayservice controller reconcile instead of using readiness probe.
 		// See https://github.com/ray-project/kuberay/pull/1808 for reasons.
 		if creatorCRDType == utils.RayServiceCRD && rayNodeType == rayv1.WorkerNode {
 			rayContainer.ReadinessProbe.FailureThreshold = utils.ServeReadinessProbeFailureThreshold
-			rayServeProxyHealthCommand := fmt.Sprintf(
-				utils.BaseWgetHealthCommand,
-				utils.DefaultReadinessProbeInitialDelaySeconds,
-				utils.FindContainerPort(rayContainer, utils.ServingPortName, utils.DefaultServingPort),
-				utils.RayServeProxyHealthPath,
-			)
-			commands = append(commands, rayServeProxyHealthCommand)
-			rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}}
+			rayContainer.ReadinessProbe.HTTPGet = &corev1.HTTPGetAction{Path: utils.RayServeProxyHealthPath, Port: intstr.FromInt(utils.DefaultServingPort)}
 		}
 	}
 }
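
For reference, under the new logic a head Pod that defines no probes of its own gets a liveness probe equivalent to the sketch below (literal values substituted from the defaults in constant.go further down; a worker Pod gets the same shape, but with utils.RayAgentRayletHealthPath on utils.DefaultDashboardAgentListenPort). For contrast, the removed Exec probes rendered utils.BaseWgetHealthCommand (roughly a "wget -T <timeout> -q -O- http://localhost:<port>/<path> | grep success" pipeline) and chained one command per endpoint with "&&", so a single probe could cover both api/local_raylet_healthz and api/gcs_healthz; an HTTPGet action can only hit one path, which is the trade-off raised in the review comment above.

	// Effective default liveness probe for a head Pod after this change (sketch).
	probe := &corev1.Probe{
		InitialDelaySeconds: 30,  // utils.DefaultLivenessProbeInitialDelaySeconds
		TimeoutSeconds:      2,   // utils.DefaultLivenessProbeTimeoutSeconds; the head-specific 5s timeout is removed
		PeriodSeconds:       5,   // utils.DefaultLivenessProbePeriodSeconds
		SuccessThreshold:    1,   // utils.DefaultLivenessProbeSuccessThreshold
		FailureThreshold:    120, // utils.DefaultLivenessProbeFailureThreshold
		ProbeHandler: corev1.ProbeHandler{
			HTTPGet: &corev1.HTTPGetAction{
				Path: utils.RayDashboardGCSHealthPath, // api/gcs_healthz, served on the dashboard port
				Port: intstr.FromInt(utils.DefaultDashboardPort),
			},
		},
	}
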
67 changes: 43 additions & 24 deletions ray-operator/controllers/ray/common/pod_test.go
@@ -1167,35 +1167,33 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
 	podTemplateSpec := DefaultHeadPodTemplate(context.Background(), *cluster, cluster.Spec.HeadGroupSpec, podName, "6379")
 	rayContainer := &podTemplateSpec.Spec.Containers[utils.RayContainerIndex]
 
-	// Test 1: User defines a custom HTTPGet probe.
-	httpGetProbe := corev1.Probe{
+	// Test 1: User defines a custom Exec probe to override default HTTP probe.
+	execProbe := corev1.Probe{
 		ProbeHandler: corev1.ProbeHandler{
-			HTTPGet: &corev1.HTTPGetAction{
-				// Check Raylet status
-				Path: fmt.Sprintf("/%s", utils.RayAgentRayletHealthPath),
-				Port: intstr.FromInt(utils.DefaultDashboardAgentListenPort),
+			Exec: &corev1.ExecAction{
+				Command: []string{"foo", "bar"},
 			},
 		},
 	}
 
-	rayContainer.LivenessProbe = &httpGetProbe
-	rayContainer.ReadinessProbe = &httpGetProbe
+	rayContainer.LivenessProbe = &execProbe
+	rayContainer.ReadinessProbe = &execProbe
 	initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, "")
-	assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
-	assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
-	assert.Nil(t, rayContainer.LivenessProbe.Exec)
-	assert.Nil(t, rayContainer.ReadinessProbe.Exec)
+	assert.NotNil(t, rayContainer.LivenessProbe.Exec)
+	assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
+	assert.Nil(t, rayContainer.LivenessProbe.HTTPGet)
+	assert.Nil(t, rayContainer.ReadinessProbe.HTTPGet)
 
-	// Test 2: User does not define a custom probe. KubeRay will inject Exec probe for worker pod.
+	// Test 2: User does not define a custom probe. KubeRay will inject HTTP probe for worker pod.
 	// Here we test the case where the Ray Pod originates from RayServiceCRD,
 	// implying that an additional serve health check will be added to the readiness probe.
 	rayContainer.LivenessProbe = nil
 	rayContainer.ReadinessProbe = nil
 	initLivenessAndReadinessProbe(rayContainer, rayv1.WorkerNode, utils.RayServiceCRD)
-	assert.NotNil(t, rayContainer.LivenessProbe.Exec)
-	assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
-	assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
-	assert.True(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
+	assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
+	assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
+	assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayServeProxyHealthPath)
+	assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultServingPort))
 	assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
 	assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)
 
@@ -1205,13 +1203,34 @@
 	rayContainer.LivenessProbe = nil
 	rayContainer.ReadinessProbe = nil
 	initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayServiceCRD)
-	assert.NotNil(t, rayContainer.LivenessProbe.Exec)
-	assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
-	// head pod should not have Ray Serve proxy health probes
-	assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
-	assert.False(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
-	assert.Equal(t, int32(5), rayContainer.LivenessProbe.TimeoutSeconds)
-	assert.Equal(t, int32(5), rayContainer.ReadinessProbe.TimeoutSeconds)
+	assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
+	assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
+	assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayDashboardGCSHealthPath)
+	assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultDashboardPort))
+	assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
+	assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)
+
+	// Test 4: User does not define custom probe. Pod is a worker Pod for a RayJob.
+	rayContainer.LivenessProbe = nil
+	rayContainer.ReadinessProbe = nil
+	initLivenessAndReadinessProbe(rayContainer, rayv1.WorkerNode, utils.RayJobCRD)
+	assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
+	assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
+	assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayAgentRayletHealthPath)
+	assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultDashboardAgentListenPort))
+	assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
+	assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)
+
+	// Test 5: User does not define custom probe. Pod is a head Pod for a RayJob.
+	rayContainer.LivenessProbe = nil
+	rayContainer.ReadinessProbe = nil
+	initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayJobCRD)
+	assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
+	assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
+	assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayDashboardGCSHealthPath)
+	assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultDashboardPort))
+	assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
+	assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)
 }

 func TestGenerateRayStartCommand(t *testing.T) {
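
The new and updated test cases can be run locally with standard Go tooling, e.g. (assuming the Go module lives under ray-operator/, as the file paths above suggest):

	cd ray-operator && go test ./controllers/ray/common/ -run TestInitLivenessAndReadinessProbe -v
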
18 changes: 7 additions & 11 deletions ray-operator/controllers/ray/utils/constant.go
@@ -151,21 +151,17 @@ const (
 	// Ray FT default readiness probe values
 	DefaultReadinessProbeInitialDelaySeconds = 10
 	DefaultReadinessProbeTimeoutSeconds      = 2
-	// Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz)
-	DefaultHeadReadinessProbeTimeoutSeconds  = 5
-	DefaultReadinessProbePeriodSeconds       = 5
-	DefaultReadinessProbeSuccessThreshold    = 1
-	DefaultReadinessProbeFailureThreshold    = 10
-	ServeReadinessProbeFailureThreshold      = 1
+	DefaultReadinessProbePeriodSeconds    = 5
+	DefaultReadinessProbeSuccessThreshold = 1
+	DefaultReadinessProbeFailureThreshold = 10
+	ServeReadinessProbeFailureThreshold   = 1
 
 	// Ray FT default liveness probe values
 	DefaultLivenessProbeInitialDelaySeconds = 30
 	DefaultLivenessProbeTimeoutSeconds      = 2
-	// Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz)
-	DefaultHeadLivenessProbeTimeoutSeconds  = 5
-	DefaultLivenessProbePeriodSeconds       = 5
-	DefaultLivenessProbeSuccessThreshold    = 1
-	DefaultLivenessProbeFailureThreshold    = 120
+	DefaultLivenessProbePeriodSeconds    = 5
+	DefaultLivenessProbeSuccessThreshold = 1
+	DefaultLivenessProbeFailureThreshold = 120

 	// Ray health check related configurations
 	// Note: Since the Raylet process and the dashboard agent process are fate-sharing,
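
As a worked example of what these defaults imply: with no user overrides, a Pod whose health endpoint stops answering is marked unready after about DefaultReadinessProbePeriodSeconds × DefaultReadinessProbeFailureThreshold = 5 s × 10 = 50 s, while the container is only restarted after 5 s × 120 = 600 s of consecutive liveness failures (and liveness probing itself starts only after the 30 s initial delay). RayService worker Pods are the exception: ServeReadinessProbeFailureThreshold = 1 takes a serving Pod out of rotation after a single failed proxy health check.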