From 981c943b568325531d785db493fadd5d9b34480e Mon Sep 17 00:00:00 2001 From: Kai-Hsun Chen Date: Tue, 16 Apr 2024 17:05:19 -0700 Subject: [PATCH] [Hotfix] Increase the timeout of the ProxyActor health check (#2082) --- .../controllers/ray/rayservice_controller.go | 2 +- .../ray/utils/fake_httpproxy_httpclient.go | 3 ++- .../ray/utils/httpproxy_httpclient.go | 25 +++++++++++-------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index e2f45ea930..fbf08d5b72 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -1138,7 +1138,7 @@ func (r *RayServiceReconciler) labelHeadPodForServeStatus(ctx context.Context, r originalLabels[key] = value } - if httpProxyClient.CheckHealth() == nil { + if err = httpProxyClient.CheckProxyActorHealth(ctx); err == nil { headPod.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceTrue } else { headPod.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceFalse diff --git a/ray-operator/controllers/ray/utils/fake_httpproxy_httpclient.go b/ray-operator/controllers/ray/utils/fake_httpproxy_httpclient.go index 4df3cabff0..f0761b31d5 100644 --- a/ray-operator/controllers/ray/utils/fake_httpproxy_httpclient.go +++ b/ray-operator/controllers/ray/utils/fake_httpproxy_httpclient.go @@ -1,6 +1,7 @@ package utils import ( + "context" "fmt" "net/http" "time" @@ -21,7 +22,7 @@ func (r *FakeRayHttpProxyClient) SetHostIp(hostIp string, port int) { r.httpProxyURL = fmt.Sprintf("http://%s:%d", hostIp, port) } -func (r *FakeRayHttpProxyClient) CheckHealth() error { +func (r *FakeRayHttpProxyClient) CheckProxyActorHealth(ctx context.Context) error { // TODO: test check return error cases. // Always return successful. return nil diff --git a/ray-operator/controllers/ray/utils/httpproxy_httpclient.go b/ray-operator/controllers/ray/utils/httpproxy_httpclient.go index 9856c5d04f..e7fabc34f3 100644 --- a/ray-operator/controllers/ray/utils/httpproxy_httpclient.go +++ b/ray-operator/controllers/ray/utils/httpproxy_httpclient.go @@ -1,15 +1,18 @@ package utils import ( + "context" "fmt" "io" "net/http" "time" + + ctrl "sigs.k8s.io/controller-runtime" ) type RayHttpProxyClientInterface interface { InitClient() - CheckHealth() error + CheckProxyActorHealth(ctx context.Context) error SetHostIp(hostIp string, port int) } @@ -24,7 +27,7 @@ type RayHttpProxyClient struct { func (r *RayHttpProxyClient) InitClient() { r.client = http.Client{ - Timeout: 20 * time.Millisecond, + Timeout: 2 * time.Second, } } @@ -32,21 +35,21 @@ func (r *RayHttpProxyClient) SetHostIp(hostIp string, port int) { r.httpProxyURL = fmt.Sprintf("http://%s:%d/", hostIp, port) } -func (r *RayHttpProxyClient) CheckHealth() error { - req, err := http.NewRequest("GET", r.httpProxyURL+RayServeProxyHealthPath, nil) - if err != nil { - return err - } - - resp, err := r.client.Do(req) +// CheckProxyActorHealth checks the health status of the Ray Serve proxy actor. +func (r *RayHttpProxyClient) CheckProxyActorHealth(ctx context.Context) error { + logger := ctrl.LoggerFrom(ctx) + resp, err := r.client.Get(r.httpProxyURL + RayServeProxyHealthPath) if err != nil { + logger.Error(err, "CheckProxyActorHealth fails.") return err } defer resp.Body.Close() body, _ := io.ReadAll(resp.Body) - if resp.StatusCode < 200 || resp.StatusCode > 299 { - return fmt.Errorf("RayHttpProxyClient CheckHealth fail: %s %s", resp.Status, string(body)) + if resp.StatusCode != 200 { + err := fmt.Errorf("CheckProxyActorHealth fails: Status code is not 200") + logger.Error(err, "CheckProxyActorHealth fails.", "status code", resp.StatusCode, "status", resp.Status, "body", string(body)) + return err } return nil