From ec2953caa6d236c873e7a9994dde2bd5edd92666 Mon Sep 17 00:00:00 2001 From: Benjamin Pineau Date: Mon, 16 Jan 2023 15:32:52 +0100 Subject: [PATCH] gRPC expander: allow realistic server responses, and log errors Expander requests' payloads can be rather heavy under upscale pressure, as they're compounding all candidates options and unschedulable pods that could fit each options. Expander responses are a subset of the requests' payload items. We're allowing ourself to send arbitrary payload sizes (gRPC `defaultClientMaxSendMessageSize` is `math.MaxInt32`), but we're prone to drop expander servers responses to the floor, due to the `4MiB` `defaultClientMaxReceiveMessageSize`. The arbitrary 128MiB value is meant to be huge (enough to support eg. several dozen fat 1MiB pods) but not unlimited. Let me know if you'd rather see that turned to be a command line flag, or an other value. Also logging the possible gRPC call errors, as that of great help to diagnose that kind of issues. --- .../expander/grpcplugin/grpc_client.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cluster-autoscaler/expander/grpcplugin/grpc_client.go b/cluster-autoscaler/expander/grpcplugin/grpc_client.go index 7800bd27013..b31de9fa402 100644 --- a/cluster-autoscaler/expander/grpcplugin/grpc_client.go +++ b/cluster-autoscaler/expander/grpcplugin/grpc_client.go @@ -31,7 +31,10 @@ import ( "google.golang.org/grpc/credentials" ) -const gRPCTimeout = 5 * time.Second +const ( + gRPCTimeout = 5 * time.Second + gRPCMaxRecvMsgSize = 128 << 20 +) type grpcclientstrategy struct { grpcClient protos.ExpanderClient @@ -47,8 +50,6 @@ func NewFilter(expanderCert string, expanderUrl string) expander.Filter { } func createGRPCClient(expanderCert string, expanderUrl string) protos.ExpanderClient { - var dialOpt grpc.DialOption - if expanderCert == "" { log.Fatalf("GRPC Expander Cert not specified, insecure connections not allowed") return nil @@ -58,9 +59,12 @@ func createGRPCClient(expanderCert string, expanderUrl string) protos.ExpanderCl log.Fatalf("Failed to create TLS credentials %v", err) return nil } - dialOpt = grpc.WithTransportCredentials(creds) - klog.V(2).Infof("Dialing: %s with dialopt: %v", expanderUrl, dialOpt) - conn, err := grpc.Dial(expanderUrl, dialOpt) + dialOpts := []grpc.DialOption{ + grpc.WithTransportCredentials(creds), + grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(gRPCMaxRecvMsgSize)), + } + klog.V(2).Infof("Dialing: %s with dialopt: %v", expanderUrl, dialOpts) + conn, err := grpc.Dial(expanderUrl, dialOpts...) if err != nil { log.Fatalf("Fail to dial server: %v", err) return nil @@ -84,7 +88,7 @@ func (g *grpcclientstrategy) BestOptions(expansionOptions []expander.Option, nod defer cancel() bestOptionsResponse, err := g.grpcClient.BestOptions(ctx, &protos.BestOptionsRequest{Options: grpcOptionsSlice, NodeMap: grpcNodeMap}) if err != nil { - klog.V(4).Info("GRPC call timed out, no options filtered") + klog.V(4).Infof("GRPC call failed, no options filtered: %v", err) return expansionOptions }