Skip to content

Commit

Permalink
Added precheck for scheduler (flyteorg#254)
Browse files Browse the repository at this point in the history
* Added new command in scheduler for pre-check

Signed-off-by: Yuvraj <[email protected]>
  • Loading branch information
yindia authored Sep 14, 2021
1 parent 6eacbd0 commit 5fbaa13
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 6 deletions.
1 change: 0 additions & 1 deletion flyteadmin/cmd/entrypoints/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ func newGRPCServer(ctx context.Context, cfg *config.ServerConfig, authCtx interf
healthServer := health.NewServer()
healthServer.SetServingStatus("", grpc_health_v1.HealthCheckResponse_SERVING)
grpc_health_v1.RegisterHealthServer(grpcServer, healthServer)

if cfg.GrpcServerReflection {
reflection.Register(grpcServer)
}
Expand Down
89 changes: 89 additions & 0 deletions flyteadmin/cmd/scheduler/entrypoints/precheck.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package entrypoints

import (
"context"
"fmt"
"time"

"github.com/flyteorg/flytestdlib/logger"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/metadata"
"google.golang.org/grpc/status"

"github.com/avast/retry-go"
adminClient "github.com/flyteorg/flyteidl/clients/go/admin"
"github.com/pkg/errors"
healthpb "google.golang.org/grpc/health/grpc_health_v1"

"github.com/spf13/cobra"
)

// timeout is the deadline the precheck command applies when contacting the
// admin service; it is also the duration reported in the timeout messages.
const timeout time.Duration = 30 * time.Second

// Message templates used by the precheck command for its log lines and for
// the errors it returns. The templates containing verbs are format strings.
const (
	// healthCheckSuccess is logged once the health check reports SERVING.
	healthCheckSuccess = "Health check passed, Flyteadmin is up and running"
	// healthCheckError takes the non-SERVING status returned by the service.
	healthCheckError = "Health check failed with status %v"
	// deadlineError takes the timeout that elapsed during the health RPC.
	deadlineError = "timeout: health rpc did not complete within %v"
	// connectionError takes the endpoint and the underlying dial error.
	connectionError = "error: failed to connect service at %q: %+v"
	// timeoutError takes the endpoint and the timeout that elapsed while dialing.
	timeoutError = "timeout: failed to connect service %q within %v"
)

// preCheckRunCmd implements the "precheck" subcommand. It dials the admin
// endpoint taken from the admin client configuration and calls the standard
// gRPC health service for the empty service name, retrying on failure.
//
// RunE returns nil once the service reports SERVING. It returns the error
// produced by retry.Do otherwise: after the attempts are exhausted, or
// immediately when the server answers Unimplemented (no health service
// registered), since retrying cannot fix that.
var preCheckRunCmd = &cobra.Command{
	Use:   "precheck",
	Short: "This command will check pre requirement for scheduler",
	RunE: func(cmd *cobra.Command, args []string) error {
		opts := []grpc.DialOption{
			grpc.WithUserAgent("grpc_health_probe"),
			// Block so that a dial failure surfaces here rather than on the first RPC.
			grpc.WithBlock(),
			grpc.WithInsecure(),
		}
		ctx := context.Background()
		endpoint := adminClient.GetConfig(ctx).Endpoint.String()

		// fail logs the formatted message and returns it as an error, so the
		// log line and the returned error always carry the same text.
		fail := func(format string, args ...interface{}) error {
			logger.Errorf(ctx, format, args...)
			return errors.New(fmt.Sprintf(format, args...))
		}

		err := retry.Do(
			func() error {
				dialCtx, dialCancel := context.WithTimeout(ctx, timeout)
				defer dialCancel()
				conn, err := grpc.DialContext(dialCtx, endpoint, opts...)
				if err != nil {
					if err == context.DeadlineExceeded {
						return fail(timeoutError, endpoint, timeout)
					}
					return fail(connectionError, endpoint, err)
				}
				// Release the connection after every attempt; otherwise each
				// retry would leak one client connection.
				defer func() {
					if cerr := conn.Close(); cerr != nil {
						logger.Errorf(ctx, "failed to close connection to %q: %v", endpoint, cerr)
					}
				}()

				// Bound the health RPC as well, so an unresponsive server
				// cannot block the precheck indefinitely and deadlineError
				// reports a deadline that was actually applied.
				rpcCtx, rpcCancel := context.WithTimeout(ctx, timeout)
				defer rpcCancel()
				rpcCtx = metadata.NewOutgoingContext(rpcCtx, metadata.MD{})

				resp, err := healthpb.NewHealthClient(conn).Check(rpcCtx,
					&healthpb.HealthCheckRequest{
						Service: "",
					})
				if err != nil {
					if stat, ok := status.FromError(err); ok {
						switch stat.Code() {
						case codes.Unimplemented:
							// The server does not expose the health service;
							// stop retrying.
							return retry.Unrecoverable(err)
						case codes.DeadlineExceeded:
							return fail(deadlineError, timeout)
						}
					}
					return err
				}
				if resp.GetStatus() != healthpb.HealthCheckResponse_SERVING {
					return fail(healthCheckError, resp.GetStatus())
				}
				return nil
			},
			// NOTE(review): BackOffDelay is evaluated once here and its result is
			// passed to retry.Delay as a fixed base delay; confirm whether
			// selecting the strategy via retry.DelayType(retry.BackOffDelay)
			// with an explicit base delay was intended.
			retry.Delay(retry.BackOffDelay(10, nil, &retry.Config{})),
		)
		if err != nil {
			return err
		}

		logger.Printf(ctx, healthCheckSuccess)
		return nil
	},
}

// init registers preCheckRunCmd on RootCmd when the package is loaded.
func init() {
RootCmd.AddCommand(preCheckRunCmd)
}
4 changes: 2 additions & 2 deletions flyteadmin/cmd/scheduler/main.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
package main

import (
entrypoints2 "github.com/flyteorg/flyteadmin/cmd/scheduler/entrypoints"
"github.com/flyteorg/flyteadmin/cmd/scheduler/entrypoints"
"github.com/golang/glog"
)

func main() {
glog.V(2).Info("Beginning Flyte Scheduler")
err := entrypoints2.Execute()
err := entrypoints.Execute()
if err != nil {
panic(err)
}
Expand Down
2 changes: 1 addition & 1 deletion flyteadmin/flyteadmin_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -205,4 +205,4 @@ qualityOfService:
staging: MEDIUM
# by default production has an UNDEFINED tier when it is omitted from the configuration
namespace_mapping:
template: "{{ project }}-{{ domain }}" # Default namespace mapping template.
template: "{{ project }}-{{ domain }}" # Default namespace mapping template.
1 change: 1 addition & 0 deletions flyteadmin/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/Azure/go-autorest/autorest v0.11.18 // indirect
github.com/NYTimes/gizmo v1.3.6
github.com/Selvatico/go-mocket v1.0.7
github.com/avast/retry-go v3.0.0+incompatible
github.com/aws/aws-sdk-go v1.37.31
github.com/benbjohnson/clock v1.1.0
github.com/bradfitz/gomemcache v0.0.0-20190913173617-a41fca850d0b // indirect
Expand Down
4 changes: 2 additions & 2 deletions flyteadmin/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ github.com/asaskevich/govalidator v0.0.0-20180720115003-f9ffefc3facf/go.mod h1:l
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
github.com/asaskevich/govalidator v0.0.0-20200428143746-21a406dcc535 h1:4daAzAu0S6Vi7/lbWECcX0j45yZReDZ56BQsrVBOEEY=
github.com/asaskevich/govalidator v0.0.0-20200428143746-21a406dcc535/go.mod h1:oGkLhpf+kjZl6xBf758TQhh5XrAeiJv/7FRz/2spLIg=
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/aws/amazon-sagemaker-operator-for-k8s v1.0.1-0.20210303003444-0fb33b1fd49d/go.mod h1:mZUP7GJmjiWtf8v3FD1X/QdK08BqyeH/1Ejt0qhNzCs=
github.com/aws/aws-lambda-go v1.13.3/go.mod h1:4UKl9IzQMoD+QF79YdCuzCwp8VbmG4VAQwij/eHl5CU=
github.com/aws/aws-sdk-go v1.23.4/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
Expand Down Expand Up @@ -1441,7 +1443,6 @@ go.uber.org/atomic v1.5.1/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/goleak v1.1.10 h1:z+mqJhf6ss6BSfSM671tgKyZBFPTTJM+HLxnhPC3wu0=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4=
Expand Down Expand Up @@ -2082,7 +2083,6 @@ honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWh
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg=
honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.0.1-2020.1.4 h1:UoveltGrhghAA7ePc+e+QYDHXrBps2PqFZiHkGR/xK8=
honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
howett.net/plist v0.0.0-20181124034731-591f970eefbb/go.mod h1:vMygbs4qMhSZSc4lCUl2OEE+rDiIIJAIdR4m7MiMcm0=
k8s.io/api v0.0.0-20210217171935-8e2decd92398/go.mod h1:60tmSUpHxGPFerNHbo/ayI2lKxvtrhbxFyXuEIWJd78=
Expand Down

0 comments on commit 5fbaa13

Please sign in to comment.