diff --git a/cmd/entrypoints/serve.go b/cmd/entrypoints/serve.go index 6cd4c47125..7528f527b1 100644 --- a/cmd/entrypoints/serve.go +++ b/cmd/entrypoints/serve.go @@ -123,7 +123,7 @@ func newGRPCServer(ctx context.Context, cfg *config.ServerConfig, authCtx interf } healthServer := health.NewServer() - healthServer.SetServingStatus("", grpc_health_v1.HealthCheckResponse_SERVING) + healthServer.SetServingStatus("flyteadmin", grpc_health_v1.HealthCheckResponse_SERVING) grpc_health_v1.RegisterHealthServer(grpcServer, healthServer) if cfg.GrpcServerReflection { reflection.Register(grpcServer) diff --git a/cmd/scheduler/entrypoints/precheck.go b/cmd/scheduler/entrypoints/precheck.go index 9447d3a5e1..9593b93c91 100644 --- a/cmd/scheduler/entrypoints/precheck.go +++ b/cmd/scheduler/entrypoints/precheck.go @@ -3,83 +3,62 @@ package entrypoints import ( "context" "fmt" - "time" + "github.com/flyteorg/flyteadmin/pkg/runtime" + "github.com/flyteorg/flyteidl/clients/go/admin" "github.com/flyteorg/flytestdlib/logger" - "google.golang.org/grpc" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/metadata" - "google.golang.org/grpc/status" - - "github.com/avast/retry-go" - adminClient "github.com/flyteorg/flyteidl/clients/go/admin" - "github.com/pkg/errors" - healthpb "google.golang.org/grpc/health/grpc_health_v1" "github.com/spf13/cobra" + "google.golang.org/grpc/health/grpc_health_v1" + "k8s.io/client-go/util/retry" ) const ( - timeout = 30 * time.Second - timeoutError = "timeout: failed to connect service %q within %v" - connectionError = "error: failed to connect service at %q: %+v" - deadlineError = "timeout: health rpc did not complete within %v" - healthCheckError = "Health check failed with status %v" healthCheckSuccess = "Health check passed, Flyteadmin is up and running" + healthCheckError = "Health check failed with status %v" ) var preCheckRunCmd = &cobra.Command{ Use: "precheck", Short: "This command will check pre requirement for scheduler", RunE: func(cmd *cobra.Command, args []string) error { - opts := []grpc.DialOption{ - grpc.WithUserAgent("grpc_health_probe"), - grpc.WithBlock(), - grpc.WithInsecure(), - } ctx := context.Background() - config := adminClient.GetConfig(ctx) - err := retry.Do( + appConfig := runtime.NewApplicationConfigurationProvider() + opts := appConfig.GetSchedulerConfig().GetPrecheckBackoff() + + err := retry.OnError(opts, + func(err error) bool { + logger.Errorf(ctx, "Attempt failed due to %v", err) + return err != nil + }, func() error { - dialCtx, dialCancel := context.WithTimeout(ctx, timeout) - defer dialCancel() - conn, err := grpc.DialContext(dialCtx, config.Endpoint.String(), opts...) + clientSet, err := admin.ClientSetBuilder().WithConfig(admin.GetConfig(ctx)).Build(ctx) + if err != nil { - if err == context.DeadlineExceeded { - logger.Errorf(ctx, timeoutError, config.Endpoint.String(), timeout) - return errors.New(fmt.Sprintf(timeoutError, config.Endpoint.String(), timeout)) - } - logger.Errorf(ctx, connectionError, config.Endpoint.String(), err) - return errors.New(fmt.Sprintf(connectionError, config.Endpoint.String(), err)) + logger.Errorf(ctx, "Flyte native scheduler precheck failed due to %v\n", err) + return err } - rpcCtx := metadata.NewOutgoingContext(ctx, metadata.MD{}) - resp, err := healthpb.NewHealthClient(conn).Check(rpcCtx, - &healthpb.HealthCheckRequest{ - Service: "", - }) + + healthCheckResponse, err := clientSet.HealthServiceClient().Check(ctx, + &grpc_health_v1.HealthCheckRequest{Service: "flyteadmin"}) if err != nil { - if stat, ok := status.FromError(err); ok && stat.Code() == codes.Unimplemented { - return retry.Unrecoverable(err) - } else if stat, ok := status.FromError(err); ok && stat.Code() == codes.DeadlineExceeded { - logger.Errorf(ctx, deadlineError, timeout) - return errors.New(fmt.Sprintf(deadlineError, timeout)) - } return err } - if resp.GetStatus() != healthpb.HealthCheckResponse_SERVING { - logger.Errorf(ctx, healthCheckError, resp.GetStatus()) - return errors.New(fmt.Sprintf(healthCheckError, resp.GetStatus())) + if healthCheckResponse.GetStatus() != grpc_health_v1.HealthCheckResponse_SERVING { + logger.Errorf(ctx, healthCheckError, healthCheckResponse.GetStatus()) + return fmt.Errorf(healthCheckError, healthCheckResponse.GetStatus()) } + logger.Infof(ctx, "Health check response is %v", healthCheckResponse) return nil }, - retry.Delay(retry.BackOffDelay(10, nil, &retry.Config{})), ) + if err != nil { return err } - logger.Printf(ctx, healthCheckSuccess) + logger.Infof(ctx, healthCheckSuccess) return nil }, } diff --git a/go.mod b/go.mod index f9fce19cb4..3343680dac 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,62 @@ go 1.17 require ( cloud.google.com/go v0.79.0 - cloud.google.com/go/pubsub v1.10.1 // indirect cloud.google.com/go/storage v1.14.0 + github.com/NYTimes/gizmo v1.3.6 + github.com/Selvatico/go-mocket v1.0.7 + github.com/aws/aws-sdk-go v1.37.31 + github.com/benbjohnson/clock v1.1.0 + github.com/coreos/go-oidc v2.2.1+incompatible + github.com/evanphx/json-patch v4.9.0+incompatible + github.com/flyteorg/flyteidl v0.21.8 + github.com/flyteorg/flyteplugins v0.7.0 + github.com/flyteorg/flytepropeller v0.14.11 + github.com/flyteorg/flytestdlib v0.3.36 + github.com/ghodss/yaml v1.0.0 + github.com/gogo/protobuf v1.3.2 + github.com/golang-jwt/jwt/v4 v4.1.0 + github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b + github.com/golang/protobuf v1.4.3 + github.com/google/uuid v1.2.0 + github.com/googleapis/gax-go/v2 v2.0.5 + github.com/gorilla/handlers v1.5.1 + github.com/gorilla/securecookie v1.1.1 + github.com/graymeta/stow v0.2.7 + github.com/grpc-ecosystem/go-grpc-middleware v1.2.2 + github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 + github.com/grpc-ecosystem/grpc-gateway v1.16.0 + github.com/gtank/cryptopasta v0.0.0-20170601214702-1f550f6f2f69 + github.com/jinzhu/gorm v1.9.16 + github.com/lestrrat-go/jwx v1.1.6 + github.com/lib/pq v1.10.0 + github.com/magiconair/properties v1.8.4 + github.com/mitchellh/mapstructure v1.4.1 + github.com/ory/fosite v0.39.0 + github.com/ory/x v0.0.162 + github.com/pkg/errors v0.9.1 + github.com/prometheus/client_golang v1.9.0 + github.com/prometheus/client_model v0.2.0 + github.com/qor/validations v0.0.0-20171228122639-f364bca61b46 + github.com/robfig/cron/v3 v3.0.0 + github.com/sendgrid/sendgrid-go v3.10.0+incompatible + github.com/spf13/cobra v1.1.3 + github.com/spf13/pflag v1.0.5 + github.com/stretchr/testify v1.7.0 + golang.org/x/oauth2 v0.0.0-20210313182246-cd4f82c27b84 + golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba + google.golang.org/api v0.42.0 + google.golang.org/genproto v0.0.0-20210315173758-2651cd453018 + google.golang.org/grpc v1.36.0 + google.golang.org/protobuf v1.25.0 + gopkg.in/gormigrate.v1 v1.6.0 + k8s.io/api v0.20.4 + k8s.io/apimachinery v0.20.4 + k8s.io/client-go v0.20.2 + sigs.k8s.io/controller-runtime v0.8.3 +) + +require ( + cloud.google.com/go/pubsub v1.10.1 // indirect github.com/Azure/azure-sdk-for-go v52.4.0+incompatible // indirect github.com/Azure/go-autorest v14.2.0+incompatible // indirect github.com/Azure/go-autorest/autorest v0.11.18 // indirect @@ -13,60 +67,34 @@ require ( github.com/Azure/go-autorest/autorest/date v0.3.0 // indirect github.com/Azure/go-autorest/logger v0.2.1 // indirect github.com/Azure/go-autorest/tracing v0.6.0 // indirect - github.com/NYTimes/gizmo v1.3.6 - github.com/Selvatico/go-mocket v1.0.7 github.com/asaskevich/govalidator v0.0.0-20200428143746-21a406dcc535 // indirect - github.com/avast/retry-go v3.0.0+incompatible - github.com/aws/aws-sdk-go v1.37.31 - github.com/benbjohnson/clock v1.1.0 github.com/benlaurie/objecthash v0.0.0-20180202135721-d1e3d6079fc1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bradfitz/gomemcache v0.0.0-20190913173617-a41fca850d0b // indirect github.com/cespare/xxhash v1.1.0 // indirect github.com/cespare/xxhash/v2 v2.1.1 // indirect github.com/coocood/freecache v1.1.1 // indirect - github.com/coreos/go-oidc v2.2.1+incompatible github.com/davecgh/go-spew v1.1.1 // indirect github.com/decred/dcrd/dcrec/secp256k1/v3 v3.0.0 // indirect github.com/dgraph-io/ristretto v0.0.3 // indirect github.com/dgrijalva/jwt-go v3.2.0+incompatible // indirect - github.com/evanphx/json-patch v4.9.0+incompatible github.com/fatih/color v1.10.0 // indirect github.com/felixge/httpsnoop v1.0.1 // indirect - github.com/flyteorg/flyteidl v0.21.8 - github.com/flyteorg/flyteplugins v0.7.0 - github.com/flyteorg/flytepropeller v0.14.11 - github.com/flyteorg/flytestdlib v0.3.36 github.com/form3tech-oss/jwt-go v3.2.2+incompatible // indirect github.com/fsnotify/fsnotify v1.4.9 // indirect - github.com/ghodss/yaml v1.0.0 github.com/go-logr/logr v0.4.0 // indirect github.com/go-test/deep v1.0.7 // indirect github.com/goccy/go-json v0.4.8 // indirect github.com/gofrs/uuid v4.0.0+incompatible // indirect - github.com/gogo/protobuf v1.3.2 - github.com/golang-jwt/jwt/v4 v4.1.0 - github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect - github.com/golang/protobuf v1.4.3 github.com/google/go-cmp v0.5.5 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/uuid v1.2.0 - github.com/googleapis/gax-go/v2 v2.0.5 github.com/googleapis/gnostic v0.5.4 // indirect - github.com/gorilla/handlers v1.5.1 - github.com/gorilla/securecookie v1.1.1 github.com/gorilla/websocket v1.4.2 // indirect - github.com/graymeta/stow v0.2.7 - github.com/grpc-ecosystem/go-grpc-middleware v1.2.2 - github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 - github.com/grpc-ecosystem/grpc-gateway v1.16.0 - github.com/gtank/cryptopasta v0.0.0-20170601214702-1f550f6f2f69 github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/imdario/mergo v0.3.12 // indirect github.com/inconshreveable/mousetrap v1.0.0 // indirect - github.com/jinzhu/gorm v1.9.16 github.com/jinzhu/inflection v1.0.0 // indirect github.com/jinzhu/now v1.1.1 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect @@ -76,82 +104,56 @@ require ( github.com/lestrrat-go/backoff/v2 v2.0.7 // indirect github.com/lestrrat-go/httpcc v1.0.0 // indirect github.com/lestrrat-go/iter v1.0.1 // indirect - github.com/lestrrat-go/jwx v1.1.6 github.com/lestrrat-go/option v1.0.0 // indirect - github.com/lib/pq v1.10.0 - github.com/magiconair/properties v1.8.4 github.com/mattn/go-colorable v0.1.8 // indirect github.com/mattn/go-isatty v0.0.12 // indirect github.com/mattn/goveralls v0.0.6 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect - github.com/mitchellh/mapstructure v1.4.1 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.1 // indirect github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect github.com/ncw/swift v1.0.53 // indirect - github.com/ory/fosite v0.39.0 github.com/ory/go-acc v0.2.5 // indirect github.com/ory/go-convenience v0.1.0 // indirect github.com/ory/viper v1.7.5 // indirect - github.com/ory/x v0.0.162 github.com/pborman/uuid v1.2.0 // indirect github.com/pelletier/go-toml v1.8.1 // indirect github.com/pkg/browser v0.0.0-20210115035449-ce105d075bb4 // indirect - github.com/pkg/errors v0.9.1 github.com/pmezard/go-difflib v1.0.0 // indirect github.com/pquerna/cachecontrol v0.0.0-20201205024021-ac21108117ac // indirect - github.com/prometheus/client_golang v1.9.0 - github.com/prometheus/client_model v0.2.0 github.com/prometheus/common v0.19.0 // indirect github.com/prometheus/procfs v0.6.0 // indirect github.com/qor/qor v1.2.0 // indirect - github.com/qor/validations v0.0.0-20171228122639-f364bca61b46 - github.com/robfig/cron/v3 v3.0.0 github.com/sendgrid/rest v2.6.4+incompatible // indirect - github.com/sendgrid/sendgrid-go v3.10.0+incompatible github.com/sirupsen/logrus v1.8.1 // indirect github.com/spf13/afero v1.5.1 // indirect github.com/spf13/cast v1.3.1 // indirect - github.com/spf13/cobra v1.1.3 github.com/spf13/jwalterweatherman v1.1.0 // indirect - github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.7.1 // indirect github.com/stretchr/objx v0.3.0 // indirect - github.com/stretchr/testify v1.7.0 github.com/subosito/gotenv v1.2.0 // indirect go.opencensus.io v0.23.0 // indirect golang.org/x/crypto v0.0.0-20210314154223-e6e6c4f2bb5b // indirect golang.org/x/lint v0.0.0-20201208152925-83fdc39ff7b5 // indirect golang.org/x/mod v0.4.2 // indirect golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4 // indirect - golang.org/x/oauth2 v0.0.0-20210313182246-cd4f82c27b84 golang.org/x/sync v0.0.0-20210220032951-036812b2e83c // indirect golang.org/x/sys v0.0.0-20210510120138-977fb7262007 // indirect golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d // indirect golang.org/x/text v0.3.5 // indirect - golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba golang.org/x/tools v0.1.2 // indirect golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect - google.golang.org/api v0.42.0 google.golang.org/appengine v1.6.7 // indirect - google.golang.org/genproto v0.0.0-20210315173758-2651cd453018 - google.golang.org/grpc v1.36.0 google.golang.org/grpc/examples v0.0.0-20210315211313-1e7119b13689 // indirect - google.golang.org/protobuf v1.25.0 gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b // indirect - gopkg.in/gormigrate.v1 v1.6.0 gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.62.0 // indirect gopkg.in/square/go-jose.v2 v2.5.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect - k8s.io/api v0.20.4 - k8s.io/apimachinery v0.20.4 - k8s.io/client-go v0.20.2 k8s.io/klog/v2 v2.8.0 // indirect k8s.io/kube-openapi v0.0.0-20210305164622-f622666832c1 // indirect k8s.io/utils v0.0.0-20210305010621-2afb4311ab10 // indirect - sigs.k8s.io/controller-runtime v0.8.3 sigs.k8s.io/structured-merge-diff/v4 v4.1.0 // indirect sigs.k8s.io/yaml v1.2.0 // indirect ) diff --git a/go.sum b/go.sum index a88cbbe0e8..cbd1ddc992 100644 --- a/go.sum +++ b/go.sum @@ -146,8 +146,6 @@ github.com/asaskevich/govalidator v0.0.0-20180720115003-f9ffefc3facf/go.mod h1:l github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/asaskevich/govalidator v0.0.0-20200428143746-21a406dcc535 h1:4daAzAu0S6Vi7/lbWECcX0j45yZReDZ56BQsrVBOEEY= github.com/asaskevich/govalidator v0.0.0-20200428143746-21a406dcc535/go.mod h1:oGkLhpf+kjZl6xBf758TQhh5XrAeiJv/7FRz/2spLIg= -github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= -github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/amazon-sagemaker-operator-for-k8s v1.0.1-0.20210303003444-0fb33b1fd49d/go.mod h1:mZUP7GJmjiWtf8v3FD1X/QdK08BqyeH/1Ejt0qhNzCs= github.com/aws/aws-lambda-go v1.13.3/go.mod h1:4UKl9IzQMoD+QF79YdCuzCwp8VbmG4VAQwij/eHl5CU= github.com/aws/aws-sdk-go v1.23.4/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= @@ -311,8 +309,6 @@ github.com/felixge/httpsnoop v1.0.1 h1:lvB5Jl89CsZtGIWuTcDM1E/vkVs49/Ml7JJe07l8S github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/flyteorg/flyteidl v0.21.0/go.mod h1:576W2ViEyjTpT+kEVHAGbrTP3HARNUZ/eCwrNPmdx9U= github.com/flyteorg/flyteidl v0.21.1/go.mod h1:576W2ViEyjTpT+kEVHAGbrTP3HARNUZ/eCwrNPmdx9U= -github.com/flyteorg/flyteidl v0.21.4 h1:gtJK5rX2ydLAo2xLRHHznOSLuLHrRRdXDbpEAlxluhk= -github.com/flyteorg/flyteidl v0.21.4/go.mod h1:576W2ViEyjTpT+kEVHAGbrTP3HARNUZ/eCwrNPmdx9U= github.com/flyteorg/flyteidl v0.21.8 h1:dICKVt3bwhkjm3l/knZIczkTiG+TNAOLQ/4ytk0pM4Y= github.com/flyteorg/flyteidl v0.21.8/go.mod h1:576W2ViEyjTpT+kEVHAGbrTP3HARNUZ/eCwrNPmdx9U= github.com/flyteorg/flyteplugins v0.7.0 h1:Cy7qqUhoXcLRYVW1b0wtk9+WtMGNFFXZ+O5weCzplvA= diff --git a/pkg/runtime/application_config_provider.go b/pkg/runtime/application_config_provider.go index 400a981009..aa1d919e53 100644 --- a/pkg/runtime/application_config_provider.go +++ b/pkg/runtime/application_config_provider.go @@ -5,11 +5,14 @@ import ( "io/ioutil" "os" "strings" + "time" "github.com/flyteorg/flyteadmin/pkg/common" "github.com/flyteorg/flyteadmin/pkg/runtime/interfaces" "github.com/flyteorg/flytestdlib/config" "github.com/flyteorg/flytestdlib/logger" + + "k8s.io/apimachinery/pkg/util/wait" ) const database = "database" @@ -56,6 +59,9 @@ var schedulerConfig = config.MustRegisterSection(scheduler, &interfaces.Schedule }, }, }, + PrecheckBackoff: wait.Backoff{ + Duration: time.Second, Factor: 2.0, Steps: 30, Jitter: 0.1, + }, }) var remoteDataConfig = config.MustRegisterSection(remoteData, &interfaces.RemoteDataConfig{ Scheme: common.None, diff --git a/pkg/runtime/interfaces/application_configuration.go b/pkg/runtime/interfaces/application_configuration.go index 926e3a106b..eda02a9a1e 100644 --- a/pkg/runtime/interfaces/application_configuration.go +++ b/pkg/runtime/interfaces/application_configuration.go @@ -3,6 +3,7 @@ package interfaces import ( "github.com/flyteorg/flytestdlib/config" "golang.org/x/time/rate" + "k8s.io/apimachinery/pkg/util/wait" ) // This configuration section is used to for initiating the database connection with the store that holds registered @@ -275,6 +276,8 @@ type SchedulerConfig struct { ReconnectAttempts int `json:"reconnectAttempts"` // Specifies the time interval to wait before attempting to reconnect the workflow executor client. ReconnectDelaySeconds int `json:"reconnectDelaySeconds"` + // Specifies the backoff settings when scheduler checks for the flyteadmin health during startup. + PrecheckBackoff wait.Backoff `json:"backoff"` } func (s *SchedulerConfig) GetEventSchedulerConfig() EventSchedulerConfig { @@ -293,6 +296,10 @@ func (s *SchedulerConfig) GetReconnectDelaySeconds() int { return s.ReconnectDelaySeconds } +func (s *SchedulerConfig) GetPrecheckBackoff() wait.Backoff { + return s.PrecheckBackoff +} + // Configuration specific to setting up signed urls. type SignedURL struct { // Whether signed urls should even be returned with GetExecutionData, GetNodeExecutionData and GetTaskExecutionData