Skip to content
This repository has been archived by the owner on Feb 9, 2024. It is now read-only.

Commit

Permalink
Check cluster status after upload (#1709)
Browse files Browse the repository at this point in the history
  • Loading branch information
r0mant authored Jun 24, 2020
1 parent 58f63c8 commit eba6a82
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 16 deletions.
38 changes: 27 additions & 11 deletions lib/status/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ import (
pb "github.com/gravitational/satellite/agent/proto/agentpb"
"github.com/gravitational/satellite/monitoring"
"github.com/gravitational/trace"
"github.com/sirupsen/logrus"
)

// FromCluster collects cluster status information.
Expand All @@ -60,30 +59,30 @@ func FromCluster(ctx context.Context, operator ops.Operator, cluster ops.Site, o

status.Agent, err = FromPlanetAgent(ctx, cluster.ClusterState.Servers)
if err != nil {
logrus.WithError(err).Warn("Failed to collect system status from agents.")
log.WithError(err).Warn("Failed to collect system status from agents.")
}

if err := updateClusterNodes(cluster.Key(), operator, status); err != nil {
logrus.WithError(err).Warn("Failed to query cluster nodes.")
log.WithError(err).Warn("Failed to query cluster nodes.")
}

token, err := operator.GetExpandToken(cluster.Key())
if err != nil && !trace.IsNotFound(err) {
logrus.WithError(err).Warn("Failed to fetch expand token.")
log.WithError(err).Warn("Failed to fetch expand token.")
}
if token != nil {
status.Token = *token
}

status.Cluster.ServerVersion, err = operator.GetVersion(ctx)
if err != nil {
logrus.WithError(err).Warn("Failed to query server version information.")
log.WithError(err).Warn("Failed to query server version information.")
}

// Collect application endpoints.
appEndpoints, err := operator.GetApplicationEndpoints(cluster.Key())
if err != nil {
logrus.WithError(err).Warn("Failed to fetch application endpoints.")
log.WithError(err).Warn("Failed to fetch application endpoints.")
status.Endpoints.Applications.Error = err
}
if len(appEndpoints) != 0 {
Expand All @@ -99,7 +98,7 @@ func FromCluster(ctx context.Context, operator ops.Operator, cluster ops.Site, o
// Fetch cluster endpoints.
clusterEndpoints, err := ops.GetClusterEndpoints(operator, cluster.Key())
if err != nil {
logrus.WithError(err).Warn("Failed to fetch cluster endpoints.")
log.WithError(err).Warn("Failed to fetch cluster endpoints.")
}
if clusterEndpoints != nil {
status.Endpoints.Cluster.AuthGateway = clusterEndpoints.AuthGateways()
Expand All @@ -108,15 +107,15 @@ func FromCluster(ctx context.Context, operator ops.Operator, cluster ops.Site, o

// FIXME: have status extension accept the operator/environment
if err := status.Cluster.Extension.Collect(); err != nil {
logrus.WithError(err).Warn("Failed to query extension metadata.")
log.WithError(err).Warn("Failed to query extension metadata.")
}

if err := collectActiveOperations(cluster.Key(), operator, status); err != nil {
logrus.WithError(err).Warn("Failed to query active operations.")
log.WithError(err).Warn("Failed to query active operations.")
}

if err := fetchOperationByID(cluster.Key(), operationID, operator, status); err != nil {
logrus.WithError(err).WithField("operation-id", operationID).Warn("Failed to query operation.")
log.WithError(err).WithField("operation-id", operationID).Warn("Failed to query operation.")
}

status.State = cluster.State
Expand Down Expand Up @@ -180,6 +179,23 @@ func (r Status) IsDegraded() bool {
r.Agent.GetSystemStatus() != pb.SystemStatus_Running)
}

// String formats the status as "Status(Cluster(...), Agent(...))".
// A nil Cluster or Agent is rendered as "Cluster(nil)" or "Agent(nil)";
// otherwise the pointed-to value is formatted with %v.
func (r Status) String() string {
	// Start from the nil placeholders and override them when data is present.
	clusterText := "Cluster(nil)"
	if r.Cluster != nil {
		clusterText = fmt.Sprintf("Cluster(%v)", *r.Cluster)
	}
	agentText := "Agent(nil)"
	if r.Agent != nil {
		agentText = fmt.Sprintf("Agent(%v)", *r.Agent)
	}
	return fmt.Sprintf("Status(%v, %v)", clusterText, agentText)
}

// Status describes the status of the cluster as a whole
type Status struct {
// Cluster describes the operational status of the cluster
Expand Down Expand Up @@ -638,7 +654,7 @@ func probeErrorDetail(p pb.Probe) string {
if err == nil {
return detail
}
logrus.WithError(err).Warn("Failed to compose disk space probe error.")
log.WithError(err).Warn("Failed to compose disk space probe error.")
}
detail := p.Detail
if p.Detail == "" {
Expand Down
1 change: 0 additions & 1 deletion lib/status/timeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
"github.com/fatih/color"
pb "github.com/gravitational/satellite/agent/proto/agentpb"
"github.com/gravitational/trace"
log "github.com/sirupsen/logrus"
)

// Timeline queries the currently stored cluster timeline.
Expand Down
51 changes: 51 additions & 0 deletions lib/status/wait.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
Copyright 2019 Gravitational, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package status

import (
"context"

"github.com/gravitational/gravity/lib/defaults"
"github.com/gravitational/gravity/lib/ops"
"github.com/gravitational/gravity/lib/utils"

"github.com/gravitational/trace"
"github.com/sirupsen/logrus"
)

// log is the package-level logger; every entry it emits carries the
// trace.Component field set to "status".
var log = logrus.WithField(trace.Component, "status")

// WaitCluster repeatedly checks the local cluster until it is reported as
// not degraded.
//
// Each attempt looks up the local cluster through the operator, collects its
// status with FromCluster and fails if that status is degraded. Attempts are
// driven by utils.RetryWithInterval with an exponential backoff built from
// defaults.NodeStatusTimeout; ctx is handed to both FromCluster and the
// retry helper. The error returned by the retry helper is returned as is.
func WaitCluster(ctx context.Context, operator ops.Operator) error {
	// checkHealth is a single health probe; a non-nil error requests a retry.
	checkHealth := func() error {
		site, err := operator.GetLocalSite()
		if err != nil {
			return trace.Wrap(err)
		}
		clusterStatus, err := FromCluster(ctx, operator, *site, "")
		if err != nil {
			return trace.Wrap(err)
		}
		if clusterStatus.IsDegraded() {
			return trace.BadParameter("cluster is not healthy: %s", clusterStatus)
		}
		log.Info("Cluster is healthy.")
		return nil
	}
	backoff := utils.NewExponentialBackOff(defaults.NodeStatusTimeout)
	return utils.RetryWithInterval(ctx, backoff, checkHealth)
}
25 changes: 22 additions & 3 deletions tool/gravity/cli/ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/gravitational/gravity/lib/pack"
"github.com/gravitational/gravity/lib/pack/encryptedpack"
"github.com/gravitational/gravity/lib/state"
libstatus "github.com/gravitational/gravity/lib/status"
"github.com/gravitational/gravity/lib/storage"
"github.com/gravitational/gravity/lib/users"
"github.com/gravitational/gravity/lib/utils"
Expand Down Expand Up @@ -86,7 +87,7 @@ func appPackage(env *localenv.LocalEnvironment) error {
return nil
}

func uploadUpdate(env *localenv.LocalEnvironment, opsURL string) error {
func uploadUpdate(ctx context.Context, env *localenv.LocalEnvironment, opsURL string) error {
// create local environment with gravity state dir because the environment
// provided above has upgrade tarball as a state dir
localStateDir, err := localenv.LocalGravityDir()
Expand Down Expand Up @@ -170,7 +171,7 @@ func uploadUpdate(env *localenv.LocalEnvironment, opsURL string) error {

var registries []string
err = utils.Retry(defaults.RetryInterval, defaults.RetryLessAttempts, func() error {
registries, err = getRegistries(context.TODO(), defaultEnv, cluster.ClusterState.Servers)
registries, err = getRegistries(ctx, defaultEnv, cluster.ClusterState.Servers)
return trace.Wrap(err)
})
if err != nil {
Expand All @@ -196,7 +197,7 @@ func uploadUpdate(env *localenv.LocalEnvironment, opsURL string) error {
if err != nil {
return trace.Wrap(err)
}
err = appservice.SyncApp(context.TODO(), appservice.SyncRequest{
err = appservice.SyncApp(ctx, appservice.SyncRequest{
PackService: tarballPackages,
AppService: tarballApps,
ImageService: imageService,
Expand All @@ -207,6 +208,24 @@ func uploadUpdate(env *localenv.LocalEnvironment, opsURL string) error {
}
}

// Uploading new blobs to the cluster is known to cause stress on disk
// which can lead to the cluster's health checker experiencing momentary
// blips and potentially moving the cluster to degraded state, especially
// when running on hardware with sub-par I/O performance.
//
// To accommodate this behavior and make sure upgrade (which normally
// follows upload right away) does not fail to launch due to the degraded
// state, give the cluster a few minutes to settle.
//
// See https://github.com/gravitational/gravity/issues/1659 for more info.
env.PrintStep("Verifying cluster health")
ctx, cancel := context.WithTimeout(ctx, defaults.NodeStatusTimeout)
defer cancel()
err = libstatus.WaitCluster(ctx, clusterOperator)
if err != nil {
return trace.Wrap(err)
}

env.PrintStep("Application has been uploaded")
return nil
}
Expand Down
4 changes: 3 additions & 1 deletion tool/gravity/cli/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ func InitAndCheck(g *Application, cmd string) error {
g.UpdateTriggerCmd.FullCommand(),
g.UpdatePlanInitCmd.FullCommand(),
g.UpgradeCmd.FullCommand(),
g.UpdateUploadCmd.FullCommand(),
g.RPCAgentRunCmd.FullCommand(),
g.LeaveCmd.FullCommand(),
g.RemoveCmd.FullCommand(),
Expand Down Expand Up @@ -457,7 +458,8 @@ func Execute(g *Application, cmd string, extraArgs []string) (err error) {
case g.StatusHistoryCmd.FullCommand():
return statusHistory()
case g.UpdateUploadCmd.FullCommand():
return uploadUpdate(localEnv, *g.UpdateUploadCmd.OpsCenterURL)
return uploadUpdate(context.Background(), localEnv,
*g.UpdateUploadCmd.OpsCenterURL)
case g.AppPackageCmd.FullCommand():
return appPackage(localEnv)
// app commands
Expand Down

0 comments on commit eba6a82

Please sign in to comment.