Skip to content
This repository has been archived by the owner on Feb 9, 2024. It is now read-only.

(6.1) Check cluster status after upload #1709

Merged
merged 2 commits into from
Jun 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 27 additions & 11 deletions lib/status/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ import (
pb "github.com/gravitational/satellite/agent/proto/agentpb"
"github.com/gravitational/satellite/monitoring"
"github.com/gravitational/trace"
"github.com/sirupsen/logrus"
)

// FromCluster collects cluster status information.
Expand All @@ -60,30 +59,30 @@ func FromCluster(ctx context.Context, operator ops.Operator, cluster ops.Site, o

status.Agent, err = FromPlanetAgent(ctx, cluster.ClusterState.Servers)
if err != nil {
logrus.WithError(err).Warn("Failed to collect system status from agents.")
log.WithError(err).Warn("Failed to collect system status from agents.")
}

if err := updateClusterNodes(cluster.Key(), operator, status); err != nil {
logrus.WithError(err).Warn("Failed to query cluster nodes.")
log.WithError(err).Warn("Failed to query cluster nodes.")
}

token, err := operator.GetExpandToken(cluster.Key())
if err != nil && !trace.IsNotFound(err) {
logrus.WithError(err).Warn("Failed to fetch expand token.")
log.WithError(err).Warn("Failed to fetch expand token.")
}
if token != nil {
status.Token = *token
}

status.Cluster.ServerVersion, err = operator.GetVersion(ctx)
if err != nil {
logrus.WithError(err).Warn("Failed to query server version information.")
log.WithError(err).Warn("Failed to query server version information.")
}

// Collect application endpoints.
appEndpoints, err := operator.GetApplicationEndpoints(cluster.Key())
if err != nil {
logrus.WithError(err).Warn("Failed to fetch application endpoints.")
log.WithError(err).Warn("Failed to fetch application endpoints.")
status.Endpoints.Applications.Error = err
}
if len(appEndpoints) != 0 {
Expand All @@ -99,7 +98,7 @@ func FromCluster(ctx context.Context, operator ops.Operator, cluster ops.Site, o
// Fetch cluster endpoints.
clusterEndpoints, err := ops.GetClusterEndpoints(operator, cluster.Key())
if err != nil {
logrus.WithError(err).Warn("Failed to fetch cluster endpoints.")
log.WithError(err).Warn("Failed to fetch cluster endpoints.")
}
if clusterEndpoints != nil {
status.Endpoints.Cluster.AuthGateway = clusterEndpoints.AuthGateways()
Expand All @@ -108,15 +107,15 @@ func FromCluster(ctx context.Context, operator ops.Operator, cluster ops.Site, o

// FIXME: have status extension accept the operator/environment
if err := status.Cluster.Extension.Collect(); err != nil {
logrus.WithError(err).Warn("Failed to query extension metadata.")
log.WithError(err).Warn("Failed to query extension metadata.")
}

if err := collectActiveOperations(cluster.Key(), operator, status); err != nil {
logrus.WithError(err).Warn("Failed to query active operations.")
log.WithError(err).Warn("Failed to query active operations.")
}

if err := fetchOperationByID(cluster.Key(), operationID, operator, status); err != nil {
logrus.WithError(err).WithField("operation-id", operationID).Warn("Failed to query operation.")
log.WithError(err).WithField("operation-id", operationID).Warn("Failed to query operation.")
}

status.State = cluster.State
Expand Down Expand Up @@ -180,6 +179,23 @@ func (r Status) IsDegraded() bool {
r.Agent.GetSystemStatus() != pb.SystemStatus_Running)
}

// String renders the status in the form "Status(Cluster(...), Agent(...))".
// Each part is formatted from the dereferenced value with %v; a part whose
// pointer is nil is rendered as "Cluster(nil)" or "Agent(nil)" respectively.
func (r Status) String() string {
	// Start from the nil placeholders and override whichever part is set.
	clusterText, agentText := "Cluster(nil)", "Agent(nil)"
	if r.Cluster != nil {
		clusterText = fmt.Sprintf("Cluster(%v)", *r.Cluster)
	}
	if r.Agent != nil {
		agentText = fmt.Sprintf("Agent(%v)", *r.Agent)
	}
	return "Status(" + clusterText + ", " + agentText + ")"
}

// Status describes the status of the cluster as a whole
type Status struct {
// Cluster describes the operational status of the cluster
Expand Down Expand Up @@ -638,7 +654,7 @@ func probeErrorDetail(p pb.Probe) string {
if err == nil {
return detail
}
logrus.WithError(err).Warn("Failed to compose disk space probe error.")
log.WithError(err).Warn("Failed to compose disk space probe error.")
}
detail := p.Detail
if p.Detail == "" {
Expand Down
1 change: 0 additions & 1 deletion lib/status/timeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
"github.com/fatih/color"
pb "github.com/gravitational/satellite/agent/proto/agentpb"
"github.com/gravitational/trace"
log "github.com/sirupsen/logrus"
)

// Timeline queries the currently stored cluster timeline.
Expand Down
51 changes: 51 additions & 0 deletions lib/status/wait.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
Copyright 2019 Gravitational, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package status

import (
"context"

"github.com/gravitational/gravity/lib/defaults"
"github.com/gravitational/gravity/lib/ops"
"github.com/gravitational/gravity/lib/utils"

"github.com/gravitational/trace"
"github.com/sirupsen/logrus"
)

// log is the package-level logger; every entry it emits carries the
// trace.Component field set to "status".
var log = logrus.WithField(trace.Component, "status")
r0mant marked this conversation as resolved.
Show resolved Hide resolved

// WaitCluster repeatedly probes the local cluster until it is reported as
// healthy, or until the retry helper gives up (presumably on context expiry
// or backoff exhaustion; see utils.RetryWithInterval).
//
// Each probe looks up the local cluster through the operator, collects its
// status with FromCluster and treats a degraded status as a failed attempt.
// Probes are driven by utils.RetryWithInterval using an exponential backoff
// created from defaults.NodeStatusTimeout.
func WaitCluster(ctx context.Context, operator ops.Operator) error {
	// probe performs a single health check; a nil result ends the retry loop.
	probe := func() error {
		site, err := operator.GetLocalSite()
		if err != nil {
			return trace.Wrap(err)
		}
		clusterStatus, err := FromCluster(ctx, operator, *site, "")
		if err != nil {
			return trace.Wrap(err)
		}
		if !clusterStatus.IsDegraded() {
			log.Info("Cluster is healthy.")
			return nil
		}
		return trace.BadParameter("cluster is not healthy: %s", clusterStatus)
	}
	backoff := utils.NewExponentialBackOff(defaults.NodeStatusTimeout)
	return utils.RetryWithInterval(ctx, backoff, probe)
}
25 changes: 22 additions & 3 deletions tool/gravity/cli/ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/gravitational/gravity/lib/pack"
"github.com/gravitational/gravity/lib/pack/encryptedpack"
"github.com/gravitational/gravity/lib/state"
libstatus "github.com/gravitational/gravity/lib/status"
"github.com/gravitational/gravity/lib/storage"
"github.com/gravitational/gravity/lib/users"
"github.com/gravitational/gravity/lib/utils"
Expand Down Expand Up @@ -86,7 +87,7 @@ func appPackage(env *localenv.LocalEnvironment) error {
return nil
}

func uploadUpdate(env *localenv.LocalEnvironment, opsURL string) error {
func uploadUpdate(ctx context.Context, env *localenv.LocalEnvironment, opsURL string) error {
// create local environment with gravity state dir because the environment
// provided above has upgrade tarball as a state dir
localStateDir, err := localenv.LocalGravityDir()
Expand Down Expand Up @@ -170,7 +171,7 @@ func uploadUpdate(env *localenv.LocalEnvironment, opsURL string) error {

var registries []string
err = utils.Retry(defaults.RetryInterval, defaults.RetryLessAttempts, func() error {
registries, err = getRegistries(context.TODO(), defaultEnv, cluster.ClusterState.Servers)
registries, err = getRegistries(ctx, defaultEnv, cluster.ClusterState.Servers)
return trace.Wrap(err)
})
if err != nil {
Expand All @@ -196,7 +197,7 @@ func uploadUpdate(env *localenv.LocalEnvironment, opsURL string) error {
if err != nil {
return trace.Wrap(err)
}
err = appservice.SyncApp(context.TODO(), appservice.SyncRequest{
err = appservice.SyncApp(ctx, appservice.SyncRequest{
PackService: tarballPackages,
AppService: tarballApps,
ImageService: imageService,
Expand All @@ -207,6 +208,24 @@ func uploadUpdate(env *localenv.LocalEnvironment, opsURL string) error {
}
}

// Uploading new blobs to the cluster is known to cause stress on disk
// which can lead to the cluster's health checker experiencing momentary
// blips and potentially moving the cluster to a degraded state, especially
// when running on hardware with sub-par I/O performance.
//
// To accommodate this behavior and make sure upgrade (which normally
// follows upload right away) does not fail to launch due to the degraded
// state, give the cluster a few minutes to settle.
//
// See https://github.com/gravitational/gravity/issues/1659 for more info.
env.PrintStep("Verifying cluster health")
ctx, cancel := context.WithTimeout(ctx, defaults.NodeStatusTimeout)
defer cancel()
err = libstatus.WaitCluster(ctx, clusterOperator)
if err != nil {
return trace.Wrap(err)
}

env.PrintStep("Application has been uploaded")
return nil
}
Expand Down
4 changes: 3 additions & 1 deletion tool/gravity/cli/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ func InitAndCheck(g *Application, cmd string) error {
g.UpdateTriggerCmd.FullCommand(),
g.UpdatePlanInitCmd.FullCommand(),
g.UpgradeCmd.FullCommand(),
g.UpdateUploadCmd.FullCommand(),
g.RPCAgentRunCmd.FullCommand(),
g.LeaveCmd.FullCommand(),
g.RemoveCmd.FullCommand(),
Expand Down Expand Up @@ -457,7 +458,8 @@ func Execute(g *Application, cmd string, extraArgs []string) (err error) {
case g.StatusHistoryCmd.FullCommand():
return statusHistory()
case g.UpdateUploadCmd.FullCommand():
return uploadUpdate(localEnv, *g.UpdateUploadCmd.OpsCenterURL)
return uploadUpdate(context.Background(), localEnv,
*g.UpdateUploadCmd.OpsCenterURL)
case g.AppPackageCmd.FullCommand():
return appPackage(localEnv)
// app commands
Expand Down