This repository has been archived by the owner on Feb 9, 2024. It is now read-only.

(7.0) Better errors for teleport nodes pre-check #1903

Merged 1 commit on Jul 23, 2020
9 changes: 9 additions & 0 deletions lib/ops/opsservice/versions.go
@@ -254,3 +254,12 @@ const (
This cluster image does not contain required intermediate runtimes.
Please rebuild it as described in https://gravitational.com/gravity/docs/cluster/#direct-upgrades-from-older-lts-versions.`
)

var (
// TeleportBrokenJoinTokenVersion is the version of the release affected by
// the issue with Teleport using an incorrect auth token on joined nodes.
//
// GitHub issue: https://github.com/gravitational/gravity/issues/1445.
// KB: https://community.gravitational.com/t/recover-teleport-nodes-failing-to-join-due-to-bad-token/649.
TeleportBrokenJoinTokenVersion = semver.New("5.5.40")
Contributor


This will also make sense after intermediate upgrades have been ported.

Contributor Author


Yes, which is fine since we're working on it.

)
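
For context on how this sentinel is consumed: the pre-check added to rpcagent.go below compares a cluster's base runtime version against it to decide which error message to print. The following is a minimal, self-contained sketch of that comparison, not code from this PR; the helper name and the example versions are invented, and the semver import assumes the coreos/go-semver package that Gravity vendors.

```go
package main

import (
	"fmt"

	"github.com/coreos/go-semver/semver" // assumed import path for the semver package used above
)

// teleportBrokenJoinTokenVersion mirrors the sentinel added in versions.go.
var teleportBrokenJoinTokenVersion = semver.New("5.5.40")

// affectedByBrokenJoinToken is a hypothetical helper: like the pre-check in
// rpcagent.go, it does an exact match of the base runtime version against the
// affected release.
func affectedByBrokenJoinToken(baseVersion string) bool {
	return baseVersion == teleportBrokenJoinTokenVersion.String()
}

func main() {
	fmt.Println(affectedByBrokenJoinToken("5.5.40")) // true: show the join-token KB message
	fmt.Println(affectedByBrokenJoinToken("6.1.20")) // false: show the generic Teleport message
}
```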
2 changes: 1 addition & 1 deletion tool/gravity/cli/gc.go
@@ -171,7 +171,7 @@ func newCollector(env *localenv.LocalEnvironment) (*vacuum.Collector, error) {
ctx := context.TODO()
req := deployAgentsRequest{
clusterState: cluster.ClusterState,
clusterName: cluster.Domain,
cluster: *cluster,
clusterEnv: clusterEnv,
proxy: proxy,
}
83 changes: 65 additions & 18 deletions tool/gravity/cli/rpcagent.go
@@ -30,10 +30,13 @@ import (
"github.com/gravitational/gravity/lib/fsm"
"github.com/gravitational/gravity/lib/loc"
"github.com/gravitational/gravity/lib/localenv"
"github.com/gravitational/gravity/lib/ops"
"github.com/gravitational/gravity/lib/ops/opsservice"
"github.com/gravitational/gravity/lib/pack"
"github.com/gravitational/gravity/lib/rpc"
pb "github.com/gravitational/gravity/lib/rpc/proto"
rpcserver "github.com/gravitational/gravity/lib/rpc/server"
"github.com/gravitational/gravity/lib/schema"
"github.com/gravitational/gravity/lib/storage"
"github.com/gravitational/gravity/lib/update"
clusterupdate "github.com/gravitational/gravity/lib/update/cluster"
@@ -196,7 +199,7 @@ func rpcAgentDeployHelper(ctx context.Context, localEnv *localenv.LocalEnvironme

req := deployAgentsRequest{
clusterState: cluster.ClusterState,
clusterName: cluster.Domain,
cluster: *cluster,
clusterEnv: clusterEnv,
proxy: proxy,
leaderParams: leaderParams,
Expand All @@ -216,21 +219,18 @@ func rpcAgentDeployHelper(ctx context.Context, localEnv *localenv.LocalEnvironme
return deployAgents(localCtx, localEnv, req)
}

func verifyCluster(ctx context.Context,
clusterState storage.ClusterState,
proxy *teleclient.ProxyClient,
) (servers []rpc.DeployServer, err error) {
func verifyCluster(ctx context.Context, req deployAgentsRequest) (servers []rpc.DeployServer, err error) {
var missing []string
servers = make([]rpc.DeployServer, 0, len(servers))

for _, server := range clusterState.Servers {
for _, server := range req.clusterState.Servers {
deployServer := rpc.NewDeployServer(server)
// do a quick check to make sure we can connect to the teleport node
client, err := proxy.ConnectToNode(ctx, deployServer.NodeAddr,
client, err := req.proxy.ConnectToNode(ctx, deployServer.NodeAddr,
defaults.SSHUser, false)
if err != nil {
log.Errorf("Failed to connect to teleport on node %v: %v.",
deployServer, trace.DebugReport(err))
log.WithError(err).Errorf("Failed to connect to teleport on node %v.",
deployServer)
missing = append(missing, server.Hostname)
} else {
client.Close()
Expand All @@ -240,16 +240,63 @@ func verifyCluster(ctx context.Context,
}
}
if len(missing) != 0 {
return nil, trace.NotFound(
"Teleport is unavailable "+
"on the following cluster nodes: %s. Please "+
"make sure that the Teleport service is running "+
"and try again.", strings.Join(missing, ", "))
base := req.cluster.App.Manifest.Base()
if base != nil && base.Version == opsservice.TeleportBrokenJoinTokenVersion.String() {
return nil, trace.NotFound(teleportTokenMessage,
strings.Join(missing, ", "), base.Version)
}
return nil, trace.NotFound(teleportUnavailableMessage,
strings.Join(missing, ", "), getTeleportVersion(req.cluster.App.Manifest))
}

return servers, nil
}

func getTeleportVersion(manifest schema.Manifest) string {
teleportPackage, err := manifest.Dependencies.ByName(constants.TeleportPackage)
if err == nil {
return teleportPackage.Version
}
return "<version>"
}

const (
// teleportTokenMessage is displayed when some Teleport nodes are
// unavailable during agent deployment due to the issue with an incorrect
// Teleport join token.
teleportTokenMessage = `Teleport is unavailable on the following cluster nodes: %[1]s.

Gravity version %[2]v, which you're currently running, has a known issue with
Teleport using an incorrect auth token on joined nodes, which prevents new
Teleport nodes from joining.

This cluster may be affected by this issue if new nodes were joined to it after
the upgrade to %[2]v. See the following KB article for remediation actions:

https://community.gravitational.com/t/recover-teleport-nodes-failing-to-join-due-to-bad-token/649

After fixing the issue, run "./gravity status" to confirm the status of
Teleport on each node using the "remote access" field.

Once all Teleport nodes have joined successfully, launch the upgrade again.
`
// teleportUnavailableMessage is displayed when some Teleport nodes are
// unavailable during agent deployment.
teleportUnavailableMessage = `Teleport is unavailable on the following cluster nodes: %[1]s.

Please check the status and logs of the Teleport systemd service on the
specified nodes and make sure it's running:

systemctl status gravity__gravitational.io__teleport__%[2]v
journalctl -u gravity__gravitational.io__teleport__%[2]v --no-pager

After fixing the issue, run "./gravity status" to confirm the status of
Teleport on each node using the "remote access" field.

Once all Teleport nodes are running, launch the upgrade again.
`
)
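
For clarity on how these templates render: trace.NotFound formats its message the same way fmt.Sprintf does, with %[1]s receiving the comma-separated list of unreachable nodes and %[2]v the relevant version (the base runtime version for the first message, the Teleport package version for the second). Below is a minimal sketch using an abbreviated copy of teleportUnavailableMessage; the node names and the Teleport version are invented for the example.

```go
package main

import (
	"fmt"
	"strings"
)

// Abbreviated copy of teleportUnavailableMessage: %[1]s is the list of
// unreachable nodes, %[2]v the Teleport version from the cluster manifest.
const unavailableTemplate = `Teleport is unavailable on the following cluster nodes: %[1]s.

systemctl status gravity__gravitational.io__teleport__%[2]v
journalctl -u gravity__gravitational.io__teleport__%[2]v --no-pager
`

func main() {
	missing := []string{"node-1", "node-3"}
	// Mirrors what the operator would see; "3.2.14" is a made-up version here.
	fmt.Printf(unavailableTemplate, strings.Join(missing, ", "), "3.2.14")
}
```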

func upsertRPCCredentialsPackage(
servers []rpc.DeployServer,
packages pack.PackageService,
@@ -293,18 +340,18 @@ func deployAgents(ctx context.Context, env *localenv.LocalEnvironment, req deplo

// newDeployAgentsRequest creates a new request to deploy agents on the local cluster
func newDeployAgentsRequest(ctx context.Context, env *localenv.LocalEnvironment, req deployAgentsRequest) (*rpc.DeployAgentsRequest, error) {
servers, err := verifyCluster(ctx, req.clusterState, req.proxy)
servers, err := verifyCluster(ctx, req)
if err != nil {
return nil, trace.Wrap(err)
}

gravityPackage := getGravityPackage()
secretsPackageTemplate := loc.Locator{
Repository: req.clusterName,
Repository: req.cluster.Domain,
Version: gravityPackage.Version,
}
secretsPackage, err := upsertRPCCredentialsPackage(
servers, req.clusterEnv.ClusterPackages, req.clusterName, secretsPackageTemplate)
servers, req.clusterEnv.ClusterPackages, req.cluster.Domain, secretsPackageTemplate)
if err != nil {
return nil, trace.Wrap(err)
}
@@ -398,7 +445,7 @@ func getGravityPackage() loc.Locator {
type deployAgentsRequest struct {
clusterEnv *localenv.ClusterEnvironment
clusterState storage.ClusterState
clusterName string
cluster ops.Site
proxy *teleclient.ProxyClient
leaderParams string
leader *storage.Server
2 changes: 1 addition & 1 deletion tool/gravity/cli/update.go
@@ -116,7 +116,7 @@ func newUpdater(ctx context.Context, localEnv, updateEnv *localenv.LocalEnvironm
// Use server list from the operation plan to always have a consistent
// view of the cluster (i.e. with servers correctly reflecting cluster roles)
clusterState: clusterStateFromPlan(*plan),
clusterName: cluster.Domain,
cluster: *cluster,
clusterEnv: clusterEnv,
proxy: proxy,
leader: leader,