Skip to content

Commit

Permalink
client: recover from getter panics (#14696) (#14705)
Browse files Browse the repository at this point in the history
The artifact getter uses the go-getter library to fetch files from
different sources. Any bug in this library that results in a panic can
cause the entire Nomad client to crash due to a single file download
attempt.

This change aims to guard against these types of crashes by recovering
from panics when the getter attempts to download an artifact. The
resulting panic is converted to an error that is stored as a task event
for operator visibility and the panic stack trace is logged to the
client's log.
  • Loading branch information
hc-github-team-nomad-core authored Sep 26, 2022
1 parent fe1dec5 commit bdb3409
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 5 deletions.
3 changes: 3 additions & 0 deletions .changelog/14696.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:security
client: recover from panics caused by artifact download to prevent the Nomad client from crashing
```
21 changes: 19 additions & 2 deletions client/allocrunner/taskrunner/getter/getter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ import (
"fmt"
"net/http"
"net/url"
"runtime/debug"
"strings"

"github.com/hashicorp/go-cleanhttp"
gg "github.com/hashicorp/go-getter"
"github.com/hashicorp/go-hclog"

"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/interfaces"
Expand All @@ -22,6 +24,8 @@ const (

// Getter wraps go-getter calls in an artifact configuration.
type Getter struct {
logger hclog.Logger

// httpClient is a shared HTTP client for use across all http/https
// Getter instantiations. The HTTP client is designed to be
// thread-safe, and using a pooled transport will help reduce excessive
Expand All @@ -32,8 +36,9 @@ type Getter struct {

// NewGetter returns a new Getter instance. This function is called once per
// client and shared across alloc and task runners.
func NewGetter(config *config.ArtifactConfig) *Getter {
func NewGetter(logger hclog.Logger, config *config.ArtifactConfig) *Getter {
return &Getter{
logger: logger,
httpClient: &http.Client{
Transport: cleanhttp.DefaultPooledTransport(),
},
Expand All @@ -42,7 +47,19 @@ func NewGetter(config *config.ArtifactConfig) *Getter {
}

// GetArtifact downloads an artifact into the specified task directory.
func (g *Getter) GetArtifact(taskEnv interfaces.EnvReplacer, artifact *structs.TaskArtifact) error {
func (g *Getter) GetArtifact(taskEnv interfaces.EnvReplacer, artifact *structs.TaskArtifact) (returnErr error) {
// Recover from panics to avoid crashing the entire Nomad client due to
// artifact download failures, such as bugs in go-getter.
defer func() {
if r := recover(); r != nil {
g.logger.Error("panic while downloading artifact",
"artifact", artifact.GetterSource,
"error", r,
"stack", string(debug.Stack()))
returnErr = fmt.Errorf("getter panic: %v", r)
}
}()

ggURL, err := getGetterUrl(taskEnv, artifact)
if err != nil {
return newGetError(artifact.GetterSource, err, false)
Expand Down
25 changes: 24 additions & 1 deletion client/allocrunner/taskrunner/getter/getter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"time"

gg "github.com/hashicorp/go-getter"
"github.com/hashicorp/go-hclog"
clientconfig "github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/interfaces"
"github.com/hashicorp/nomad/client/taskenv"
Expand Down Expand Up @@ -56,6 +57,19 @@ func noopTaskEnv(taskDir string) interfaces.EnvReplacer {
}
}

// panicReplacer is a version of taskenv.TaskEnv.ReplaceEnv that panics.
// Every method on it panics unconditionally; panicTaskEnv hands it out as an
// interfaces.EnvReplacer so tests can trigger a panic from code that calls
// the replacer.
type panicReplacer struct{}

// ReplaceEnv ignores its argument and always panics with the value "panic".
func (panicReplacer) ReplaceEnv(_ string) string {
panic("panic")
}
// ClientPath ignores its arguments and always panics with the value "panic".
func (panicReplacer) ClientPath(_ string, _ bool) (string, bool) {
panic("panic")
}
// panicTaskEnv returns a panicReplacer as an interfaces.EnvReplacer.
func panicTaskEnv() interfaces.EnvReplacer {
return panicReplacer{}
}

// upperReplacer is a version of taskenv.TaskEnv.ReplaceEnv that upper-cases
// the given input.
type upperReplacer struct {
Expand All @@ -76,7 +90,7 @@ func removeAllT(t *testing.T, path string) {
}

func TestGetter_getClient(t *testing.T) {
getter := NewGetter(&clientconfig.ArtifactConfig{
getter := NewGetter(hclog.NewNullLogger(), &clientconfig.ArtifactConfig{
HTTPReadTimeout: time.Minute,
HTTPMaxBytes: 100_000,
GCSTimeout: 1 * time.Minute,
Expand Down Expand Up @@ -464,6 +478,15 @@ func TestGetArtifact_Setuid(t *testing.T) {
}
}

// TestGetArtifact_handlePanic tests that a panic during the getter execution
// does not cause its goroutine to crash. It calls GetArtifact with a task
// environment whose methods panic and expects the call to return an error
// whose message contains "panic" instead of propagating the panic.
func TestGetArtifact_handlePanic(t *testing.T) {
getter := TestDefaultGetter(t)
// The replacer returned by panicTaskEnv panics when any of its methods is
// called; the artifact passed in is an empty TaskArtifact.
err := getter.GetArtifact(panicTaskEnv(), &structs.TaskArtifact{})
require.Error(t, err)
require.Contains(t, err.Error(), "panic")
}

func TestGetGetterUrl_Queries(t *testing.T) {
cases := []struct {
name string
Expand Down
3 changes: 2 additions & 1 deletion client/allocrunner/taskrunner/getter/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package getter
import (
"testing"

"github.com/hashicorp/go-hclog"
clientconfig "github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/stretchr/testify/require"
Expand All @@ -14,5 +15,5 @@ import (
func TestDefaultGetter(t *testing.T) *Getter {
getterConf, err := clientconfig.ArtifactConfigFromAgent(config.DefaultArtifactConfig())
require.NoError(t, err)
return NewGetter(getterConf)
return NewGetter(hclog.NewNullLogger(), getterConf)
}
2 changes: 1 addition & 1 deletion client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie
serversContactedCh: make(chan struct{}),
serversContactedOnce: sync.Once{},
cpusetManager: cgutil.NewCpusetManager(cfg.CgroupParent, logger.Named("cpuset_manager")),
getter: getter.NewGetter(cfg.Artifact),
getter: getter.NewGetter(logger.Named("artifact_getter"), cfg.Artifact),
EnterpriseClient: newEnterpriseClient(logger),
}

Expand Down

0 comments on commit bdb3409

Please sign in to comment.