Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VAULT-31409: trace postUnseal function #28895

Merged
merged 20 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog/28895.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
core: Added new `enable_post_unseal_trace` and `post_unseal_trace_directory` config options to generate Go traces during the post-unseal step for debug purposes.
```
16 changes: 16 additions & 0 deletions command/server/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ type Config struct {
License string `hcl:"-"`
LicensePath string `hcl:"license_path"`
DisableSSCTokens bool `hcl:"-"`

EnablePostUnsealTrace bool `hcl:"enable_post_unseal_trace"`
PostUnsealTraceDir string `hcl:"post_unseal_trace_directory"`
}

const (
Expand Down Expand Up @@ -425,6 +428,16 @@ func (c *Config) Merge(c2 *Config) *Config {
result.LicensePath = c2.LicensePath
}

result.EnablePostUnsealTrace = c.EnablePostUnsealTrace
if c2.EnablePostUnsealTrace {
result.EnablePostUnsealTrace = c2.EnablePostUnsealTrace
}

result.PostUnsealTraceDir = c.PostUnsealTraceDir
if c2.PostUnsealTraceDir != "" {
result.PostUnsealTraceDir = c2.PostUnsealTraceDir
}

// Use values from top-level configuration for storage if set
if storage := result.Storage; storage != nil {
if result.APIAddr != "" {
Expand Down Expand Up @@ -1150,6 +1163,9 @@ func (c *Config) Sanitized() map[string]interface{} {
"detect_deadlocks": c.DetectDeadlocks,

"imprecise_lease_role_tracking": c.ImpreciseLeaseRoleTracking,

"enable_post_unseal_trace": c.EnablePostUnsealTrace,
"post_unseal_trace_directory": c.PostUnsealTraceDir,
}
for k, v := range sharedResult {
result[k] = v
Expand Down
63 changes: 63 additions & 0 deletions helper/trace/debug_trace.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package trace
bosouza marked this conversation as resolved.
Show resolved Hide resolved

import (
"fmt"
"os"
"path/filepath"
"runtime/trace"
"time"
)

func StartDebugTrace(dir string, filePrefix string) (file string, stop func() error, err error) {
dirMustExist := true
if dir == "" {
dirMustExist = false // if a dir is provided it must exist, otherwise we'll create a default one
dir = filepath.Join(os.TempDir(), "vault-traces")
}

d, err := os.Stat(dir)
if err != nil && !os.IsNotExist(err) {
return "", nil, fmt.Errorf("failed to stat trace directory %q: %s", dir, err)
}

if os.IsNotExist(err) && dirMustExist {
return "", nil, fmt.Errorf("trace directory %q does not exist", dir)
}

if !os.IsNotExist(err) && !d.IsDir() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need the !os.IsNotExist(err) portion of this condition?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, I think we need it. If os.IsNotExist(err) evaluates to true then we don't want to evaluate d.IsDir() since d would be nil.

The alternative would be to use a nested if-else structure that interleaves error returns with additional operations (creation of default dir), which I don't like as much:

	d, err := os.Stat(dir)
	if err != nil {
		if os.IsNotExist(err) {
			if dirMustExist {
				return "", nil, fmt.Errorf("trace directory %q does not exist", dir)
			} else {
				if err := os.Mkdir(dir, 0o700); err != nil {
					return "", nil, fmt.Errorf("failed to create trace directory %q: %s", dir, err)
				}
			}
		} else {
			return "", nil, fmt.Errorf("failed to stat trace directory %q: %s", dir, err)
		}
	} else if !d.IsDir() {
		return "", nil, fmt.Errorf("trace directory %q is not a directory", dir)
	}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense, it's good as is!

return "", nil, fmt.Errorf("trace directory %q is not a directory", dir)
}

if os.IsNotExist(err) {
if err := os.Mkdir(dir, 0o700); err != nil {
return "", nil, fmt.Errorf("failed to create trace directory %q: %s", dir, err)
}
}

// would prefer a more human readable time reference in the file name but the column
// character can cause problems in filenames
fileName := fmt.Sprintf("%s-%d.trace", filePrefix, time.Now().Unix())
traceFile, err := filepath.Abs(filepath.Join(dir, fileName))
if err != nil {
return "", nil, fmt.Errorf("failed to get absolute path for trace file: %s", err)
}
f, err := os.Create(traceFile)
if err != nil {
return "", nil, fmt.Errorf("failed to create trace file %q: %s", traceFile, err)
}

if err := trace.Start(f); err != nil {
f.Close()
return "", nil, fmt.Errorf("failed to start trace: %s", err)
bosouza marked this conversation as resolved.
Show resolved Hide resolved
}

stop = func() error {
trace.Stop()
return f.Close()
}

return f.Name(), stop, nil
}
111 changes: 111 additions & 0 deletions helper/trace/debug_trace_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package trace

import (
"os"
"path/filepath"
"runtime"
"testing"

"github.com/stretchr/testify/require"
)

// TestStartDebugTrace tests the debug trace functionality creating real
// files and traces.
func TestStartDebugTrace(t *testing.T) {
t.Run("error_on_non_existent_dir", func(t *testing.T) {
_, _, err := StartDebugTrace("non-existent-dir", "filePrefix")
require.Error(t, err)
require.Contains(t, err.Error(), "does not exist")
})

t.Run("error_on_non_dir", func(t *testing.T) {
f, err := os.CreateTemp("", "")
require.NoError(t, err)
require.NoError(t, f.Close())
_, _, err = StartDebugTrace(f.Name(), "")
require.Error(t, err)
require.Contains(t, err.Error(), "is not a directory")
})

t.Run("error_on_failed_to_create_trace_file", func(t *testing.T) {
noWriteFolder := filepath.Join(os.TempDir(), "no-write-permissions")
bosouza marked this conversation as resolved.
Show resolved Hide resolved
// create folder without write permission
err := os.Mkdir(noWriteFolder, 0o000)
t.Cleanup(func() {
os.RemoveAll(noWriteFolder)
})
require.NoError(t, err)
_, _, err = StartDebugTrace(noWriteFolder, "")
require.Error(t, err)
require.Contains(t, err.Error(), "failed to create trace file")
})

t.Run("error_trying_to_start_second_concurrent_trace", func(t *testing.T) {
dir, err := os.MkdirTemp("", "")
require.NoError(t, err)
t.Cleanup(func() {
os.RemoveAll(dir)
})
_, stop, err := StartDebugTrace(dir, "filePrefix")
require.NoError(t, err)
_, stopNil, err := StartDebugTrace(dir, "filePrefix")
require.Error(t, err)
require.Contains(t, err.Error(), "failed to start trace")
require.NoError(t, stop())
require.Nil(t, stopNil)
})

t.Run("error_when_stating_tmp_dir_with_restricted_permissions", func(t *testing.T) {
// this test relies on setting TMPDIR so skip it if we're not on a Unix system
if runtime.GOOS == "windows" {
t.Skip("skipping test on Windows")
}

tmpMissingPermissions := filepath.Join(t.TempDir(), "missing_permissions")
err := os.Mkdir(tmpMissingPermissions, 0o000)
require.NoError(t, err)
t.Setenv("TMPDIR", tmpMissingPermissions)
_, _, err = StartDebugTrace("", "filePrefix")
require.Error(t, err)
require.Contains(t, err.Error(), "failed to stat trace directory")
})

t.Run("successful_trace_generates_non_empty_file", func(t *testing.T) {
dir, err := os.MkdirTemp("", "")
require.NoError(t, err)
t.Cleanup(func() {
os.RemoveAll(dir)
})
file, stop, err := StartDebugTrace(dir, "filePrefix")
require.NoError(t, err)
require.NoError(t, stop())
f, err := os.Stat(file)
require.NoError(t, err)
require.Greater(t, f.Size(), int64(0))
})

t.Run("successful_creation_of_tmp_dir", func(t *testing.T) {
os.RemoveAll(filepath.Join(os.TempDir(), "vault-traces"))
file, stop, err := StartDebugTrace("", "filePrefix")
require.NoError(t, err)
require.NoError(t, stop())
require.Contains(t, file, filepath.Join(os.TempDir(), "vault-traces", "filePrefix"))
f, err := os.Stat(file)
require.NoError(t, err)
require.Greater(t, f.Size(), int64(0))
})

t.Run("successful_trace_with_existing_tmp_dir", func(t *testing.T) {
os.Mkdir(filepath.Join(os.TempDir(), "vault-traces"), 0o700)
file, stop, err := StartDebugTrace("", "filePrefix")
require.NoError(t, err)
require.NoError(t, stop())
require.Contains(t, file, filepath.Join(os.TempDir(), "vault-traces", "filePrefix"))
f, err := os.Stat(file)
require.NoError(t, err)
require.Greater(t, f.Size(), int64(0))
})
}
40 changes: 40 additions & 0 deletions vault/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import (
"github.com/hashicorp/vault/helper/metricsutil"
"github.com/hashicorp/vault/helper/namespace"
"github.com/hashicorp/vault/helper/osutil"
"github.com/hashicorp/vault/helper/trace"
"github.com/hashicorp/vault/physical/raft"
"github.com/hashicorp/vault/sdk/helper/certutil"
"github.com/hashicorp/vault/sdk/helper/consts"
Expand Down Expand Up @@ -2704,6 +2705,10 @@ func (c *Core) runUnsealSetupForPrimary(ctx context.Context, logger log.Logger)
// requires the Vault to be unsealed such as mount tables, logical backends,
// credential stores, etc.
func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc, unsealer UnsealStrategy) (retErr error) {
if stopTrace := c.tracePostUnsealIfEnabled(); stopTrace != nil {
defer stopTrace()
}

defer metrics.MeasureSince([]string{"core", "post_unseal"}, time.Now())
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure what is the best order between this metric and the new trace, but I think it's fair to include the metric in the captured trace even tho that will make the time spend on the tracing setup invisible to the metric


// Clear any out
Expand Down Expand Up @@ -2819,6 +2824,41 @@ func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc,
return nil
}

// tracePostUnsealIfEnabled checks if post-unseal tracing is enabled in the server
// config and starts a go trace if it is, returning a stop function to be called once
// the post-unseal process is complete.
func (c *Core) tracePostUnsealIfEnabled() (stop func()) {
// use rawConfig to allow config hot-reload of EnablePostUnsealTrace via SIGHUP
conf := c.rawConfig.Load()
if conf == nil {
c.logger.Warn("failed to get raw config to check enable_post_unseal_trace")
return nil
}

if !conf.(*server.Config).EnablePostUnsealTrace {
return nil
}

dir := conf.(*server.Config).PostUnsealTraceDir

traceFile, stopTrace, err := trace.StartDebugTrace(dir, "post-unseal")
if err != nil {
c.logger.Warn("failed to start post-unseal trace", "error", err)
return nil
}

c.logger.Info("post-unseal trace started", "file", traceFile)

return func() {
err := stopTrace()
if err != nil {
c.logger.Warn("failure when stopping post-unseal trace", "error", err)
return
}
c.logger.Info("post-unseal trace completed", "file", traceFile)
}
}

// preSeal is invoked before the barrier is sealed, allowing
// for any state teardown required.
func (c *Core) preSeal() error {
Expand Down
23 changes: 22 additions & 1 deletion vault/external_tests/pprof/pprof_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,33 @@ import (
"encoding/json"
"io"
"net/http"
"os"
"strconv"
"strings"
"testing"

"github.com/hashicorp/go-cleanhttp"
"github.com/hashicorp/vault/command/server"
vaulthttp "github.com/hashicorp/vault/http"
"github.com/hashicorp/vault/sdk/helper/testhelpers/schema"
"github.com/hashicorp/vault/vault"
"github.com/stretchr/testify/require"
"golang.org/x/net/http2"
)

func TestSysPprof(t *testing.T) {
t.Parallel()
cluster := vault.NewTestCluster(t, nil, &vault.TestClusterOptions{

// trace test setup
dir, err := os.MkdirTemp("", "vault-trace-test")
require.NoError(t, err)

cluster := vault.NewTestCluster(t, &vault.CoreConfig{
RawConfig: &server.Config{
EnablePostUnsealTrace: true,
PostUnsealTraceDir: dir,
},
}, &vault.TestClusterOptions{
HandlerFunc: vaulthttp.Handler,
RequestResponseCallback: schema.ResponseValidatingCallback(t),
})
Expand All @@ -30,6 +43,14 @@ func TestSysPprof(t *testing.T) {
core := cluster.Cores[0].Core
vault.TestWaitActive(t, core)
SysPprof_Test(t, cluster)

// draft trace test onto pprof one to avoid increasing test runtime with additional clusters
files, err := os.ReadDir(dir)
require.NoError(t, err)
require.Greater(t, len(files), 0)
traceFile, err := files[0].Info()
require.NoError(t, err)
require.Greater(t, traceFile.Size(), int64(0))
}

func TestSysPprof_MaxRequestDuration(t *testing.T) {
Expand Down
11 changes: 11 additions & 0 deletions website/content/docs/configuration/index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,17 @@ can have a negative effect on performance due to the tracking of each lock attem
When `imprecise_lease_role_tracking` is set to true and a new role-based quota is enabled, subsequent lease counts start from 0.
`imprecise_lease_role_tracking` affects role-based lease count quotas, but reduces latencies when not using role based quotas.

- `enable_post_unseal_trace` `(bool: false)` - Enables the server to generate a Go trace during the execution of the
`core.postUnseal` function for debug purposes. The resulting trace can be viewed with the `go tool trace` command. The output
directory can be specified with the `post_unseal_trace_directory` parameter. This should only be enabled temporarily for
debugging purposes as it can have a significant performance impact. This can be updated on a running Vault process with a
SIGHUP signal.

- `post_unseal_trace_directory` `(string: "")` - Specifies the directory where the trace file will be written, which must exist
and be writable by the Vault process. If not specified it will create a subdirectory `vault-traces` under the result from
[os.TempDir()](https://pkg.go.dev/os#TempDir) (usually `/tmp` on Unix systems). This can be updated on a running Vault process
with a SIGHUP signal.

### High availability parameters

The following parameters are used on backends that support [high availability][high-availability].
Expand Down
Loading