From a2c467cc22c6e4a2e1b001a05d45a1383393669b Mon Sep 17 00:00:00 2001 From: Bruno Oliveira de Souza Date: Tue, 26 Nov 2024 15:04:34 -0300 Subject: [PATCH] VAULT-31409: trace postUnseal function (#28895) * initial implementation of unseal trace * close file if we fail to start the trace didn't bother to check the error from traceFile.Close() * use reloadable config instead of env var * license * remove leftover * allow setting custom dir and remove new package * bring back StartDebugTrace after talking to Kuba it sounds like it's a good idea to try to move stuff out of core, so even if there's no immediate need for a generic debug trace function it's still fair to add it * track postUnseal instead of unsealInternal also some usability improvements from manual testing * address PR comments * address security review there were concerns about using the /tmp directory because of permissions, or having a default dir at all, so now it's required to set a dir in order to generate the traces. * add unit tests to StartDebugTrace * move back to default dir * document new parameters * add tiny integration test * avoid column in trace filename sounds like it might be forbidden in Windows and possibly cause problems in some MacOS applications. * address PR feedback * add go doc to test CI was complaining about missing comments on the new test function. It feels a bit silly to require this of tests but whatever XD * fix tests --- changelog/28895.txt | 3 + command/server/config.go | 16 +++ command/server/config_test_helpers.go | 2 + command/server/test-fixtures/config3.hcl | 2 + helper/trace/debug_trace.go | 63 +++++++++++ helper/trace/debug_trace_test.go | 111 +++++++++++++++++++ http/sys_config_state_test.go | 2 + vault/core.go | 40 +++++++ vault/external_tests/pprof/pprof_test.go | 23 +++- website/content/docs/configuration/index.mdx | 11 ++ 10 files changed, 272 insertions(+), 1 deletion(-) create mode 100644 changelog/28895.txt create mode 100644 helper/trace/debug_trace.go create mode 100644 helper/trace/debug_trace_test.go diff --git a/changelog/28895.txt b/changelog/28895.txt new file mode 100644 index 000000000000..d084c6dfbee5 --- /dev/null +++ b/changelog/28895.txt @@ -0,0 +1,3 @@ +```release-note:improvement +core: Added new `enable_post_unseal_trace` and `post_unseal_trace_directory` config options to generate Go traces during the post-unseal step for debug purposes. +``` diff --git a/command/server/config.go b/command/server/config.go index 9d31ed67b31e..4c1951c0249a 100644 --- a/command/server/config.go +++ b/command/server/config.go @@ -115,6 +115,9 @@ type Config struct { License string `hcl:"-"` LicensePath string `hcl:"license_path"` DisableSSCTokens bool `hcl:"-"` + + EnablePostUnsealTrace bool `hcl:"enable_post_unseal_trace"` + PostUnsealTraceDir string `hcl:"post_unseal_trace_directory"` } const ( @@ -425,6 +428,16 @@ func (c *Config) Merge(c2 *Config) *Config { result.LicensePath = c2.LicensePath } + result.EnablePostUnsealTrace = c.EnablePostUnsealTrace + if c2.EnablePostUnsealTrace { + result.EnablePostUnsealTrace = c2.EnablePostUnsealTrace + } + + result.PostUnsealTraceDir = c.PostUnsealTraceDir + if c2.PostUnsealTraceDir != "" { + result.PostUnsealTraceDir = c2.PostUnsealTraceDir + } + // Use values from top-level configuration for storage if set if storage := result.Storage; storage != nil { if result.APIAddr != "" { @@ -1150,6 +1163,9 @@ func (c *Config) Sanitized() map[string]interface{} { "detect_deadlocks": c.DetectDeadlocks, "imprecise_lease_role_tracking": c.ImpreciseLeaseRoleTracking, + + "enable_post_unseal_trace": c.EnablePostUnsealTrace, + "post_unseal_trace_directory": c.PostUnsealTraceDir, } for k, v := range sharedResult { result[k] = v diff --git a/command/server/config_test_helpers.go b/command/server/config_test_helpers.go index c017e15ec9d8..258801dbfe4e 100644 --- a/command/server/config_test_helpers.go +++ b/command/server/config_test_helpers.go @@ -854,6 +854,8 @@ func testConfig_Sanitized(t *testing.T) { }, "administrative_namespace_path": "admin/", "imprecise_lease_role_tracking": false, + "enable_post_unseal_trace": true, + "post_unseal_trace_directory": "/tmp", } addExpectedEntSanitizedConfig(expected, []string{"http"}) diff --git a/command/server/test-fixtures/config3.hcl b/command/server/test-fixtures/config3.hcl index 587698b35e9e..1d29c6bbd029 100644 --- a/command/server/test-fixtures/config3.hcl +++ b/command/server/test-fixtures/config3.hcl @@ -58,3 +58,5 @@ raw_storage_endpoint = true disable_sealwrap = true disable_sentinel_trace = true administrative_namespace_path = "admin/" +enable_post_unseal_trace = true +post_unseal_trace_directory = "/tmp" diff --git a/helper/trace/debug_trace.go b/helper/trace/debug_trace.go new file mode 100644 index 000000000000..0dca7d7c9e7c --- /dev/null +++ b/helper/trace/debug_trace.go @@ -0,0 +1,63 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package trace + +import ( + "fmt" + "os" + "path/filepath" + "runtime/trace" + "time" +) + +func StartDebugTrace(dir string, filePrefix string) (file string, stop func() error, err error) { + dirMustExist := true + if dir == "" { + dirMustExist = false // if a dir is provided it must exist, otherwise we'll create a default one + dir = filepath.Join(os.TempDir(), "vault-traces") + } + + d, err := os.Stat(dir) + if err != nil && !os.IsNotExist(err) { + return "", nil, fmt.Errorf("failed to stat trace directory %q: %s", dir, err) + } + + if os.IsNotExist(err) && dirMustExist { + return "", nil, fmt.Errorf("trace directory %q does not exist", dir) + } + + if !os.IsNotExist(err) && !d.IsDir() { + return "", nil, fmt.Errorf("trace directory %q is not a directory", dir) + } + + if os.IsNotExist(err) { + if err := os.Mkdir(dir, 0o700); err != nil { + return "", nil, fmt.Errorf("failed to create trace directory %q: %s", dir, err) + } + } + + // would prefer a more human readable time reference in the file name but the column + // character can cause problems in filenames + fileName := fmt.Sprintf("%s-%d.trace", filePrefix, time.Now().Unix()) + traceFile, err := filepath.Abs(filepath.Join(dir, fileName)) + if err != nil { + return "", nil, fmt.Errorf("failed to get absolute path for trace file: %s", err) + } + f, err := os.Create(traceFile) + if err != nil { + return "", nil, fmt.Errorf("failed to create trace file %q: %s", traceFile, err) + } + + if err := trace.Start(f); err != nil { + f.Close() + return "", nil, fmt.Errorf("failed to start trace: %s", err) + } + + stop = func() error { + trace.Stop() + return f.Close() + } + + return f.Name(), stop, nil +} diff --git a/helper/trace/debug_trace_test.go b/helper/trace/debug_trace_test.go new file mode 100644 index 000000000000..c624f81b8b5d --- /dev/null +++ b/helper/trace/debug_trace_test.go @@ -0,0 +1,111 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package trace + +import ( + "os" + "path/filepath" + "runtime" + "testing" + + "github.com/stretchr/testify/require" +) + +// TestStartDebugTrace tests the debug trace functionality creating real +// files and traces. +func TestStartDebugTrace(t *testing.T) { + t.Run("error_on_non_existent_dir", func(t *testing.T) { + _, _, err := StartDebugTrace("non-existent-dir", "filePrefix") + require.Error(t, err) + require.Contains(t, err.Error(), "does not exist") + }) + + t.Run("error_on_non_dir", func(t *testing.T) { + f, err := os.CreateTemp("", "") + require.NoError(t, err) + require.NoError(t, f.Close()) + _, _, err = StartDebugTrace(f.Name(), "") + require.Error(t, err) + require.Contains(t, err.Error(), "is not a directory") + }) + + t.Run("error_on_failed_to_create_trace_file", func(t *testing.T) { + noWriteFolder := filepath.Join(os.TempDir(), "no-write-permissions") + // create folder without write permission + err := os.Mkdir(noWriteFolder, 0o000) + t.Cleanup(func() { + os.RemoveAll(noWriteFolder) + }) + require.NoError(t, err) + _, _, err = StartDebugTrace(noWriteFolder, "") + require.Error(t, err) + require.Contains(t, err.Error(), "failed to create trace file") + }) + + t.Run("error_trying_to_start_second_concurrent_trace", func(t *testing.T) { + dir, err := os.MkdirTemp("", "") + require.NoError(t, err) + t.Cleanup(func() { + os.RemoveAll(dir) + }) + _, stop, err := StartDebugTrace(dir, "filePrefix") + require.NoError(t, err) + _, stopNil, err := StartDebugTrace(dir, "filePrefix") + require.Error(t, err) + require.Contains(t, err.Error(), "failed to start trace") + require.NoError(t, stop()) + require.Nil(t, stopNil) + }) + + t.Run("error_when_stating_tmp_dir_with_restricted_permissions", func(t *testing.T) { + // this test relies on setting TMPDIR so skip it if we're not on a Unix system + if runtime.GOOS == "windows" { + t.Skip("skipping test on Windows") + } + + tmpMissingPermissions := filepath.Join(t.TempDir(), "missing_permissions") + err := os.Mkdir(tmpMissingPermissions, 0o000) + require.NoError(t, err) + t.Setenv("TMPDIR", tmpMissingPermissions) + _, _, err = StartDebugTrace("", "filePrefix") + require.Error(t, err) + require.Contains(t, err.Error(), "failed to stat trace directory") + }) + + t.Run("successful_trace_generates_non_empty_file", func(t *testing.T) { + dir, err := os.MkdirTemp("", "") + require.NoError(t, err) + t.Cleanup(func() { + os.RemoveAll(dir) + }) + file, stop, err := StartDebugTrace(dir, "filePrefix") + require.NoError(t, err) + require.NoError(t, stop()) + f, err := os.Stat(file) + require.NoError(t, err) + require.Greater(t, f.Size(), int64(0)) + }) + + t.Run("successful_creation_of_tmp_dir", func(t *testing.T) { + os.RemoveAll(filepath.Join(os.TempDir(), "vault-traces")) + file, stop, err := StartDebugTrace("", "filePrefix") + require.NoError(t, err) + require.NoError(t, stop()) + require.Contains(t, file, filepath.Join(os.TempDir(), "vault-traces", "filePrefix")) + f, err := os.Stat(file) + require.NoError(t, err) + require.Greater(t, f.Size(), int64(0)) + }) + + t.Run("successful_trace_with_existing_tmp_dir", func(t *testing.T) { + os.Mkdir(filepath.Join(os.TempDir(), "vault-traces"), 0o700) + file, stop, err := StartDebugTrace("", "filePrefix") + require.NoError(t, err) + require.NoError(t, stop()) + require.Contains(t, file, filepath.Join(os.TempDir(), "vault-traces", "filePrefix")) + f, err := os.Stat(file) + require.NoError(t, err) + require.Greater(t, f.Size(), int64(0)) + }) +} diff --git a/http/sys_config_state_test.go b/http/sys_config_state_test.go index 8081aaf642c9..4fb922ef7ed8 100644 --- a/http/sys_config_state_test.go +++ b/http/sys_config_state_test.go @@ -177,6 +177,8 @@ func TestSysConfigState_Sanitized(t *testing.T) { "storage": tc.expectedStorageOutput, "administrative_namespace_path": "", "imprecise_lease_role_tracking": false, + "enable_post_unseal_trace": false, + "post_unseal_trace_directory": "", } if tc.expectedHAStorageOutput != nil { diff --git a/vault/core.go b/vault/core.go index 12a9a9116936..706059667e85 100644 --- a/vault/core.go +++ b/vault/core.go @@ -50,6 +50,7 @@ import ( "github.com/hashicorp/vault/helper/metricsutil" "github.com/hashicorp/vault/helper/namespace" "github.com/hashicorp/vault/helper/osutil" + "github.com/hashicorp/vault/helper/trace" "github.com/hashicorp/vault/physical/raft" "github.com/hashicorp/vault/sdk/helper/certutil" "github.com/hashicorp/vault/sdk/helper/consts" @@ -2704,6 +2705,10 @@ func (c *Core) runUnsealSetupForPrimary(ctx context.Context, logger log.Logger) // requires the Vault to be unsealed such as mount tables, logical backends, // credential stores, etc. func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc, unsealer UnsealStrategy) (retErr error) { + if stopTrace := c.tracePostUnsealIfEnabled(); stopTrace != nil { + defer stopTrace() + } + defer metrics.MeasureSince([]string{"core", "post_unseal"}, time.Now()) // Clear any out @@ -2819,6 +2824,41 @@ func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc, return nil } +// tracePostUnsealIfEnabled checks if post-unseal tracing is enabled in the server +// config and starts a go trace if it is, returning a stop function to be called once +// the post-unseal process is complete. +func (c *Core) tracePostUnsealIfEnabled() (stop func()) { + // use rawConfig to allow config hot-reload of EnablePostUnsealTrace via SIGHUP + conf := c.rawConfig.Load() + if conf == nil { + c.logger.Warn("failed to get raw config to check enable_post_unseal_trace") + return nil + } + + if !conf.(*server.Config).EnablePostUnsealTrace { + return nil + } + + dir := conf.(*server.Config).PostUnsealTraceDir + + traceFile, stopTrace, err := trace.StartDebugTrace(dir, "post-unseal") + if err != nil { + c.logger.Warn("failed to start post-unseal trace", "error", err) + return nil + } + + c.logger.Info("post-unseal trace started", "file", traceFile) + + return func() { + err := stopTrace() + if err != nil { + c.logger.Warn("failure when stopping post-unseal trace", "error", err) + return + } + c.logger.Info("post-unseal trace completed", "file", traceFile) + } +} + // preSeal is invoked before the barrier is sealed, allowing // for any state teardown required. func (c *Core) preSeal() error { diff --git a/vault/external_tests/pprof/pprof_test.go b/vault/external_tests/pprof/pprof_test.go index dff92030122a..c0b5c95dd040 100644 --- a/vault/external_tests/pprof/pprof_test.go +++ b/vault/external_tests/pprof/pprof_test.go @@ -7,20 +7,33 @@ import ( "encoding/json" "io" "net/http" + "os" "strconv" "strings" "testing" "github.com/hashicorp/go-cleanhttp" + "github.com/hashicorp/vault/command/server" vaulthttp "github.com/hashicorp/vault/http" "github.com/hashicorp/vault/sdk/helper/testhelpers/schema" "github.com/hashicorp/vault/vault" + "github.com/stretchr/testify/require" "golang.org/x/net/http2" ) func TestSysPprof(t *testing.T) { t.Parallel() - cluster := vault.NewTestCluster(t, nil, &vault.TestClusterOptions{ + + // trace test setup + dir, err := os.MkdirTemp("", "vault-trace-test") + require.NoError(t, err) + + cluster := vault.NewTestCluster(t, &vault.CoreConfig{ + RawConfig: &server.Config{ + EnablePostUnsealTrace: true, + PostUnsealTraceDir: dir, + }, + }, &vault.TestClusterOptions{ HandlerFunc: vaulthttp.Handler, RequestResponseCallback: schema.ResponseValidatingCallback(t), }) @@ -30,6 +43,14 @@ func TestSysPprof(t *testing.T) { core := cluster.Cores[0].Core vault.TestWaitActive(t, core) SysPprof_Test(t, cluster) + + // draft trace test onto pprof one to avoid increasing test runtime with additional clusters + files, err := os.ReadDir(dir) + require.NoError(t, err) + require.Greater(t, len(files), 0) + traceFile, err := files[0].Info() + require.NoError(t, err) + require.Greater(t, traceFile.Size(), int64(0)) } func TestSysPprof_MaxRequestDuration(t *testing.T) { diff --git a/website/content/docs/configuration/index.mdx b/website/content/docs/configuration/index.mdx index b32a6665d15f..edf1907a2c7c 100644 --- a/website/content/docs/configuration/index.mdx +++ b/website/content/docs/configuration/index.mdx @@ -260,6 +260,17 @@ can have a negative effect on performance due to the tracking of each lock attem When `imprecise_lease_role_tracking` is set to true and a new role-based quota is enabled, subsequent lease counts start from 0. `imprecise_lease_role_tracking` affects role-based lease count quotas, but reduces latencies when not using role based quotas. +- `enable_post_unseal_trace` `(bool: false)` - Enables the server to generate a Go trace during the execution of the + `core.postUnseal` function for debug purposes. The resulting trace can be viewed with the `go tool trace` command. The output + directory can be specified with the `post_unseal_trace_directory` parameter. This should only be enabled temporarily for + debugging purposes as it can have a significant performance impact. This can be updated on a running Vault process with a + SIGHUP signal. + +- `post_unseal_trace_directory` `(string: "")` - Specifies the directory where the trace file will be written, which must exist + and be writable by the Vault process. If not specified it will create a subdirectory `vault-traces` under the result from + [os.TempDir()](https://pkg.go.dev/os#TempDir) (usually `/tmp` on Unix systems). This can be updated on a running Vault process + with a SIGHUP signal. + ### High availability parameters The following parameters are used on backends that support [high availability][high-availability].