From 1389fb2d687e5577702c589d0a0db68b335efe2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Geisend=C3=B6rfer?= Date: Wed, 30 Jun 2021 10:37:30 +0200 Subject: [PATCH] profiler: goroutineswait saw-stop mechanism (#942) profiler: goroutineswait saw-stop mechanism This is a safety mechanism for the new goroutineWaitProfile that automatically disables the profile if the number of goroutines is too high in order to avoid excessive stop-the-world pauses. The default limit is 1000 which is somewhat arbitrary and should limit STW pauses to ~30ms. It can be overwritten with DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES. WARNING: The goroutineWaitProfile is still experimental and not meant to be used by users outside of datadog. Fixes PROF-3234 --- profiler/options.go | 62 +++++++++++++++++++++++----------------- profiler/profile.go | 5 ++++ profiler/profile_test.go | 51 +++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 26 deletions(-) diff --git a/profiler/options.go b/profiler/options.go index ef6fcf3da5..2092c41304 100644 --- a/profiler/options.go +++ b/profiler/options.go @@ -14,6 +14,7 @@ import ( "os" "path/filepath" "runtime" + "strconv" "strings" "time" "unicode" @@ -82,21 +83,22 @@ type config struct { agentless bool // targetURL is the upload destination URL. It will be set by the profiler on start to either apiURL or agentURL // based on the other options. - targetURL string - apiURL string // apiURL is the Datadog intake API URL - agentURL string // agentURL is the Datadog agent profiling URL - service, env string - hostname string - statsd StatsdClient - httpClient *http.Client - tags []string - types map[ProfileType]struct{} - period time.Duration - cpuDuration time.Duration - uploadTimeout time.Duration - mutexFraction int - blockRate int - outputDir string + targetURL string + apiURL string // apiURL is the Datadog intake API URL + agentURL string // agentURL is the Datadog agent profiling URL + service, env string + hostname string + statsd StatsdClient + httpClient *http.Client + tags []string + types map[ProfileType]struct{} + period time.Duration + cpuDuration time.Duration + uploadTimeout time.Duration + maxGoroutinesWait int + mutexFraction int + blockRate int + outputDir string } func urlForSite(site string) (string, error) { @@ -127,17 +129,18 @@ func (c *config) addProfileType(t ProfileType) { func defaultConfig() (*config, error) { c := config{ - env: defaultEnv, - apiURL: defaultAPIURL, - service: filepath.Base(os.Args[0]), - statsd: &statsd.NoOpClient{}, - httpClient: defaultClient, - period: DefaultPeriod, - cpuDuration: DefaultDuration, - blockRate: DefaultBlockRate, - mutexFraction: DefaultMutexFraction, - uploadTimeout: DefaultUploadTimeout, - tags: []string{fmt.Sprintf("pid:%d", os.Getpid())}, + env: defaultEnv, + apiURL: defaultAPIURL, + service: filepath.Base(os.Args[0]), + statsd: &statsd.NoOpClient{}, + httpClient: defaultClient, + period: DefaultPeriod, + cpuDuration: DefaultDuration, + blockRate: DefaultBlockRate, + mutexFraction: DefaultMutexFraction, + uploadTimeout: DefaultUploadTimeout, + maxGoroutinesWait: 1000, // arbitrary value, should limit STW to ~30ms + tags: []string{fmt.Sprintf("pid:%d", os.Getpid())}, } for _, t := range defaultProfileTypes { c.addProfileType(t) @@ -206,6 +209,13 @@ func defaultConfig() (*config, error) { if v := os.Getenv("DD_PROFILING_OUTPUT_DIR"); v != "" { withOutputDir(v)(&c) } + if v := os.Getenv("DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES"); v != "" { + n, err := strconv.Atoi(v) + if err != nil { + return nil, fmt.Errorf("DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES: %s", err) + } + c.maxGoroutinesWait = n + } return &c, nil } diff --git a/profiler/profile.go b/profiler/profile.go index f1eb098150..c58a680aab 100644 --- a/profiler/profile.go +++ b/profiler/profile.go @@ -10,6 +10,7 @@ import ( "errors" "fmt" "io" + "runtime" "runtime/pprof" "time" @@ -231,6 +232,10 @@ func goroutineProfile(cfg *config) (*profile, error) { } func goroutineWaitProfile(cfg *config) (*profile, error) { + if n := runtime.NumGoroutine(); n > cfg.maxGoroutinesWait { + return nil, fmt.Errorf("skipping goroutines wait profile: %d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d", n, cfg.maxGoroutinesWait) + } + var ( text = &bytes.Buffer{} pprof = &bytes.Buffer{} diff --git a/profiler/profile_test.go b/profiler/profile_test.go index 579375fa58..74174f59f0 100644 --- a/profiler/profile_test.go +++ b/profiler/profile_test.go @@ -7,8 +7,11 @@ package profiler import ( "bytes" + "fmt" "io" "io/ioutil" + "os" + "strconv" "testing" "time" @@ -166,6 +169,54 @@ main.main() "main.indirectShortSleepLoop2", }) }) + + t.Run("goroutineswaitLimit", func(t *testing.T) { + // spawGoroutines spawns n goroutines, waits for them to start executing, + // and then returns a func to stop them. For more details about `executing` + // see: + // https://github.com/DataDog/dd-trace-go/pull/942#discussion_r656924335 + spawnGoroutines := func(n int) func() { + executing := make(chan struct{}) + stopping := make(chan struct{}) + for i := 0; i < n; i++ { + go func() { + executing <- struct{}{} + stopping <- struct{}{} + }() + <-executing + } + return func() { + for i := 0; i < n; i++ { + <-stopping + } + } + } + + goroutines := 100 + limit := 10 + + stop := spawnGoroutines(goroutines) + defer stop() + envVar := "DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES" + oldVal := os.Getenv(envVar) + os.Setenv(envVar, strconv.Itoa(limit)) + defer os.Setenv(envVar, oldVal) + + defer func(old func(_ string, _ io.Writer, _ int) error) { lookupProfile = old }(lookupProfile) + lookupProfile = func(_ string, w io.Writer, _ int) error { + _, err := w.Write([]byte("")) + return err + } + + p, err := unstartedProfiler() + require.NoError(t, err) + _, err = p.runProfile(expGoroutineWaitProfile) + var errRoutines, errLimit int + msg := "skipping goroutines wait profile: %d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d" + fmt.Sscanf(err.Error(), msg, &errRoutines, &errLimit) + require.GreaterOrEqual(t, errRoutines, goroutines) + require.Equal(t, limit, errLimit) + }) } func Test_goroutineDebug2ToPprof_CrashSafety(t *testing.T) {