Skip to content

Commit

Permalink
profiler: goroutineswait saw-stop mechanism (#942)
Browse files Browse the repository at this point in the history
profiler: goroutineswait saw-stop mechanism

This is a safety mechanism for the new goroutineWaitProfile that
automatically disables the profile if the number of goroutines is too
high in order to avoid excessive stop-the-world pauses.

The default limit is 1000 which is somewhat arbitrary and should limit
STW pauses to ~30ms. It can be overwritten with
DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES.

WARNING: The goroutineWaitProfile is still experimental and not meant to
be used by users outside of datadog.

Fixes PROF-3234
  • Loading branch information
felixge authored Jun 30, 2021
1 parent 37b76b6 commit 1389fb2
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 26 deletions.
62 changes: 36 additions & 26 deletions profiler/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
"unicode"
Expand Down Expand Up @@ -82,21 +83,22 @@ type config struct {
agentless bool
// targetURL is the upload destination URL. It will be set by the profiler on start to either apiURL or agentURL
// based on the other options.
targetURL string
apiURL string // apiURL is the Datadog intake API URL
agentURL string // agentURL is the Datadog agent profiling URL
service, env string
hostname string
statsd StatsdClient
httpClient *http.Client
tags []string
types map[ProfileType]struct{}
period time.Duration
cpuDuration time.Duration
uploadTimeout time.Duration
mutexFraction int
blockRate int
outputDir string
targetURL string
apiURL string // apiURL is the Datadog intake API URL
agentURL string // agentURL is the Datadog agent profiling URL
service, env string
hostname string
statsd StatsdClient
httpClient *http.Client
tags []string
types map[ProfileType]struct{}
period time.Duration
cpuDuration time.Duration
uploadTimeout time.Duration
maxGoroutinesWait int
mutexFraction int
blockRate int
outputDir string
}

func urlForSite(site string) (string, error) {
Expand Down Expand Up @@ -127,17 +129,18 @@ func (c *config) addProfileType(t ProfileType) {

func defaultConfig() (*config, error) {
c := config{
env: defaultEnv,
apiURL: defaultAPIURL,
service: filepath.Base(os.Args[0]),
statsd: &statsd.NoOpClient{},
httpClient: defaultClient,
period: DefaultPeriod,
cpuDuration: DefaultDuration,
blockRate: DefaultBlockRate,
mutexFraction: DefaultMutexFraction,
uploadTimeout: DefaultUploadTimeout,
tags: []string{fmt.Sprintf("pid:%d", os.Getpid())},
env: defaultEnv,
apiURL: defaultAPIURL,
service: filepath.Base(os.Args[0]),
statsd: &statsd.NoOpClient{},
httpClient: defaultClient,
period: DefaultPeriod,
cpuDuration: DefaultDuration,
blockRate: DefaultBlockRate,
mutexFraction: DefaultMutexFraction,
uploadTimeout: DefaultUploadTimeout,
maxGoroutinesWait: 1000, // arbitrary value, should limit STW to ~30ms
tags: []string{fmt.Sprintf("pid:%d", os.Getpid())},
}
for _, t := range defaultProfileTypes {
c.addProfileType(t)
Expand Down Expand Up @@ -206,6 +209,13 @@ func defaultConfig() (*config, error) {
if v := os.Getenv("DD_PROFILING_OUTPUT_DIR"); v != "" {
withOutputDir(v)(&c)
}
if v := os.Getenv("DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES"); v != "" {
n, err := strconv.Atoi(v)
if err != nil {
return nil, fmt.Errorf("DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES: %s", err)
}
c.maxGoroutinesWait = n
}
return &c, nil
}

Expand Down
5 changes: 5 additions & 0 deletions profiler/profile.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"errors"
"fmt"
"io"
"runtime"
"runtime/pprof"
"time"

Expand Down Expand Up @@ -231,6 +232,10 @@ func goroutineProfile(cfg *config) (*profile, error) {
}

func goroutineWaitProfile(cfg *config) (*profile, error) {
if n := runtime.NumGoroutine(); n > cfg.maxGoroutinesWait {
return nil, fmt.Errorf("skipping goroutines wait profile: %d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d", n, cfg.maxGoroutinesWait)
}

var (
text = &bytes.Buffer{}
pprof = &bytes.Buffer{}
Expand Down
51 changes: 51 additions & 0 deletions profiler/profile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@ package profiler

import (
"bytes"
"fmt"
"io"
"io/ioutil"
"os"
"strconv"
"testing"
"time"

Expand Down Expand Up @@ -166,6 +169,54 @@ main.main()
"main.indirectShortSleepLoop2",
})
})

t.Run("goroutineswaitLimit", func(t *testing.T) {
// spawGoroutines spawns n goroutines, waits for them to start executing,
// and then returns a func to stop them. For more details about `executing`
// see:
// https://github.com/DataDog/dd-trace-go/pull/942#discussion_r656924335
spawnGoroutines := func(n int) func() {
executing := make(chan struct{})
stopping := make(chan struct{})
for i := 0; i < n; i++ {
go func() {
executing <- struct{}{}
stopping <- struct{}{}
}()
<-executing
}
return func() {
for i := 0; i < n; i++ {
<-stopping
}
}
}

goroutines := 100
limit := 10

stop := spawnGoroutines(goroutines)
defer stop()
envVar := "DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES"
oldVal := os.Getenv(envVar)
os.Setenv(envVar, strconv.Itoa(limit))
defer os.Setenv(envVar, oldVal)

defer func(old func(_ string, _ io.Writer, _ int) error) { lookupProfile = old }(lookupProfile)
lookupProfile = func(_ string, w io.Writer, _ int) error {
_, err := w.Write([]byte(""))
return err
}

p, err := unstartedProfiler()
require.NoError(t, err)
_, err = p.runProfile(expGoroutineWaitProfile)
var errRoutines, errLimit int
msg := "skipping goroutines wait profile: %d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d"
fmt.Sscanf(err.Error(), msg, &errRoutines, &errLimit)
require.GreaterOrEqual(t, errRoutines, goroutines)
require.Equal(t, limit, errLimit)
})
}

func Test_goroutineDebug2ToPprof_CrashSafety(t *testing.T) {
Expand Down

0 comments on commit 1389fb2

Please sign in to comment.