Skip to content

Commit

Permalink
pprofui: add support for collecting goroutines with labels
Browse files Browse the repository at this point in the history
This change refactors some of the logic in the pprofui/.
Additionally, we add support for collecting cluster-wide
goroutine profiles.

These goroutine profiles can be collected with or without labels.
When collected without labels we request pprof profiles
with debug=0, which generates Profile protobufs on every
node and `pprof.Merge`s them before redirecting the client
to a flamegraph.

When collected with labels we request pprof profiles with
debug=1, which generates a legacy text format with comments
translating addresses to function names and line numbers,
so that a programmer can read the profile without tools.
These profiles cannot be `pprof.Merged` and so we manually
stitch them together on a per-node basis before downloading
them for the client as txt file.

This change also adds support for filtering the aforementioned
goroutines by pprof label. This will be used by the Jobs Profiler
to collect cluster-wide stacks relevant to the running job.

Informs: #105440
Release note: None
  • Loading branch information
adityamaru committed Jul 1, 2023
1 parent 2b66f95 commit c618d44
Show file tree
Hide file tree
Showing 10 changed files with 517 additions and 196 deletions.
5 changes: 3 additions & 2 deletions docs/generated/http/full.md
Original file line number Diff line number Diff line change
Expand Up @@ -3043,10 +3043,11 @@ Support status: [reserved](#support-status)

| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| node_id | [string](#cockroach.server.serverpb.ProfileRequest-string) | | node_id is a string so that "local" can be used to specify that no forwarding is necessary. node_id translates to a KV node ID on a storage server and SQL instance ID on a SQL only server. | [reserved](#support-status) |
| node_id | [string](#cockroach.server.serverpb.ProfileRequest-string) | | node_id is a string so that "local" or "all" can be used to specify that no forwarding is necessary. node_id translates to a KV node ID on a storage server and SQL instance ID on a SQL only server. | [reserved](#support-status) |
| type | [ProfileRequest.Type](#cockroach.server.serverpb.ProfileRequest-cockroach.server.serverpb.ProfileRequest.Type) | | The type of profile to retrieve. | [reserved](#support-status) |
| seconds | [int32](#cockroach.server.serverpb.ProfileRequest-int32) | | applies only to Type=CPU, defaults to 30 | [reserved](#support-status) |
| labels | [bool](#cockroach.server.serverpb.ProfileRequest-bool) | | applies only to Type=CPU, defaults to false | [reserved](#support-status) |
| with_labels | [bool](#cockroach.server.serverpb.ProfileRequest-bool) | | WithLabels can be specified for Type=CPU or Type=GOROUTINE.<br><br>- If true for CPU profiles, we request a CPU profile with pprof labels.<br><br>- If true for GOROUTINE profiles, we request an aggregated goroutine profile with debug=1. | [reserved](#support-status) |
| label_filter | [string](#cockroach.server.serverpb.ProfileRequest-string) | | LabelFilter only applies to Type=GOROUTINE. Only goroutines with a pprof label matching the filter will be returned. | [reserved](#support-status) |
| sender_server_version | [cockroach.roachpb.Version](#cockroach.server.serverpb.ProfileRequest-cockroach.roachpb.Version) | | SenderServerVersion is the server version of the node sending the Profile request. If this field is set then the node processing the request will only collect the profile if its server version is equal to the sender's server version.<br><br>Currently, this is only used when collecting profiles that will be merged using pprof.Merge as all the samples must be from the same binary version. | [reserved](#support-status) |


Expand Down
8 changes: 4 additions & 4 deletions pkg/cli/zip_per_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ func (zc *debugZipContext) collectCPUProfiles(
var pd profData
err := timeutil.RunWithTimeout(ctx, "fetch cpu profile", zc.timeout+zipCtx.cpuProfDuration, func(ctx context.Context) error {
resp, err := zc.status.Profile(ctx, &serverpb.ProfileRequest{
NodeId: fmt.Sprintf("%d", nodeID),
Type: serverpb.ProfileRequest_CPU,
Seconds: secs,
Labels: true,
NodeId: fmt.Sprintf("%d", nodeID),
Type: serverpb.ProfileRequest_CPU,
Seconds: secs,
WithLabels: true,
})
if err != nil {
return err
Expand Down
3 changes: 2 additions & 1 deletion pkg/server/debug/pprofui/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@ go_test(
srcs = ["server_test.go"],
args = ["-test.timeout=55s"],
data = glob(["testdata/**"]),
embed = [":pprofui"],
deps = [
":pprofui",
"//pkg/build/bazel",
"//pkg/server",
"//pkg/server/serverpb",
"//pkg/testutils/datapathutils",
"//pkg/testutils/skip",
Expand Down
296 changes: 182 additions & 114 deletions pkg/server/debug/pprofui/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,162 @@ func (s *Server) parsePath(reqPath string) (profType string, id string, remainin
}
}

// validateProfileRequest validates that the request does not contian any
// conflicting or invalid configurations.
func validateProfileRequest(req *serverpb.ProfileRequest) error {
switch req.Type {
case serverpb.ProfileRequest_GOROUTINE:
case serverpb.ProfileRequest_CPU:
if req.LabelFilter != "" {
return errors.Newf("filtering using a pprof label is unsupported for %s", req.Type.String())
}
default:
if req.NodeId == "all" {
return errors.Newf("cluster-wide collection is unsupported for %s", req.Type.String())
}

if req.WithLabels {
return errors.Newf("profiling with labels is unsupported for %s", req.Type.String())
}

if req.LabelFilter != "" {
return errors.Newf("filtering using a pprof label is unsupported for %s", req.Type.String())
}
}

return nil
}

func constructProfileReq(r *http.Request, profileName string) (*serverpb.ProfileRequest, error) {
req, err := http.NewRequest("GET", "/unused", bytes.NewReader(nil))
if err != nil {
return nil, err
}

profileType, ok := serverpb.ProfileRequest_Type_value[strings.ToUpper(profileName)]
if !ok && profileName != "profile" {
return nil, errors.Newf("unknown profile name: %s", profileName)
}
// There is a discrepancy between the usage of the "CPU" and
// "profile" names to refer to the CPU profile in the
// implementations. The URL to the CPU profile has been modified
// on the Advanced Debug page to point to /pprof/ui/cpu but this
// is retained for backwards compatibility.
if profileName == "profile" {
profileType = int32(serverpb.ProfileRequest_CPU)
}
profileReq := &serverpb.ProfileRequest{
NodeId: "local",
Type: serverpb.ProfileRequest_Type(profileType),
}

// Pass through any parameters. Most notably, allow ?seconds=10 for
// CPU profiles.
_ = r.ParseForm()
req.Form = r.Form

if r.Form.Get("seconds") != "" {
sec, err := strconv.ParseInt(r.Form.Get("seconds"), 10, 32)
if err != nil {
return nil, err
}
profileReq.Seconds = int32(sec)
}
if r.Form.Get("node") != "" {
profileReq.NodeId = r.Form.Get("node")
}
if r.Form.Get("labels") != "" {
withLabels, err := strconv.ParseBool(r.Form.Get("labels"))
if err != nil {
return nil, err
}
profileReq.WithLabels = withLabels
}
if labelFilter := r.Form.Get("labelfilter"); labelFilter != "" {
profileReq.WithLabels = true
profileReq.LabelFilter = labelFilter
}
if err := validateProfileRequest(profileReq); err != nil {
return nil, err
}
return profileReq, nil
}

func (s *Server) serveCachedProfile(
id string, remainingPath string, profileName string, w http.ResponseWriter, r *http.Request,
) {
// Catch nonexistent IDs early or pprof will do a worse job at
// giving an informative error.
var isProfileProto bool
if err := s.storage.Get(id, func(b bool, _ io.Reader) error {
isProfileProto = b
return nil
}); err != nil {
msg := fmt.Sprintf("profile for id %s not found: %s", id, err)
http.Error(w, msg, http.StatusNotFound)
return
}

// If the profile is not in a protobuf format, downloading it is the only
// option.
if !isProfileProto || r.URL.Query().Get("download") != "" {
// TODO(tbg): this has zero discoverability.
w.Header().Set("Content-Disposition",
fmt.Sprintf("attachment; filename=%s_%s.pb.gz", profileName, id))
w.Header().Set("Content-Type", "application/octet-stream")
if err := s.storage.Get(id, func(_ bool, r io.Reader) error {
_, err := io.Copy(w, r)
return err
}); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
}
return
}

server := func(args *driver.HTTPServerArgs) error {
handler, ok := args.Handlers[remainingPath]
if !ok {
return errors.Errorf("unknown endpoint %s", remainingPath)
}
handler.ServeHTTP(w, r)
return nil
}

storageFetcher := func(_ string, _, _ time.Duration) (*profile.Profile, string, error) {
var p *profile.Profile
if err := s.storage.Get(id, func(_ bool, reader io.Reader) error {
var err error
p, err = profile.Parse(reader)
return err
}); err != nil {
return nil, "", err
}
return p, "", nil
}

// Invoke the (library version) of `pprof` with a number of stubs.
// Specifically, we pass a fake FlagSet that plumbs through the
// given args, a UI that logs any errors pprof may emit, a fetcher
// that simply reads the profile we downloaded earlier, and a
// HTTPServer that pprof will pass the web ui handlers to at the
// end (and we let it handle this client request).
if err := driver.PProf(&driver.Options{
Flagset: &pprofFlags{
FlagSet: pflag.NewFlagSet("pprof", pflag.ExitOnError),
args: []string{
"--symbolize", "none",
"--http", "localhost:0",
"", // we inject our own target
},
},
UI: &fakeUI{},
Fetch: fetcherFn(storageFetcher),
HTTPServer: server,
}); err != nil {
_, _ = w.Write([]byte(err.Error()))
}
}

func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
profileName, id, remainingPath := s.parsePath(r.URL.Path)

Expand All @@ -113,124 +269,24 @@ func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
}

if id != "" {
// Catch nonexistent IDs early or pprof will do a worse job at
// giving an informative error.
if err := s.storage.Get(id, func(io.Reader) error { return nil }); err != nil {
msg := fmt.Sprintf("profile for id %s not found: %s", id, err)
http.Error(w, msg, http.StatusNotFound)
return
}

if r.URL.Query().Get("download") != "" {
// TODO(tbg): this has zero discoverability.
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%s_%s.pb.gz", profileName, id))
w.Header().Set("Content-Type", "application/octet-stream")
if err := s.storage.Get(id, func(r io.Reader) error {
_, err := io.Copy(w, r)
return err
}); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
}
return
}

server := func(args *driver.HTTPServerArgs) error {
handler, ok := args.Handlers[remainingPath]
if !ok {
return errors.Errorf("unknown endpoint %s", remainingPath)
}
handler.ServeHTTP(w, r)
return nil
}

storageFetcher := func(_ string, _, _ time.Duration) (*profile.Profile, string, error) {
var p *profile.Profile
if err := s.storage.Get(id, func(reader io.Reader) error {
var err error
p, err = profile.Parse(reader)
return err
}); err != nil {
return nil, "", err
}
return p, "", nil
}

// Invoke the (library version) of `pprof` with a number of stubs.
// Specifically, we pass a fake FlagSet that plumbs through the
// given args, a UI that logs any errors pprof may emit, a fetcher
// that simply reads the profile we downloaded earlier, and a
// HTTPServer that pprof will pass the web ui handlers to at the
// end (and we let it handle this client request).
if err := driver.PProf(&driver.Options{
Flagset: &pprofFlags{
FlagSet: pflag.NewFlagSet("pprof", pflag.ExitOnError),
args: []string{
"--symbolize", "none",
"--http", "localhost:0",
"", // we inject our own target
},
},
UI: &fakeUI{},
Fetch: fetcherFn(storageFetcher),
HTTPServer: server,
}); err != nil {
_, _ = w.Write([]byte(err.Error()))
}

s.serveCachedProfile(id, remainingPath, profileName, w, r)
return
}

// Create and save new profile, then redirect client to corresponding ui URL.

id = s.storage.ID()

if err := s.storage.Store(id, func(w io.Writer) error {
req, err := http.NewRequest("GET", "/unused", bytes.NewReader(nil))
if err != nil {
return err
}

profileType, ok := serverpb.ProfileRequest_Type_value[strings.ToUpper(profileName)]
if !ok && profileName != "profile" {
return errors.Newf("unknown profile name: %s", profileName)
}
// There is a discrepancy between the usage of the "CPU" and
// "profile" names to refer to the CPU profile in the
// implementations. The URL to the CPU profile has been modified
// on the Advanced Debug page to point to /pprof/ui/cpu but this
// is retained for backwards compatibility.
if profileName == "profile" {
profileType = int32(serverpb.ProfileRequest_CPU)
}
var resp *serverpb.JSONResponse
profileReq := &serverpb.ProfileRequest{
NodeId: "local",
Type: serverpb.ProfileRequest_Type(profileType),
}

// Pass through any parameters. Most notably, allow ?seconds=10 for
// CPU profiles.
_ = r.ParseForm()
req.Form = r.Form

if r.Form.Get("seconds") != "" {
sec, err := strconv.ParseInt(r.Form.Get("seconds"), 10, 32)
if err != nil {
return err
}
profileReq.Seconds = int32(sec)
}
if r.Form.Get("node") != "" {
profileReq.NodeId = r.Form.Get("node")
}
if r.Form.Get("labels") != "" {
labels, err := strconv.ParseBool(r.Form.Get("labels"))
if err != nil {
return err
}
profileReq.Labels = labels
}
resp, err = s.profiler.Profile(r.Context(), profileReq)
profileReq, err := constructProfileReq(r, profileName)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// A GOROUTINE profile with labels is not generated in a protobuf format. It
// is outputted in a "legacy text format with comments translating addresses
// to function names and line numbers, so that a programmer can read the
// profile without tools."
isProfileProto := !(profileReq.Type == serverpb.ProfileRequest_GOROUTINE && profileReq.WithLabels)
if err := s.storage.Store(id, isProfileProto, func(w io.Writer) error {
resp, err := s.profiler.Profile(r.Context(), profileReq)
if err != nil {
return err
}
Expand All @@ -244,6 +300,18 @@ func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
return
}

// If the profile is not in a protobuf format, downloading it is the only
// option.
if !isProfileProto {
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%s_%s.txt", profileName, id))
w.Header().Set("Content-Type", "text/plain")
_ = s.storage.Get(id, func(isProtoProfile bool, r io.Reader) error {
_, err := io.Copy(w, r)
return err
})
return
}

// NB: direct straight to the flamegraph. This is because `pprof`
// shells out to `dot` for the default landing page and thus works
// only on hosts that have graphviz installed. You can still navigate
Expand All @@ -263,7 +331,7 @@ func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
if !isGoPProf {
http.Redirect(w, r, origURL.String(), http.StatusTemporaryRedirect)
} else {
_ = s.storage.Get(id, func(r io.Reader) error {
_ = s.storage.Get(id, func(isProtoProfile bool, r io.Reader) error {
_, err := io.Copy(w, r)
return err
})
Expand Down
Loading

0 comments on commit c618d44

Please sign in to comment.