From f72ab1157a34d6e1cb1d777f657798799da767ce Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Tue, 21 Dec 2021 19:32:52 -0500 Subject: [PATCH] tracing: add /debug/tracez rendering the active spans /debug/tracez lets users take a snapshot of the active spans registry and render the new snapshot, or one of the previously taken snapshots. The Tracer can hold up to 10 snapshots in memory. The PR description has a screenshot. When visualizing a snapshot, the page lets you do a number of things: 1. List all the spans. 2. See the (current) stack trace for each span's goroutine (if the goroutine was still running at the time when the snapshot was captured). Stack traces can be toggled visible/hidden. 3. Sort the spans by name or start time. 4. Filter the span according to text search. The search works across the name and stack trace. 5. See the full trace that a particular span is part of. For the table Javascript providing sorting and filtering, this patch embeds the library from https://listjs.com/ . Limitations: - for now, only the registry of the local node is snapshotted. In the fuiture I'll collect info from all nodes. - for now, the relationships between different spans are not represented in any way. I'll work on the ability to go from a span to the whole trace that the span is part of. - for now, tags and structured and unstructured log messages that a span might have are not displayed in any way. At the moment, span creation is not enabled in production by default (i.e. the Tracer is put in TracingModeOnDemand by default, instead of the required TracingModeActiveSpansRegistry). This patch does not change that, so in order to benefit from /debug/tracez in all its glory, one has to run with COCKROACH_REAL_SPANS=1 for now. Not for long, though. Release note: None --- pkg/server/debug/BUILD.bazel | 2 + pkg/server/debug/server.go | 27 +- pkg/server/server.go | 3 +- pkg/server/tenant.go | 2 +- pkg/ui/BUILD.bazel | 18 +- pkg/ui/templates/tracing/html_template.html | 140 +++++++++ pkg/ui/tracing_ui.go | 36 +++ pkg/ui/workspaces/db-console/package.json | 1 + .../views/reports/containers/debug/index.tsx | 1 + pkg/ui/yarn-vendor | 2 +- pkg/ui/yarn.lock | 12 + pkg/util/tracing/BUILD.bazel | 1 + pkg/util/tracing/tracer.go | 45 ++- pkg/util/tracing/tracer_snapshots.go | 195 ++++++++++++ pkg/util/tracing/tracingpb/recorded_span.go | 19 ++ pkg/util/tracing/tracingui/BUILD.bazel | 17 ++ .../tracing/tracingui/span_registry_ui.go | 277 ++++++++++++++++++ 17 files changed, 774 insertions(+), 24 deletions(-) create mode 100644 pkg/ui/templates/tracing/html_template.html create mode 100644 pkg/ui/tracing_ui.go create mode 100644 pkg/util/tracing/tracer_snapshots.go create mode 100644 pkg/util/tracing/tracingui/BUILD.bazel create mode 100644 pkg/util/tracing/tracingui/span_registry_ui.go diff --git a/pkg/server/debug/BUILD.bazel b/pkg/server/debug/BUILD.bazel index 6f731cd5e7a5..38120156700d 100644 --- a/pkg/server/debug/BUILD.bazel +++ b/pkg/server/debug/BUILD.bazel @@ -29,6 +29,8 @@ go_library( "//pkg/util/log/severity", "//pkg/util/stop", "//pkg/util/timeutil", + "//pkg/util/tracing", + "//pkg/util/tracing/tracingui", "//pkg/util/uint128", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_pebble//tool", diff --git a/pkg/server/debug/server.go b/pkg/server/debug/server.go index 788b99b47b3c..15cdef61d07d 100644 --- a/pkg/server/debug/server.go +++ b/pkg/server/debug/server.go @@ -32,6 +32,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingui" "github.com/cockroachdb/errors" pebbletool "github.com/cockroachdb/pebble/tool" metrics "github.com/rcrowley/go-metrics" @@ -61,14 +63,18 @@ var _ = func() *settings.StringSetting { // Server serves the /debug/* family of tools. type Server struct { - st *cluster.Settings - mux *http.ServeMux - spy logSpy + ambientCtx log.AmbientContext + st *cluster.Settings + mux *http.ServeMux + spy logSpy } // NewServer sets up a debug server. func NewServer( - st *cluster.Settings, hbaConfDebugFn http.HandlerFunc, profiler pprofui.Profiler, + ambientContext log.AmbientContext, + st *cluster.Settings, + hbaConfDebugFn http.HandlerFunc, + profiler pprofui.Profiler, ) *Server { mux := http.NewServeMux() @@ -137,9 +143,10 @@ func NewServer( }) return &Server{ - st: st, - mux: mux, - spy: spy, + ambientCtx: ambientContext, + st: st, + mux: mux, + spy: spy, } } @@ -232,6 +239,12 @@ func (ds *Server) RegisterClosedTimestampSideTransport( }) } +// RegisterTracez registers the /debug/tracez handler, which renders snapshots +// of active spans. +func (ds *Server) RegisterTracez(tr *tracing.Tracer) { + tracingui.RegisterHTTPHandlers(ds.ambientCtx, ds.mux, tr) +} + // ServeHTTP serves various tools under the /debug endpoint. func (ds *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) { handler, _ := ds.mux.Handler(r) diff --git a/pkg/server/server.go b/pkg/server/server.go index 66da809a3230..320e5221783c 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -829,7 +829,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) { } sStatus.setStmtDiagnosticsRequester(sqlServer.execCfg.StmtDiagnosticsRecorder) sStatus.baseStatusServer.sqlServer = sqlServer - debugServer := debug.NewServer(st, sqlServer.pgServer.HBADebugFn(), sStatus) + debugServer := debug.NewServer(cfg.BaseConfig.AmbientCtx, st, sqlServer.pgServer.HBADebugFn(), sStatus) node.InitLogger(sqlServer.execCfg) *lateBoundServer = Server{ @@ -1929,6 +1929,7 @@ func (s *Server) PreStart(ctx context.Context) error { return errors.Wrapf(err, "failed to register engines with debug server") } s.debug.RegisterClosedTimestampSideTransport(s.ctSender, s.node.storeCfg.ClosedTimestampReceiver) + s.debug.RegisterTracez(s.cfg.Tracer) s.ctSender.Run(ctx, state.nodeID) diff --git a/pkg/server/tenant.go b/pkg/server/tenant.go index 06cc4b4446b2..343d74984233 100644 --- a/pkg/server/tenant.go +++ b/pkg/server/tenant.go @@ -211,7 +211,7 @@ func StartTenant( ) mux := http.NewServeMux() - debugServer := debug.NewServer(args.Settings, s.pgServer.HBADebugFn(), s.execCfg.SQLStatusServer) + debugServer := debug.NewServer(baseCfg.AmbientCtx, args.Settings, s.pgServer.HBADebugFn(), s.execCfg.SQLStatusServer) mux.Handle("/", debugServer) mux.Handle("/_status/", gwMux) mux.HandleFunc("/health", func(w http.ResponseWriter, req *http.Request) { diff --git a/pkg/ui/BUILD.bazel b/pkg/ui/BUILD.bazel index 862ffdf0aed6..5921a0292aa2 100644 --- a/pkg/ui/BUILD.bazel +++ b/pkg/ui/BUILD.bazel @@ -1,8 +1,16 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("@build_bazel_rules_nodejs//:index.bzl", "nodejs_binary") go_library( name = "ui", - srcs = ["ui.go"], + srcs = [ + "tracing_ui.go", + "ui.go", + ], + embedsrcs = [ + "node_modules/list.js/dist/list.min.js", + "templates/tracing/html_template.html", + ], importpath = "github.com/cockroachdb/cockroach/pkg/ui", visibility = ["//visibility:public"], deps = [ @@ -35,3 +43,11 @@ EOF """, visibility = ["//pkg/ui:__subpackages__"], ) + +genrule( + name = "listjs", + srcs = ["@npm//:node_modules/list.js/dist/list.min.js"], + outs = ["node_modules/list.js/dist/list.min.js"], + cmd = "cp ./$(location @npm//:node_modules/list.js/dist/list.min.js) $@", + tools = ["@npm//list.js"], +) diff --git a/pkg/ui/templates/tracing/html_template.html b/pkg/ui/templates/tracing/html_template.html new file mode 100644 index 000000000000..f66b7a4d6ad0 --- /dev/null +++ b/pkg/ui/templates/tracing/html_template.html @@ -0,0 +1,140 @@ + + + + + + + + + +Take a snapshot of current operations +
+Stored snapshots (ID: capture time): +{{$id := .SnapshotID}} +{{range $i, $s := .AllSnapshots}} + + {{if eq $s.ID $id}} + [current] {{$s.ID}}: {{formatTimeNoMillis .CapturedAt}} + {{else}} + {{$s.ID}}: {{formatTimeNoMillis .CapturedAt}} + {{end}} + +{{end}} +
+ +

Spans currently open: {{len .SpansList.Spans}}. Snapshot captured at: {{formatTime .CapturedAt}} UTC. Page generated at: {{formatTime .Now}} UTC.

+{{if ne .Err nil}} +

There was an error producing this snapshot; it might be incomplete: {{.Err}}

+{{end}} + +
+ + + + + + + + + + + + +
+ + + + + +
+
+ + + diff --git a/pkg/ui/tracing_ui.go b/pkg/ui/tracing_ui.go new file mode 100644 index 000000000000..61e4edf5b9aa --- /dev/null +++ b/pkg/ui/tracing_ui.go @@ -0,0 +1,36 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package ui + +import ( + "embed" + "io/fs" +) + +// This file deals with embedding assets used by /debug/tracez. + +//go:embed node_modules/list.js/dist/list.min.js +var listJS embed.FS + +//go:embed templates/tracing/html_template.html +// SpansTableTemplateSrc contains a template used by /debug/tracez +var SpansTableTemplateSrc string + +// ListJS exposes list.js package +var ListJS fs.FS + +func init() { + f, err := fs.Sub(listJS, "node_modules/list.js/dist") + if err != nil { + panic(err) + } + ListJS = f +} diff --git a/pkg/ui/workspaces/db-console/package.json b/pkg/ui/workspaces/db-console/package.json index 1d1f0d06e2af..600d687f10b7 100644 --- a/pkg/ui/workspaces/db-console/package.json +++ b/pkg/ui/workspaces/db-console/package.json @@ -29,6 +29,7 @@ "connected-react-router": "^6.9.1", "create-react-context": "^0.3.0", "highlight.js": "^10.6.0", + "list.js": "^2.3.1", "lodash": "^4.17.21", "long": "^4.0.0", "mini-create-react-context": "^0.3.2", diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx index 4b827609d132..6bcd38d4dc89 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx @@ -330,6 +330,7 @@ export default function Debug() { + ) option was supplied. - activeSpansRegistry *spanRegistry + activeSpansRegistry *SpanRegistry + snapshotsMu struct { + syncutil.Mutex + // snapshots stores the activeSpansRegistry snapshots taken during the + // Tracer's lifetime. The ring buffer will contain snapshots with contiguous + // IDs, from the oldest one to + maxSnapshots - 1. + snapshots ring.Buffer // snapshotWithID + } testingMu syncutil.Mutex // protects testingRecordAsyncSpans testingRecordAsyncSpans bool // see TestingRecordAsyncSpans @@ -263,7 +274,7 @@ type Tracer struct { stack string } -// spanRegistry is a map that references all non-Finish'ed local root spans, +// SpanRegistry is a map that references all non-Finish'ed local root spans, // i.e. those for which no WithLocalParent() option was supplied. The // map is keyed on the span ID, which is deterministically unique. // @@ -272,30 +283,30 @@ type Tracer struct { // // The map can be introspected by `Tracer.VisitSpans`. A Span can also be // retrieved from its ID by `Tracer.GetActiveSpanByID`. -type spanRegistry struct { +type SpanRegistry struct { mu struct { syncutil.Mutex m map[tracingpb.SpanID]*crdbSpan } } -func makeSpanRegistry() *spanRegistry { - r := &spanRegistry{} +func makeSpanRegistry() *SpanRegistry { + r := &SpanRegistry{} r.mu.m = make(map[tracingpb.SpanID]*crdbSpan) return r } -func (r *spanRegistry) removeSpanLocked(id tracingpb.SpanID) { +func (r *SpanRegistry) removeSpanLocked(id tracingpb.SpanID) { delete(r.mu.m, id) } -func (r *spanRegistry) addSpan(s *crdbSpan) { +func (r *SpanRegistry) addSpan(s *crdbSpan) { r.mu.Lock() defer r.mu.Unlock() r.addSpanLocked(s) } -func (r *spanRegistry) addSpanLocked(s *crdbSpan) { +func (r *SpanRegistry) addSpanLocked(s *crdbSpan) { // Ensure that the registry does not grow unboundedly in case there is a leak. // When the registry reaches max size, each new span added kicks out an // arbitrary existing span. We rely on map iteration order here to make this @@ -310,7 +321,7 @@ func (r *spanRegistry) addSpanLocked(s *crdbSpan) { } // getSpanByID looks up a span in the registry. Returns nil if not found. -func (r *spanRegistry) getSpanByID(id tracingpb.SpanID) RegistrySpan { +func (r *SpanRegistry) getSpanByID(id tracingpb.SpanID) RegistrySpan { r.mu.Lock() defer r.mu.Unlock() crdbSpan, ok := r.mu.m[id] @@ -321,7 +332,10 @@ func (r *spanRegistry) getSpanByID(id tracingpb.SpanID) RegistrySpan { return crdbSpan } -func (r *spanRegistry) visitSpans(visitor func(span RegistrySpan) error) error { +// VisitSpans calls the visitor callback for every local root span in the +// registry. Iterations stops when the visitor returns an error. If that error +// is iterutils.StopIteration(), then VisitSpans() returns nil. +func (r *SpanRegistry) VisitSpans(visitor func(span RegistrySpan) error) error { r.mu.Lock() sl := make([]*crdbSpan, 0, len(r.mu.m)) for _, sp := range r.mu.m { @@ -343,7 +357,7 @@ func (r *spanRegistry) visitSpans(visitor func(span RegistrySpan) error) error { // testingAll returns (pointers to) all the spans in the registry, in an // arbitrary order. Since spans can generally finish at any point and use of a // finished span is not permitted, this method is only suitable for tests. -func (r *spanRegistry) testingAll() []*crdbSpan { +func (r *SpanRegistry) testingAll() []*crdbSpan { r.mu.Lock() defer r.mu.Unlock() res := make([]*crdbSpan, 0, len(r.mu.m)) @@ -358,7 +372,7 @@ func (r *spanRegistry) testingAll() []*crdbSpan { // removing the parent from the registry, the children are accessible in the // registry through that parent; if we didn't do this swap when the parent is // removed, the children would not be part of the registry anymore. -func (r *spanRegistry) swap(parentID tracingpb.SpanID, children []*crdbSpan) { +func (r *SpanRegistry) swap(parentID tracingpb.SpanID, children []*crdbSpan) { r.mu.Lock() defer r.mu.Unlock() r.removeSpanLocked(parentID) @@ -1138,7 +1152,7 @@ func (t *Tracer) GetActiveSpanByID(spanID tracingpb.SpanID) RegistrySpan { // VisitSpans invokes the visitor with all active Spans. The function will // gracefully exit if the visitor returns iterutil.StopIteration(). func (t *Tracer) VisitSpans(visitor func(span RegistrySpan) error) error { - return t.activeSpansRegistry.visitSpans(visitor) + return t.activeSpansRegistry.VisitSpans(visitor) } // TestingRecordAsyncSpans is a test-only helper that configures @@ -1179,6 +1193,11 @@ func (t *Tracer) PanicOnUseAfterFinish() bool { return t.panicOnUseAfterFinish } +// SpanRegistry exports the registry containing all currently-open spans. +func (t *Tracer) SpanRegistry() *SpanRegistry { + return t.activeSpansRegistry +} + // ForkSpan forks the current span, if any[1]. Forked spans "follow from" the // original, and are typically used to trace operations that may outlive the // parent (think async tasks). See the package-level documentation for more diff --git a/pkg/util/tracing/tracer_snapshots.go b/pkg/util/tracing/tracer_snapshots.go new file mode 100644 index 000000000000..8f4ce784202f --- /dev/null +++ b/pkg/util/tracing/tracer_snapshots.go @@ -0,0 +1,195 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package tracing + +import ( + "bufio" + "runtime" + "strconv" + "strings" + "time" + + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/errors" +) + +// SpansSnapshot represents a snapshot of all the open spans at a certain point +// in time. +type SpansSnapshot struct { + // CapturedAt is the time when the snapshot was collected. + CapturedAt time.Time + // Traces contains the collected traces. Each "local route" corresponds to one + // trace. "Detached recording" spans are included in the parent's trace. + Traces []Recording + // Stacks is a map from groutine ID to the goroutine's stack trace. All + // goroutines running at the time when this snapshot is produced are + // represented here. A goroutine referenced by a span's GoroutineID field will + // not be present if it has finished since the respective span was started. + Stacks map[int]string + // Err is set if an error was encountered when producing the snapshot. The + // snapshot will be incomplete in this case. + Err error +} + +// SnapshotID identifies a spans snapshot. The ID can be used to retrieve a +// specific snapshot from the Tracer. +type SnapshotID int +type snapshotWithID struct { + ID SnapshotID + SpansSnapshot +} + +// SaveSnapshot collects a snapshot of the currently open spans (i.e. the +// contents of the Tracer's active spans registry) saves it in-memory in the +// Tracer. The snapshot's ID is returned; the snapshot can be retrieved +// subsequently through t.GetSnapshot(id). +// +// Snapshots also include a dump of all the goroutine stack traces. +func (t *Tracer) SaveSnapshot() SnapshotID { + snap := t.generateSnapshot() + t.snapshotsMu.Lock() + defer t.snapshotsMu.Unlock() + + snapshots := &t.snapshotsMu.snapshots + + if snapshots.Len() == maxSnapshots { + snapshots.RemoveFirst() + } + var id SnapshotID + if snapshots.Len() == 0 { + id = 1 + } else { + id = snapshots.GetLast().(snapshotWithID).ID + 1 + } + snapshots.AddLast(snapshotWithID{ + ID: id, + SpansSnapshot: snap, + }) + return id +} + +var errSnapshotTooOld = errors.Newf( + "the requested snapshot is too old and has been deleted. "+ + "Only the last %d snapshots are stored.", maxSnapshots) + +var errSnapshotDoesntExist = errors.New("the requested snapshot doesn't exist") + +// GetSnapshot returns the snapshot with the given ID. If the ID is below the +// minimum stored snapshot, then the requested snapshot must have been +// garbage-collected and errSnapshotTooOld is returned. If the snapshot id is +// beyond the maximum stored ID, errSnapshotDoesntExist is returned. +// +// Note that SpansSpanshot has an Err field through which errors are returned. +// In these error cases, the snapshot will be incomplete. +func (t *Tracer) GetSnapshot(id SnapshotID) (SpansSnapshot, error) { + t.snapshotsMu.Lock() + defer t.snapshotsMu.Unlock() + snapshots := &t.snapshotsMu.snapshots + + if snapshots.Len() == 0 { + return SpansSnapshot{}, errSnapshotDoesntExist + } + minID := snapshots.GetFirst().(snapshotWithID).ID + if id < minID { + return SpansSnapshot{}, errSnapshotTooOld + } + maxID := snapshots.GetLast().(snapshotWithID).ID + if id > maxID { + return SpansSnapshot{}, errSnapshotDoesntExist + } + + return snapshots.Get(int(id - minID)).(snapshotWithID).SpansSnapshot, nil +} + +// SnapshotInfo represents minimal info about a stored snapshot, as returned by +// Tracer.GetSnapshots(). +type SnapshotInfo struct { + ID SnapshotID + CapturedAt time.Time +} + +// GetSnapshots returns info on all stored span snapshots. +func (t *Tracer) GetSnapshots() []SnapshotInfo { + t.snapshotsMu.Lock() + defer t.snapshotsMu.Unlock() + snapshots := &t.snapshotsMu.snapshots + + res := make([]SnapshotInfo, snapshots.Len()) + for i := 0; i < snapshots.Len(); i++ { + s := snapshots.Get(i).(snapshotWithID) + res[i] = SnapshotInfo{ + ID: s.ID, + CapturedAt: s.CapturedAt, + } + } + return res +} + +// generateSnapshot produces a snapshot of all the currently open spans and +// all the current goroutine stacktraces. +// +// Note that SpansSpanshot has an Err field through which errors are returned. +// In these error cases, the snapshot will be incomplete. +func (t *Tracer) generateSnapshot() SpansSnapshot { + capturedAt := timeutil.Now() + // Collect the traces. + traces := make([]Recording, 0, 1000) + _ = t.SpanRegistry().VisitSpans(func(sp RegistrySpan) error { + rec := sp.GetFullRecording(RecordingVerbose) + traces = append(traces, rec) + return nil + }) + + // Collect and parse the goroutine stack traces. + + // We don't know how big the traces are, so grow a few times if they don't + // fit. Start large, though. + var stacks []byte + for n := 1 << 20; /* 1mb */ n <= (1 << 29); /* 512mb */ n *= 2 { + stacks = make([]byte, n) + nbytes := runtime.Stack(stacks, true /* all */) + if nbytes < len(stacks) { + break + } + } + + splits := strings.Split(string(stacks), "\n\n") + stackMap := make(map[int]string, len(splits)) + var parseErr error + for _, s := range splits { + // Parse the goroutine ID. The first line of each stack is expected to look like: + // goroutine 115 [chan receive]: + scanner := bufio.NewScanner(strings.NewReader(s)) + scanner.Split(bufio.ScanWords) + // Skip the word "goroutine". + if !scanner.Scan() { + parseErr = errors.Errorf("unexpected end of string") + break + } + // Scan the goroutine ID. + if !scanner.Scan() { + parseErr = errors.Errorf("unexpected end of string") + break + } + goroutineID, err := strconv.Atoi(scanner.Text()) + if err != nil { + panic(err) + } + stackMap[goroutineID] = s + } + + return SpansSnapshot{ + CapturedAt: capturedAt, + Traces: traces, + Stacks: stackMap, + Err: parseErr, + } +} diff --git a/pkg/util/tracing/tracingpb/recorded_span.go b/pkg/util/tracing/tracingpb/recorded_span.go index b398962c941d..2bb4862a5104 100644 --- a/pkg/util/tracing/tracingpb/recorded_span.go +++ b/pkg/util/tracing/tracingpb/recorded_span.go @@ -25,6 +25,25 @@ type TraceID uint64 // SpanID is a probabilistically-unique span id. type SpanID uint64 +// Recording represents a group of RecordedSpans rooted at a fixed root span, as +// returned by GetRecording. Spans are sorted by StartTime. +type Recording []RecordedSpan + +// Less implements sort.Interface. +func (r Recording) Less(i, j int) bool { + return r[i].StartTime.Before(r[j].StartTime) +} + +// Swap implements sort.Interface. +func (r Recording) Swap(i, j int) { + r[i], r[j] = r[j], r[i] +} + +// Len implements sort.Interface. +func (r Recording) Len() int { + return len(r) +} + // LogMessageField is the field name used for the log message in a LogRecord. const LogMessageField = "event" diff --git a/pkg/util/tracing/tracingui/BUILD.bazel b/pkg/util/tracing/tracingui/BUILD.bazel new file mode 100644 index 000000000000..b225899b887e --- /dev/null +++ b/pkg/util/tracing/tracingui/BUILD.bazel @@ -0,0 +1,17 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "tracingui", + srcs = ["span_registry_ui.go"], + importpath = "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingui", + visibility = ["//visibility:public"], + deps = [ + "//pkg/ui", + "//pkg/util/log", + "//pkg/util/sysutil", + "//pkg/util/timeutil", + "//pkg/util/tracing", + "//pkg/util/tracing/tracingpb", + "@com_github_cockroachdb_errors//:errors", + ], +) diff --git a/pkg/util/tracing/tracingui/span_registry_ui.go b/pkg/util/tracing/tracingui/span_registry_ui.go new file mode 100644 index 000000000000..cbea770469a0 --- /dev/null +++ b/pkg/util/tracing/tracingui/span_registry_ui.go @@ -0,0 +1,277 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package tracingui + +import ( + "context" + "fmt" + "html/template" + "net/http" + "net/url" + "strconv" + "time" + + "github.com/cockroachdb/cockroach/pkg/ui" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/sysutil" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" + "github.com/cockroachdb/errors" +) + +// This file deals with producing the /debug/tracez page, which lists +// a snapshot of the spans in the Tracer's active spans registry. + +const dateAndtimeFormat = "2006-01-02 15:04:05.000" +const timeFormat = "15:04:05.000" + +// RegisterHTTPHandlers registers the /debug/tracez handlers, and helpers. +func RegisterHTTPHandlers(ambientCtx log.AmbientContext, mux *http.ServeMux, tr *tracing.Tracer) { + fileServer := http.StripPrefix("/debug/assets/", http.FileServer(http.FS(ui.ListJS))) + mux.HandleFunc("/debug/tracez", + func(w http.ResponseWriter, req *http.Request) { + serveHTTP(ambientCtx.AnnotateCtx(context.Background()), w, req, tr) + }) + mux.HandleFunc("/debug/show-trace", + func(w http.ResponseWriter, req *http.Request) { + serveHTTPTrace(ambientCtx.AnnotateCtx(req.Context()), w, req, tr) + }) + mux.HandleFunc("/debug/assets/list.min.js", fileServer.ServeHTTP) +} + +type pageData struct { + SnapshotID tracing.SnapshotID + Now time.Time + CapturedAt time.Time + Err error + AllSnapshots []tracing.SnapshotInfo + SpansList spansList +} + +type spansList struct { + Spans []tracingpb.RecordedSpan + // Stacks contains stack traces for the goroutines referenced by the Spans + // through their GoroutineID field. + Stacks map[int]string // GoroutineID to stack trace +} + +var spansTableTemplate *template.Template + +func init() { + spansTableTemplate = template.Must(template.New("spans-list").Funcs( + template.FuncMap{ + "formatTime": formatTime, + "formatTimeNoMillis": formatTimeNoMillis, + "since": func(t time.Time, capturedAt time.Time) string { + return fmt.Sprintf("(%s ago)", formatDuration(capturedAt.Sub(t))) + }, + "timeRaw": func(t time.Time) int64 { return t.UnixMicro() }, + }, + ).Parse(ui.SpansTableTemplateSrc)) +} + +func formatTime(t time.Time) string { + t = t.UTC() + if t.Truncate(24*time.Hour) == timeutil.Now().Truncate(24*time.Hour) { + return t.Format(timeFormat) + } + return t.Format(dateAndtimeFormat) +} + +func formatTimeNoMillis(t time.Time) string { + t = t.UTC() + if t.Truncate(24*time.Hour) == timeutil.Now().Truncate(24*time.Hour) { + const format = "15:04:05" + return t.Format(format) + } + const format = "2006-01-02 15:04:05" + return t.Format(format) +} + +// formatDuration formats a duration in one of the following formats, depending +// on its magnitude. +// 0.001s +// 1.000s +// 1m01s +// 1h05m01s +// 1d02h05m +func formatDuration(d time.Duration) string { + d = d.Round(time.Millisecond) + days := d / (24 * time.Hour) + d -= days * 24 * time.Hour + h := d / time.Hour + d -= h * time.Hour + m := d / time.Minute + d -= m * time.Minute + s := d / time.Second + d -= s * time.Second + millis := d / time.Millisecond + if days != 0 { + return fmt.Sprintf("%dd%02dh%02dm", days, h, m) + } + if h != 0 { + return fmt.Sprintf("%dh%02dm%02ds", h, m, s) + } + if m != 0 { + return fmt.Sprintf("%dm%02ds", m, s) + } + return fmt.Sprintf("%d.%03ds", s, millis) +} + +func serveHTTP(ctx context.Context, w http.ResponseWriter, r *http.Request, tr *tracing.Tracer) { + w.Header().Set("Content-Type", "text/html; charset=utf-8") + var pageErr error + if err := r.ParseForm(); err != nil { + log.Warningf(ctx, "error parsing /tracez form: %s", err.Error()) + pageErr = err + } + + snapID := r.Form.Get("snap") + var snapshot tracing.SpansSnapshot + var snapshotID tracing.SnapshotID + switch snapID { + case "new": + // Capture a new snapshot and return a redirect to that snapshot's ID. + snapshotID = tr.SaveSnapshot() + newURL, err := r.URL.Parse(fmt.Sprintf("?snap=%d", snapshotID)) + if err != nil { + pageErr = err + break + } + http.Redirect(w, r, newURL.String(), http.StatusFound) + return + case "": + default: + id, err := strconv.Atoi(snapID) + if err != nil { + pageErr = errors.Errorf("invalid snapshot ID: %s", snapID) + break + } + snapshotID = tracing.SnapshotID(id) + snapshot, err = tr.GetSnapshot(snapshotID) + if err != nil { + pageErr = err + break + } + } + + // Flatten the recordings. + spans := make([]tracingpb.RecordedSpan, 0, len(snapshot.Traces)*3) + for _, r := range snapshot.Traces { + spans = append(spans, r...) + } + + // Copy the stack traces and augment the map. + stacks := make(map[int]string, len(snapshot.Stacks)) + for k, v := range snapshot.Stacks { + stacks[k] = v + } + // Fill in messages for the goroutines for which we don't have a stack trace. + for _, s := range spans { + gid := int(s.GoroutineID) + if _, ok := stacks[gid]; !ok { + stacks[gid] = "Goroutine not found. Goroutine must have finished since the span was created." + } + } + + if pageErr != nil { + snapshot.Err = pageErr + } + err := spansTableTemplate.ExecuteTemplate(w, "spans-list", pageData{ + Now: timeutil.Now(), + CapturedAt: snapshot.CapturedAt, + SnapshotID: snapshotID, + AllSnapshots: tr.GetSnapshots(), + Err: snapshot.Err, + SpansList: spansList{ + Spans: spans, + Stacks: stacks, + }, + }) + if err != nil { + // We can get a "connection reset by peer" error if the browser requesting + // the page has gone away. + if !sysutil.IsErrConnectionReset(err) { + log.Warningf(ctx, "error executing tracez template: %s", err) + http.Error(w, err.Error(), http.StatusInternalServerError) + } + } +} + +func serveHTTPTrace( + ctx context.Context, w http.ResponseWriter, r *http.Request, tr *tracing.Tracer, +) { + w.Header().Set("Content-Type", "text/html; charset=utf-8") + var err error + if err = r.ParseForm(); err != nil { + goto Error + } + + { + var traceID tracingpb.TraceID + tid := r.Form.Get("trace") + if tid == "" { + err = errors.Errorf("trace ID missing; use ?trace=") + goto Error + } + var id int + id, err = strconv.Atoi(tid) + if err != nil { + err = errors.Errorf("invalid trace ID: %s", id) + goto Error + } + traceID = tracingpb.TraceID(id) + + var snapshotID tracing.SnapshotID + var snapshot tracing.SpansSnapshot + snapID := r.Form.Get("snap") + if snapID != "" { + var id int + id, err = strconv.Atoi(snapID) + if err != nil { + err = errors.Errorf("invalid snapshot ID: %d", snapID) + goto Error + } + snapshotID = tracing.SnapshotID(id) + snapshot, err = tr.GetSnapshot(snapshotID) + if err != nil { + goto Error + } + } else { + // If no snapshot is specified, we'll take a new one now and redirect to + // it. + snapshotID = tr.SaveSnapshot() + var newURL *url.URL + newURL, err = r.URL.Parse(fmt.Sprintf("?trace=%d&snap=%d", traceID, snapshotID)) + if err != nil { + goto Error + } + http.Redirect(w, r, newURL.String(), http.StatusFound) + return + } + + for _, r := range snapshot.Traces { + if r[0].TraceID == traceID { + _, err = w.Write([]byte("
" + r.String() + "\n
")) + if err != nil { + goto Error + } + return + } + } + err = errors.Errorf("trace %d not found in snapshot", traceID) + goto Error + } + +Error: + http.Error(w, err.Error(), http.StatusInternalServerError) +}