From fc778d97bc8407cb59bf7f4fb83f23593f4b09dd Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Wed, 29 Dec 2021 18:18:32 -0500 Subject: [PATCH 1/3] tracing: correct goroutine IDs for async tasks Spans capture the ID of the goroutine that created them, and expose it in instrumentation. This patch makes the goroutine ID mutable, and teaches the Stopper to update it for async tasks. This way, the spans for the async tasks will have the correct goroutine ID - the task's goroutine. Release note: None --- pkg/util/stop/stopper.go | 6 ++++++ pkg/util/tracing/crdbspan.go | 14 ++++++++++++-- pkg/util/tracing/span.go | 11 +++++++++++ pkg/util/tracing/tracer.go | 17 ++++++++--------- 4 files changed, 37 insertions(+), 11 deletions(-) diff --git a/pkg/util/stop/stopper.go b/pkg/util/stop/stopper.go index ab30e9308d2f..bc0a7ae660b1 100644 --- a/pkg/util/stop/stopper.go +++ b/pkg/util/stop/stopper.go @@ -461,6 +461,11 @@ func (s *Stopper) RunAsyncTaskEx(ctx context.Context, opt TaskOpts, f func(conte } // If the caller has a span, the task gets a child span. + // + // Note that we have to create the child in this parent goroutine; we can't + // defer the creation to the spawned async goroutine since the parent span + // might get Finish()ed by then. However, we'll update the child'd goroutine + // ID. var sp *tracing.Span switch opt.SpanOpt { case FollowsFromSpan: @@ -480,6 +485,7 @@ func (s *Stopper) RunAsyncTaskEx(ctx context.Context, opt TaskOpts, f func(conte defer s.runPostlude() if sp != nil { defer sp.Finish() + sp.UpdateGoroutineIDToCurrent() } if alloc != nil { defer alloc.Release() diff --git a/pkg/util/tracing/crdbspan.go b/pkg/util/tracing/crdbspan.go index 5cb5c93ab98f..40e096255635 100644 --- a/pkg/util/tracing/crdbspan.go +++ b/pkg/util/tracing/crdbspan.go @@ -35,7 +35,6 @@ type crdbSpan struct { traceID tracingpb.TraceID // probabilistically unique spanID tracingpb.SpanID // probabilistically unique parentSpanID tracingpb.SpanID - goroutineID uint64 operation string // name of operation associated with the span startTime time.Time @@ -66,6 +65,10 @@ type childRef struct { type crdbSpanMu struct { syncutil.Mutex + // goroutineID is the ID of the goroutine that created this span, or the goroutine that + // subsequently adopted it through Span.UpdateGoroutineIDToCurrent()). + goroutineID uint64 + // parent is the span's local parent, if any. parent is not set if the span is // a root or the parent span is remote. // @@ -527,7 +530,7 @@ func (s *crdbSpan) getRecordingNoChildrenLocked( TraceID: s.traceID, SpanID: s.spanID, ParentSpanID: s.parentSpanID, - GoroutineID: s.goroutineID, + GoroutineID: s.mu.goroutineID, Operation: s.operation, StartTime: s.startTime, Duration: s.mu.duration, @@ -727,6 +730,13 @@ func (s *crdbSpan) withLock(f func()) { f() } +// setGoroutineID updates the span's goroutine ID. +func (s *crdbSpan) setGoroutineID(gid int64) { + s.mu.Lock() + defer s.mu.Unlock() + s.mu.goroutineID = uint64(gid) +} + var sortPool = sync.Pool{ New: func() interface{} { return &Recording{} diff --git a/pkg/util/tracing/span.go b/pkg/util/tracing/span.go index c2c738274f92..70578a035207 100644 --- a/pkg/util/tracing/span.go +++ b/pkg/util/tracing/span.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" + "github.com/petermattis/goid" "go.opentelemetry.io/otel/attribute" oteltrace "go.opentelemetry.io/otel/trace" ) @@ -288,6 +289,16 @@ func (sp *Span) IsSterile() bool { return sp.i.sterile } +// UpdateGoroutineIDToCurrent updates the span's goroutine ID to the current +// goroutine. This should be called when a different goroutine takes ownership +// of a span. +func (sp *Span) UpdateGoroutineIDToCurrent() { + if sp.detectUseAfterFinish() { + return + } + sp.i.crdb.setGoroutineID(goid.Get()) +} + // SpanMeta is information about a Span that is not local to this // process. Typically, SpanMeta is populated from information // about a Span on the other end of an RPC, and is used to derive diff --git a/pkg/util/tracing/tracer.go b/pkg/util/tracing/tracer.go index 1bc4d6aec6ff..5634f71f7159 100644 --- a/pkg/util/tracing/tracer.go +++ b/pkg/util/tracing/tracer.go @@ -857,7 +857,6 @@ child operation: %s, tracer created at: traceID = tracingpb.TraceID(randutil.FastInt63()) } spanID := tracingpb.SpanID(randutil.FastInt63()) - goroutineID := uint64(goid.Get()) // Now allocate the main *Span and contained crdbSpan. // Allocate these together to save on individual allocs. @@ -876,15 +875,15 @@ child operation: %s, tracer created at: }{} helper.crdbSpan = crdbSpan{ - tracer: t, - traceID: traceID, - spanID: spanID, - goroutineID: goroutineID, - startTime: startTime, - logTags: opts.LogTags, + tracer: t, + traceID: traceID, + spanID: spanID, + startTime: startTime, + logTags: opts.LogTags, mu: crdbSpanMu{ - duration: -1, // unfinished - tags: helper.tagsAlloc[:0], + duration: -1, // unfinished + tags: helper.tagsAlloc[:0], + goroutineID: uint64(goid.Get()), }, } helper.crdbSpan.operation = opName From b2393673f39f00c7d0cc92395bcf5143b9924321 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Tue, 21 Dec 2021 19:32:52 -0500 Subject: [PATCH 2/3] tracing: add /debug/tracez rendering the active spans /debug/tracez lets users take a snapshot of the active spans registry and render the new snapshot, or one of the previously taken snapshots. The Tracer can hold up to 10 snapshots in memory. The PR description has a screenshot. When visualizing a snapshot, the page lets you do a number of things: 1. List all the spans. 2. See the (current) stack trace for each span's goroutine (if the goroutine was still running at the time when the snapshot was captured). Stack traces can be toggled visible/hidden. 3. Sort the spans by name or start time. 4. Filter the span according to text search. The search works across the name and stack trace. 5. See the full trace that a particular span is part of. For the table Javascript providing sorting and filtering, this patch embeds the library from https://listjs.com/ . Limitations: - for now, only the registry of the local node is snapshotted. In the fuiture I'll collect info from all nodes. - for now, the relationships between different spans are not represented in any way. I'll work on the ability to go from a span to the whole trace that the span is part of. - for now, tags and structured and unstructured log messages that a span might have are not displayed in any way. At the moment, span creation is not enabled in production by default (i.e. the Tracer is put in TracingModeOnDemand by default, instead of the required TracingModeActiveSpansRegistry). This patch does not change that, so in order to benefit from /debug/tracez in all its glory, one has to run with COCKROACH_REAL_SPANS=1 for now. Not for long, though. Release note: None --- Makefile | 4 +- pkg/server/debug/BUILD.bazel | 2 + pkg/server/debug/server.go | 27 +- pkg/server/server.go | 3 +- pkg/server/tenant.go | 2 +- pkg/ui/.gitignore | 1 + pkg/ui/BUILD.bazel | 18 +- pkg/ui/templates/tracing/html_template.html | 140 +++++++++ pkg/ui/tracing_ui.go | 36 +++ pkg/ui/workspaces/db-console/package.json | 1 + .../views/reports/containers/debug/index.tsx | 1 + pkg/ui/workspaces/db-console/webpack.app.js | 9 + pkg/ui/yarn-vendor | 2 +- pkg/ui/yarn.lock | 12 + pkg/util/tracing/BUILD.bazel | 1 + pkg/util/tracing/tracer.go | 45 ++- pkg/util/tracing/tracer_snapshots.go | 195 ++++++++++++ pkg/util/tracing/tracingpb/recorded_span.go | 19 ++ pkg/util/tracing/tracingui/BUILD.bazel | 17 ++ .../tracing/tracingui/span_registry_ui.go | 277 ++++++++++++++++++ 20 files changed, 786 insertions(+), 26 deletions(-) create mode 100644 pkg/ui/templates/tracing/html_template.html create mode 100644 pkg/ui/tracing_ui.go create mode 100644 pkg/util/tracing/tracer_snapshots.go create mode 100644 pkg/util/tracing/tracingui/BUILD.bazel create mode 100644 pkg/util/tracing/tracingui/span_registry_ui.go diff --git a/Makefile b/Makefile index e7ef845c988d..96cede686950 100644 --- a/Makefile +++ b/Makefile @@ -1211,7 +1211,7 @@ ARCHIVE_EXTRAS = \ $(BUILDINFO) \ $(SQLPARSER_TARGETS) \ $(OPTGEN_TARGETS) \ - pkg/ui/assets.ccl.installed pkg/ui/assets.oss.installed + pkg/ui/distccl/assets pkg/ui/distoss/assets pkg/ui/dist_vendor # TODO(benesch): Make this recipe use `git ls-files --recurse-submodules` # instead of scripts/ls-files.sh once Git v2.11 is widely deployed. @@ -1442,7 +1442,7 @@ ui-watch ui-watch-secure: $(UI_CCL_DLLS) pkg/ui/yarn.opt.installed ui-clean: ## Remove build artifacts. find pkg/ui/distccl/assets pkg/ui/distoss/assets -mindepth 1 -not -name .gitkeep -delete rm -rf pkg/ui/assets.ccl.installed pkg/ui/assets.oss.installed - rm -rf pkg/ui/dist/* + rm -rf pkg/ui/dist_vendor/* rm -f $(UI_PROTOS_CCL) $(UI_PROTOS_OSS) rm -f pkg/ui/workspaces/db-console/*manifest.json rm -rf pkg/ui/workspaces/cluster-ui/dist diff --git a/pkg/server/debug/BUILD.bazel b/pkg/server/debug/BUILD.bazel index 6f731cd5e7a5..38120156700d 100644 --- a/pkg/server/debug/BUILD.bazel +++ b/pkg/server/debug/BUILD.bazel @@ -29,6 +29,8 @@ go_library( "//pkg/util/log/severity", "//pkg/util/stop", "//pkg/util/timeutil", + "//pkg/util/tracing", + "//pkg/util/tracing/tracingui", "//pkg/util/uint128", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_pebble//tool", diff --git a/pkg/server/debug/server.go b/pkg/server/debug/server.go index 788b99b47b3c..15cdef61d07d 100644 --- a/pkg/server/debug/server.go +++ b/pkg/server/debug/server.go @@ -32,6 +32,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingui" "github.com/cockroachdb/errors" pebbletool "github.com/cockroachdb/pebble/tool" metrics "github.com/rcrowley/go-metrics" @@ -61,14 +63,18 @@ var _ = func() *settings.StringSetting { // Server serves the /debug/* family of tools. type Server struct { - st *cluster.Settings - mux *http.ServeMux - spy logSpy + ambientCtx log.AmbientContext + st *cluster.Settings + mux *http.ServeMux + spy logSpy } // NewServer sets up a debug server. func NewServer( - st *cluster.Settings, hbaConfDebugFn http.HandlerFunc, profiler pprofui.Profiler, + ambientContext log.AmbientContext, + st *cluster.Settings, + hbaConfDebugFn http.HandlerFunc, + profiler pprofui.Profiler, ) *Server { mux := http.NewServeMux() @@ -137,9 +143,10 @@ func NewServer( }) return &Server{ - st: st, - mux: mux, - spy: spy, + ambientCtx: ambientContext, + st: st, + mux: mux, + spy: spy, } } @@ -232,6 +239,12 @@ func (ds *Server) RegisterClosedTimestampSideTransport( }) } +// RegisterTracez registers the /debug/tracez handler, which renders snapshots +// of active spans. +func (ds *Server) RegisterTracez(tr *tracing.Tracer) { + tracingui.RegisterHTTPHandlers(ds.ambientCtx, ds.mux, tr) +} + // ServeHTTP serves various tools under the /debug endpoint. func (ds *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) { handler, _ := ds.mux.Handler(r) diff --git a/pkg/server/server.go b/pkg/server/server.go index b66783293f9c..4203ed2fa1d7 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -834,7 +834,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) { } sStatus.setStmtDiagnosticsRequester(sqlServer.execCfg.StmtDiagnosticsRecorder) sStatus.baseStatusServer.sqlServer = sqlServer - debugServer := debug.NewServer(st, sqlServer.pgServer.HBADebugFn(), sStatus) + debugServer := debug.NewServer(cfg.BaseConfig.AmbientCtx, st, sqlServer.pgServer.HBADebugFn(), sStatus) node.InitLogger(sqlServer.execCfg) *lateBoundServer = Server{ @@ -1958,6 +1958,7 @@ func (s *Server) PreStart(ctx context.Context) error { return errors.Wrapf(err, "failed to register engines with debug server") } s.debug.RegisterClosedTimestampSideTransport(s.ctSender, s.node.storeCfg.ClosedTimestampReceiver) + s.debug.RegisterTracez(s.cfg.Tracer) s.ctSender.Run(ctx, state.nodeID) diff --git a/pkg/server/tenant.go b/pkg/server/tenant.go index c74b18a3913b..bd7c9a2809bc 100644 --- a/pkg/server/tenant.go +++ b/pkg/server/tenant.go @@ -211,7 +211,7 @@ func StartTenant( ) mux := http.NewServeMux() - debugServer := debug.NewServer(args.Settings, s.pgServer.HBADebugFn(), s.execCfg.SQLStatusServer) + debugServer := debug.NewServer(baseCfg.AmbientCtx, args.Settings, s.pgServer.HBADebugFn(), s.execCfg.SQLStatusServer) mux.Handle("/", debugServer) mux.Handle("/_status/", gwMux) mux.HandleFunc("/health", func(w http.ResponseWriter, req *http.Request) { diff --git a/pkg/ui/.gitignore b/pkg/ui/.gitignore index 0d5500eded18..a583d996bffd 100644 --- a/pkg/ui/.gitignore +++ b/pkg/ui/.gitignore @@ -14,6 +14,7 @@ assets.oss.installed # Generated intermediates .cache-loader dist*/** +dist_vendor !dist*/_empty_assets/assets/index.html !distccl/assets/.gitkeep !distccl/distccl.go diff --git a/pkg/ui/BUILD.bazel b/pkg/ui/BUILD.bazel index 862ffdf0aed6..144d5d3725a3 100644 --- a/pkg/ui/BUILD.bazel +++ b/pkg/ui/BUILD.bazel @@ -1,8 +1,16 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("@build_bazel_rules_nodejs//:index.bzl", "nodejs_binary") go_library( name = "ui", - srcs = ["ui.go"], + srcs = [ + "tracing_ui.go", + "ui.go", + ], + embedsrcs = [ + "dist_vendor/list.min.js", + "templates/tracing/html_template.html", + ], importpath = "github.com/cockroachdb/cockroach/pkg/ui", visibility = ["//visibility:public"], deps = [ @@ -35,3 +43,11 @@ EOF """, visibility = ["//pkg/ui:__subpackages__"], ) + +genrule( + name = "listjs", + srcs = ["@npm//:node_modules/list.js/dist/list.min.js"], + outs = ["dist_vendor/list.min.js"], + cmd = "cp ./$(location @npm//:node_modules/list.js/dist/list.min.js) $@", + tools = ["@npm//list.js"], +) diff --git a/pkg/ui/templates/tracing/html_template.html b/pkg/ui/templates/tracing/html_template.html new file mode 100644 index 000000000000..f66b7a4d6ad0 --- /dev/null +++ b/pkg/ui/templates/tracing/html_template.html @@ -0,0 +1,140 @@ + + + + + + + + + +Take a snapshot of current operations +
+Stored snapshots (ID: capture time): +{{$id := .SnapshotID}} +{{range $i, $s := .AllSnapshots}} + + {{if eq $s.ID $id}} + [current] {{$s.ID}}: {{formatTimeNoMillis .CapturedAt}} + {{else}} + {{$s.ID}}: {{formatTimeNoMillis .CapturedAt}} + {{end}} + +{{end}} +
+ +

Spans currently open: {{len .SpansList.Spans}}. Snapshot captured at: {{formatTime .CapturedAt}} UTC. Page generated at: {{formatTime .Now}} UTC.

+{{if ne .Err nil}} +

There was an error producing this snapshot; it might be incomplete: {{.Err}}

+{{end}} + +
+ + + + + + + + + + + + +
+ + + + + +
+
+ + + diff --git a/pkg/ui/tracing_ui.go b/pkg/ui/tracing_ui.go new file mode 100644 index 000000000000..000ee4f99ce7 --- /dev/null +++ b/pkg/ui/tracing_ui.go @@ -0,0 +1,36 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package ui + +import ( + "embed" + "io/fs" +) + +// This file deals with embedding assets used by /debug/tracez. + +//go:embed dist_vendor/list.min.js +var listJS embed.FS + +//go:embed templates/tracing/html_template.html +// SpansTableTemplateSrc contains a template used by /debug/tracez +var SpansTableTemplateSrc string + +// ListJS exposes list.js package +var ListJS fs.FS + +func init() { + f, err := fs.Sub(listJS, "dist_vendor") + if err != nil { + panic(err) + } + ListJS = f +} diff --git a/pkg/ui/workspaces/db-console/package.json b/pkg/ui/workspaces/db-console/package.json index 1d1f0d06e2af..600d687f10b7 100644 --- a/pkg/ui/workspaces/db-console/package.json +++ b/pkg/ui/workspaces/db-console/package.json @@ -29,6 +29,7 @@ "connected-react-router": "^6.9.1", "create-react-context": "^0.3.0", "highlight.js": "^10.6.0", + "list.js": "^2.3.1", "lodash": "^4.17.21", "long": "^4.0.0", "mini-create-react-context": "^0.3.2", diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx index 4b827609d132..6bcd38d4dc89 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx @@ -330,6 +330,7 @@ export default function Debug() { + { new CopyWebpackPlugin([ { from: path.resolve(__dirname, "favicon.ico"), to: "favicon.ico" }, ]), + new CopyWebpackPlugin([ + { + from: path.resolve( + __dirname, + "../../node_modules/list.js/dist/list.min.js", + ), + to: path.resolve(__dirname, "../../dist_vendor/list.min.js"), + }, + ]), // use WebpackBar instead of webpack dashboard to fit multiple webpack dev server outputs (db-console and cluster-ui) new WebpackBar({ name: "db-console", diff --git a/pkg/ui/yarn-vendor b/pkg/ui/yarn-vendor index b7cef07d6748..dfceabccc717 160000 --- a/pkg/ui/yarn-vendor +++ b/pkg/ui/yarn-vendor @@ -1 +1 @@ -Subproject commit b7cef07d6748b5af824fb1c1af90cc17f3009c56 +Subproject commit dfceabccc717aff312f17423089652e9e697355b diff --git a/pkg/ui/yarn.lock b/pkg/ui/yarn.lock index 0295930e2b57..c0198cdf4cb7 100644 --- a/pkg/ui/yarn.lock +++ b/pkg/ui/yarn.lock @@ -11461,6 +11461,13 @@ linkify-it@^2.0.0: dependencies: uc.micro "^1.0.1" +list.js@^2.3.1: + version "2.3.1" + resolved "https://registry.yarnpkg.com/list.js/-/list.js-2.3.1.tgz#48961989ffe52b0505e352f7a521f819f51df7e7" + integrity sha512-jnmm7DYpKtH3DxtO1E2VNCC9Gp7Wrp/FWA2JxQrZUhVJ2RCQBd57pCN6W5w6jpsfWZV0PCAbTX2NOPgyFeeZZg== + dependencies: + string-natural-compare "^2.0.2" + listr-silent-renderer@^1.1.1: version "1.1.1" resolved "https://registry.yarnpkg.com/listr-silent-renderer/-/listr-silent-renderer-1.1.1.tgz#924b5a3757153770bf1a8e3fbf74b8bbf3f9242e" @@ -16399,6 +16406,11 @@ string-length@^3.1.0: astral-regex "^1.0.0" strip-ansi "^5.2.0" +string-natural-compare@^2.0.2: + version "2.0.3" + resolved "https://registry.yarnpkg.com/string-natural-compare/-/string-natural-compare-2.0.3.tgz#9dbe1dd65490a5fe14f7a5c9bc686fc67cb9c6e4" + integrity sha512-4Kcl12rNjc+6EKhY8QyDVuQTAlMWwRiNbsxnVwBUKFr7dYPQuXVrtNU4sEkjF9LHY0AY6uVbB3ktbkIH4LC+BQ== + string-replace-webpack-plugin@^0.1.3: version "0.1.3" resolved "https://registry.yarnpkg.com/string-replace-webpack-plugin/-/string-replace-webpack-plugin-0.1.3.tgz#73c657e759d66cfe80ae1e0cf091aa256d0e715c" diff --git a/pkg/util/tracing/BUILD.bazel b/pkg/util/tracing/BUILD.bazel index fa4c8a23bc83..3210ab249ec2 100644 --- a/pkg/util/tracing/BUILD.bazel +++ b/pkg/util/tracing/BUILD.bazel @@ -14,6 +14,7 @@ go_library( "tags.go", "test_utils.go", "tracer.go", + "tracer_snapshots.go", "utils.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/util/tracing", diff --git a/pkg/util/tracing/tracer.go b/pkg/util/tracing/tracer.go index 5634f71f7159..91021a7ad676 100644 --- a/pkg/util/tracing/tracer.go +++ b/pkg/util/tracing/tracer.go @@ -28,6 +28,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/iterutil" "github.com/cockroachdb/cockroach/pkg/util/netutil/addr" "github.com/cockroachdb/cockroach/pkg/util/randutil" + "github.com/cockroachdb/cockroach/pkg/util/ring" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" @@ -59,6 +60,9 @@ const ( // maxLogsPerSpanExternal limits the number of logs in a Span for external // tracers (net/trace, OpenTelemetry); use a comfortable limit. maxLogsPerSpanExternal = 1000 + // maxSnapshots limits the number of snapshots that a Tracer will hold in + // memory. Beyond this limit, each new snapshot evicts the oldest one. + maxSnapshots = 10 ) // These constants are used to form keys to represent tracing context @@ -239,7 +243,14 @@ type Tracer struct { // activeSpans is a map that references all non-Finish'ed local root spans, // i.e. those for which no WithParent() option was supplied. - activeSpansRegistry *spanRegistry + activeSpansRegistry *SpanRegistry + snapshotsMu struct { + syncutil.Mutex + // snapshots stores the activeSpansRegistry snapshots taken during the + // Tracer's lifetime. The ring buffer will contain snapshots with contiguous + // IDs, from the oldest one to + maxSnapshots - 1. + snapshots ring.Buffer // snapshotWithID + } testingMu syncutil.Mutex // protects testingRecordAsyncSpans testingRecordAsyncSpans bool // see TestingRecordAsyncSpans @@ -263,7 +274,7 @@ type Tracer struct { stack string } -// spanRegistry is a map that references all non-Finish'ed local root spans, +// SpanRegistry is a map that references all non-Finish'ed local root spans, // i.e. those for which no WithLocalParent() option was supplied. The // map is keyed on the span ID, which is deterministically unique. // @@ -272,30 +283,30 @@ type Tracer struct { // // The map can be introspected by `Tracer.VisitSpans`. A Span can also be // retrieved from its ID by `Tracer.GetActiveSpanByID`. -type spanRegistry struct { +type SpanRegistry struct { mu struct { syncutil.Mutex m map[tracingpb.SpanID]*crdbSpan } } -func makeSpanRegistry() *spanRegistry { - r := &spanRegistry{} +func makeSpanRegistry() *SpanRegistry { + r := &SpanRegistry{} r.mu.m = make(map[tracingpb.SpanID]*crdbSpan) return r } -func (r *spanRegistry) removeSpanLocked(id tracingpb.SpanID) { +func (r *SpanRegistry) removeSpanLocked(id tracingpb.SpanID) { delete(r.mu.m, id) } -func (r *spanRegistry) addSpan(s *crdbSpan) { +func (r *SpanRegistry) addSpan(s *crdbSpan) { r.mu.Lock() defer r.mu.Unlock() r.addSpanLocked(s) } -func (r *spanRegistry) addSpanLocked(s *crdbSpan) { +func (r *SpanRegistry) addSpanLocked(s *crdbSpan) { // Ensure that the registry does not grow unboundedly in case there is a leak. // When the registry reaches max size, each new span added kicks out an // arbitrary existing span. We rely on map iteration order here to make this @@ -310,7 +321,7 @@ func (r *spanRegistry) addSpanLocked(s *crdbSpan) { } // getSpanByID looks up a span in the registry. Returns nil if not found. -func (r *spanRegistry) getSpanByID(id tracingpb.SpanID) RegistrySpan { +func (r *SpanRegistry) getSpanByID(id tracingpb.SpanID) RegistrySpan { r.mu.Lock() defer r.mu.Unlock() crdbSpan, ok := r.mu.m[id] @@ -321,7 +332,10 @@ func (r *spanRegistry) getSpanByID(id tracingpb.SpanID) RegistrySpan { return crdbSpan } -func (r *spanRegistry) visitSpans(visitor func(span RegistrySpan) error) error { +// VisitSpans calls the visitor callback for every local root span in the +// registry. Iterations stops when the visitor returns an error. If that error +// is iterutils.StopIteration(), then VisitSpans() returns nil. +func (r *SpanRegistry) VisitSpans(visitor func(span RegistrySpan) error) error { r.mu.Lock() sl := make([]*crdbSpan, 0, len(r.mu.m)) for _, sp := range r.mu.m { @@ -343,7 +357,7 @@ func (r *spanRegistry) visitSpans(visitor func(span RegistrySpan) error) error { // testingAll returns (pointers to) all the spans in the registry, in an // arbitrary order. Since spans can generally finish at any point and use of a // finished span is not permitted, this method is only suitable for tests. -func (r *spanRegistry) testingAll() []*crdbSpan { +func (r *SpanRegistry) testingAll() []*crdbSpan { r.mu.Lock() defer r.mu.Unlock() res := make([]*crdbSpan, 0, len(r.mu.m)) @@ -358,7 +372,7 @@ func (r *spanRegistry) testingAll() []*crdbSpan { // removing the parent from the registry, the children are accessible in the // registry through that parent; if we didn't do this swap when the parent is // removed, the children would not be part of the registry anymore. -func (r *spanRegistry) swap(parentID tracingpb.SpanID, children []*crdbSpan) { +func (r *SpanRegistry) swap(parentID tracingpb.SpanID, children []*crdbSpan) { r.mu.Lock() defer r.mu.Unlock() r.removeSpanLocked(parentID) @@ -1138,7 +1152,7 @@ func (t *Tracer) GetActiveSpanByID(spanID tracingpb.SpanID) RegistrySpan { // VisitSpans invokes the visitor with all active Spans. The function will // gracefully exit if the visitor returns iterutil.StopIteration(). func (t *Tracer) VisitSpans(visitor func(span RegistrySpan) error) error { - return t.activeSpansRegistry.visitSpans(visitor) + return t.activeSpansRegistry.VisitSpans(visitor) } // TestingRecordAsyncSpans is a test-only helper that configures @@ -1179,6 +1193,11 @@ func (t *Tracer) PanicOnUseAfterFinish() bool { return t.panicOnUseAfterFinish } +// SpanRegistry exports the registry containing all currently-open spans. +func (t *Tracer) SpanRegistry() *SpanRegistry { + return t.activeSpansRegistry +} + // ForkSpan forks the current span, if any[1]. Forked spans "follow from" the // original, and are typically used to trace operations that may outlive the // parent (think async tasks). See the package-level documentation for more diff --git a/pkg/util/tracing/tracer_snapshots.go b/pkg/util/tracing/tracer_snapshots.go new file mode 100644 index 000000000000..8f4ce784202f --- /dev/null +++ b/pkg/util/tracing/tracer_snapshots.go @@ -0,0 +1,195 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package tracing + +import ( + "bufio" + "runtime" + "strconv" + "strings" + "time" + + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/errors" +) + +// SpansSnapshot represents a snapshot of all the open spans at a certain point +// in time. +type SpansSnapshot struct { + // CapturedAt is the time when the snapshot was collected. + CapturedAt time.Time + // Traces contains the collected traces. Each "local route" corresponds to one + // trace. "Detached recording" spans are included in the parent's trace. + Traces []Recording + // Stacks is a map from groutine ID to the goroutine's stack trace. All + // goroutines running at the time when this snapshot is produced are + // represented here. A goroutine referenced by a span's GoroutineID field will + // not be present if it has finished since the respective span was started. + Stacks map[int]string + // Err is set if an error was encountered when producing the snapshot. The + // snapshot will be incomplete in this case. + Err error +} + +// SnapshotID identifies a spans snapshot. The ID can be used to retrieve a +// specific snapshot from the Tracer. +type SnapshotID int +type snapshotWithID struct { + ID SnapshotID + SpansSnapshot +} + +// SaveSnapshot collects a snapshot of the currently open spans (i.e. the +// contents of the Tracer's active spans registry) saves it in-memory in the +// Tracer. The snapshot's ID is returned; the snapshot can be retrieved +// subsequently through t.GetSnapshot(id). +// +// Snapshots also include a dump of all the goroutine stack traces. +func (t *Tracer) SaveSnapshot() SnapshotID { + snap := t.generateSnapshot() + t.snapshotsMu.Lock() + defer t.snapshotsMu.Unlock() + + snapshots := &t.snapshotsMu.snapshots + + if snapshots.Len() == maxSnapshots { + snapshots.RemoveFirst() + } + var id SnapshotID + if snapshots.Len() == 0 { + id = 1 + } else { + id = snapshots.GetLast().(snapshotWithID).ID + 1 + } + snapshots.AddLast(snapshotWithID{ + ID: id, + SpansSnapshot: snap, + }) + return id +} + +var errSnapshotTooOld = errors.Newf( + "the requested snapshot is too old and has been deleted. "+ + "Only the last %d snapshots are stored.", maxSnapshots) + +var errSnapshotDoesntExist = errors.New("the requested snapshot doesn't exist") + +// GetSnapshot returns the snapshot with the given ID. If the ID is below the +// minimum stored snapshot, then the requested snapshot must have been +// garbage-collected and errSnapshotTooOld is returned. If the snapshot id is +// beyond the maximum stored ID, errSnapshotDoesntExist is returned. +// +// Note that SpansSpanshot has an Err field through which errors are returned. +// In these error cases, the snapshot will be incomplete. +func (t *Tracer) GetSnapshot(id SnapshotID) (SpansSnapshot, error) { + t.snapshotsMu.Lock() + defer t.snapshotsMu.Unlock() + snapshots := &t.snapshotsMu.snapshots + + if snapshots.Len() == 0 { + return SpansSnapshot{}, errSnapshotDoesntExist + } + minID := snapshots.GetFirst().(snapshotWithID).ID + if id < minID { + return SpansSnapshot{}, errSnapshotTooOld + } + maxID := snapshots.GetLast().(snapshotWithID).ID + if id > maxID { + return SpansSnapshot{}, errSnapshotDoesntExist + } + + return snapshots.Get(int(id - minID)).(snapshotWithID).SpansSnapshot, nil +} + +// SnapshotInfo represents minimal info about a stored snapshot, as returned by +// Tracer.GetSnapshots(). +type SnapshotInfo struct { + ID SnapshotID + CapturedAt time.Time +} + +// GetSnapshots returns info on all stored span snapshots. +func (t *Tracer) GetSnapshots() []SnapshotInfo { + t.snapshotsMu.Lock() + defer t.snapshotsMu.Unlock() + snapshots := &t.snapshotsMu.snapshots + + res := make([]SnapshotInfo, snapshots.Len()) + for i := 0; i < snapshots.Len(); i++ { + s := snapshots.Get(i).(snapshotWithID) + res[i] = SnapshotInfo{ + ID: s.ID, + CapturedAt: s.CapturedAt, + } + } + return res +} + +// generateSnapshot produces a snapshot of all the currently open spans and +// all the current goroutine stacktraces. +// +// Note that SpansSpanshot has an Err field through which errors are returned. +// In these error cases, the snapshot will be incomplete. +func (t *Tracer) generateSnapshot() SpansSnapshot { + capturedAt := timeutil.Now() + // Collect the traces. + traces := make([]Recording, 0, 1000) + _ = t.SpanRegistry().VisitSpans(func(sp RegistrySpan) error { + rec := sp.GetFullRecording(RecordingVerbose) + traces = append(traces, rec) + return nil + }) + + // Collect and parse the goroutine stack traces. + + // We don't know how big the traces are, so grow a few times if they don't + // fit. Start large, though. + var stacks []byte + for n := 1 << 20; /* 1mb */ n <= (1 << 29); /* 512mb */ n *= 2 { + stacks = make([]byte, n) + nbytes := runtime.Stack(stacks, true /* all */) + if nbytes < len(stacks) { + break + } + } + + splits := strings.Split(string(stacks), "\n\n") + stackMap := make(map[int]string, len(splits)) + var parseErr error + for _, s := range splits { + // Parse the goroutine ID. The first line of each stack is expected to look like: + // goroutine 115 [chan receive]: + scanner := bufio.NewScanner(strings.NewReader(s)) + scanner.Split(bufio.ScanWords) + // Skip the word "goroutine". + if !scanner.Scan() { + parseErr = errors.Errorf("unexpected end of string") + break + } + // Scan the goroutine ID. + if !scanner.Scan() { + parseErr = errors.Errorf("unexpected end of string") + break + } + goroutineID, err := strconv.Atoi(scanner.Text()) + if err != nil { + panic(err) + } + stackMap[goroutineID] = s + } + + return SpansSnapshot{ + CapturedAt: capturedAt, + Traces: traces, + Stacks: stackMap, + Err: parseErr, + } +} diff --git a/pkg/util/tracing/tracingpb/recorded_span.go b/pkg/util/tracing/tracingpb/recorded_span.go index b398962c941d..2bb4862a5104 100644 --- a/pkg/util/tracing/tracingpb/recorded_span.go +++ b/pkg/util/tracing/tracingpb/recorded_span.go @@ -25,6 +25,25 @@ type TraceID uint64 // SpanID is a probabilistically-unique span id. type SpanID uint64 +// Recording represents a group of RecordedSpans rooted at a fixed root span, as +// returned by GetRecording. Spans are sorted by StartTime. +type Recording []RecordedSpan + +// Less implements sort.Interface. +func (r Recording) Less(i, j int) bool { + return r[i].StartTime.Before(r[j].StartTime) +} + +// Swap implements sort.Interface. +func (r Recording) Swap(i, j int) { + r[i], r[j] = r[j], r[i] +} + +// Len implements sort.Interface. +func (r Recording) Len() int { + return len(r) +} + // LogMessageField is the field name used for the log message in a LogRecord. const LogMessageField = "event" diff --git a/pkg/util/tracing/tracingui/BUILD.bazel b/pkg/util/tracing/tracingui/BUILD.bazel new file mode 100644 index 000000000000..b225899b887e --- /dev/null +++ b/pkg/util/tracing/tracingui/BUILD.bazel @@ -0,0 +1,17 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "tracingui", + srcs = ["span_registry_ui.go"], + importpath = "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingui", + visibility = ["//visibility:public"], + deps = [ + "//pkg/ui", + "//pkg/util/log", + "//pkg/util/sysutil", + "//pkg/util/timeutil", + "//pkg/util/tracing", + "//pkg/util/tracing/tracingpb", + "@com_github_cockroachdb_errors//:errors", + ], +) diff --git a/pkg/util/tracing/tracingui/span_registry_ui.go b/pkg/util/tracing/tracingui/span_registry_ui.go new file mode 100644 index 000000000000..cdf23044f7a0 --- /dev/null +++ b/pkg/util/tracing/tracingui/span_registry_ui.go @@ -0,0 +1,277 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package tracingui + +import ( + "context" + "fmt" + "html/template" + "net/http" + "net/url" + "strconv" + "time" + + "github.com/cockroachdb/cockroach/pkg/ui" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/sysutil" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" + "github.com/cockroachdb/errors" +) + +// This file deals with producing the /debug/tracez page, which lists +// a snapshot of the spans in the Tracer's active spans registry. + +const dateAndtimeFormat = "2006-01-02 15:04:05.000" +const timeFormat = "15:04:05.000" + +// RegisterHTTPHandlers registers the /debug/tracez handlers, and helpers. +func RegisterHTTPHandlers(ambientCtx log.AmbientContext, mux *http.ServeMux, tr *tracing.Tracer) { + fileServer := http.StripPrefix("/debug/assets/", http.FileServer(http.FS(ui.ListJS))) + mux.HandleFunc("/debug/tracez", + func(w http.ResponseWriter, req *http.Request) { + serveHTTP(ambientCtx.AnnotateCtx(context.Background()), w, req, tr) + }) + mux.HandleFunc("/debug/show-trace", + func(w http.ResponseWriter, req *http.Request) { + serveHTTPTrace(ambientCtx.AnnotateCtx(req.Context()), w, req, tr) + }) + mux.HandleFunc("/debug/assets/list.min.js", fileServer.ServeHTTP) +} + +type pageData struct { + SnapshotID tracing.SnapshotID + Now time.Time + CapturedAt time.Time + Err error + AllSnapshots []tracing.SnapshotInfo + SpansList spansList +} + +type spansList struct { + Spans []tracingpb.RecordedSpan + // Stacks contains stack traces for the goroutines referenced by the Spans + // through their GoroutineID field. + Stacks map[int]string // GoroutineID to stack trace +} + +var spansTableTemplate *template.Template + +func init() { + spansTableTemplate = template.Must(template.New("spans-list").Funcs( + template.FuncMap{ + "formatTime": formatTime, + "formatTimeNoMillis": formatTimeNoMillis, + "since": func(t time.Time, capturedAt time.Time) string { + return fmt.Sprintf("(%s ago)", formatDuration(capturedAt.Sub(t))) + }, + "timeRaw": func(t time.Time) int64 { return t.UnixMicro() }, + }, + ).Parse(ui.SpansTableTemplateSrc)) +} + +func formatTime(t time.Time) string { + t = t.UTC() + if t.Truncate(24*time.Hour) == timeutil.Now().Truncate(24*time.Hour) { + return t.Format(timeFormat) + } + return t.Format(dateAndtimeFormat) +} + +func formatTimeNoMillis(t time.Time) string { + t = t.UTC() + if t.Truncate(24*time.Hour) == timeutil.Now().Truncate(24*time.Hour) { + const format = "15:04:05" + return t.Format(format) + } + const format = "2006-01-02 15:04:05" + return t.Format(format) +} + +// formatDuration formats a duration in one of the following formats, depending +// on its magnitude. +// 0.001s +// 1.000s +// 1m01s +// 1h05m01s +// 1d02h05m +func formatDuration(d time.Duration) string { + d = d.Round(time.Millisecond) + days := d / (24 * time.Hour) + d -= days * 24 * time.Hour + h := d / time.Hour + d -= h * time.Hour + m := d / time.Minute + d -= m * time.Minute + s := d / time.Second + d -= s * time.Second + millis := d / time.Millisecond + if days != 0 { + return fmt.Sprintf("%dd%02dh%02dm", days, h, m) + } + if h != 0 { + return fmt.Sprintf("%dh%02dm%02ds", h, m, s) + } + if m != 0 { + return fmt.Sprintf("%dm%02ds", m, s) + } + return fmt.Sprintf("%d.%03ds", s, millis) +} + +func serveHTTP(ctx context.Context, w http.ResponseWriter, r *http.Request, tr *tracing.Tracer) { + w.Header().Set("Content-Type", "text/html; charset=utf-8") + var pageErr error + if err := r.ParseForm(); err != nil { + log.Warningf(ctx, "error parsing /tracez form: %s", err.Error()) + pageErr = err + } + + snapID := r.Form.Get("snap") + var snapshot tracing.SpansSnapshot + var snapshotID tracing.SnapshotID + switch snapID { + case "new": + // Capture a new snapshot and return a redirect to that snapshot's ID. + snapshotID = tr.SaveSnapshot() + newURL, err := r.URL.Parse(fmt.Sprintf("?snap=%d", snapshotID)) + if err != nil { + pageErr = err + break + } + http.Redirect(w, r, newURL.String(), http.StatusFound) + return + case "": + default: + id, err := strconv.Atoi(snapID) + if err != nil { + pageErr = errors.Errorf("invalid snapshot ID: %s", snapID) + break + } + snapshotID = tracing.SnapshotID(id) + snapshot, err = tr.GetSnapshot(snapshotID) + if err != nil { + pageErr = err + break + } + } + + // Flatten the recordings. + spans := make([]tracingpb.RecordedSpan, 0, len(snapshot.Traces)*3) + for _, r := range snapshot.Traces { + spans = append(spans, r...) + } + + // Copy the stack traces and augment the map. + stacks := make(map[int]string, len(snapshot.Stacks)) + for k, v := range snapshot.Stacks { + stacks[k] = v + } + // Fill in messages for the goroutines for which we don't have a stack trace. + for _, s := range spans { + gid := int(s.GoroutineID) + if _, ok := stacks[gid]; !ok { + stacks[gid] = "Goroutine not found. Goroutine must have finished since the span was created." + } + } + + if pageErr != nil { + snapshot.Err = pageErr + } + err := spansTableTemplate.ExecuteTemplate(w, "spans-list", pageData{ + Now: timeutil.Now(), + CapturedAt: snapshot.CapturedAt, + SnapshotID: snapshotID, + AllSnapshots: tr.GetSnapshots(), + Err: snapshot.Err, + SpansList: spansList{ + Spans: spans, + Stacks: stacks, + }, + }) + if err != nil { + // We can get a "connection reset by peer" error if the browser requesting + // the page has gone away. + if !sysutil.IsErrConnectionReset(err) { + log.Warningf(ctx, "error executing tracez template: %s", err) + http.Error(w, err.Error(), http.StatusInternalServerError) + } + } +} + +func serveHTTPTrace( + ctx context.Context, w http.ResponseWriter, r *http.Request, tr *tracing.Tracer, +) { + w.Header().Set("Content-Type", "text/html; charset=utf-8") + var err error + if err = r.ParseForm(); err != nil { + goto Error + } + + { + var traceID tracingpb.TraceID + tid := r.Form.Get("trace") + if tid == "" { + err = errors.Errorf("trace ID missing; use ?trace=") + goto Error + } + var id int + id, err = strconv.Atoi(tid) + if err != nil { + err = errors.Errorf("invalid trace ID: %s", tid) + goto Error + } + traceID = tracingpb.TraceID(id) + + var snapshotID tracing.SnapshotID + var snapshot tracing.SpansSnapshot + snapID := r.Form.Get("snap") + if snapID != "" { + var id int + id, err = strconv.Atoi(snapID) + if err != nil { + err = errors.Errorf("invalid snapshot ID: %s", snapID) + goto Error + } + snapshotID = tracing.SnapshotID(id) + snapshot, err = tr.GetSnapshot(snapshotID) + if err != nil { + goto Error + } + } else { + // If no snapshot is specified, we'll take a new one now and redirect to + // it. + snapshotID = tr.SaveSnapshot() + var newURL *url.URL + newURL, err = r.URL.Parse(fmt.Sprintf("?trace=%d&snap=%d", traceID, snapshotID)) + if err != nil { + goto Error + } + http.Redirect(w, r, newURL.String(), http.StatusFound) + return + } + + for _, r := range snapshot.Traces { + if r[0].TraceID == traceID { + _, err = w.Write([]byte("
" + r.String() + "\n
")) + if err != nil { + goto Error + } + return + } + } + err = errors.Errorf("trace %d not found in snapshot", traceID) + goto Error + } + +Error: + http.Error(w, err.Error(), http.StatusInternalServerError) +} From 73caccf8f12bfb83cbb3a46aaf0426863ed6d393 Mon Sep 17 00:00:00 2001 From: Andrii Vorobiov Date: Thu, 20 Jan 2022 11:17:18 +0200 Subject: [PATCH 3/3] ui: resolve list.js dependency for Db Console bazel build Currently, to manage proper JS dependency resolution with bazel, we need to do one more extra step (which isn't automated yet): explicitly specify the list of dependencies required by Db Console in its BUILD.bazel file. Another issue that is solved with this change is path resolution for "node_modules" in Webpack config file. Path resolution for NPM modules in webpack relies on project structure and it doesn't work well with the way bazel places outputs. Release note: None --- pkg/ui/workspaces/db-console/BUILD.bazel | 1 + pkg/ui/workspaces/db-console/webpack.app.js | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/ui/workspaces/db-console/BUILD.bazel b/pkg/ui/workspaces/db-console/BUILD.bazel index 75b99f44106d..494064a475c6 100644 --- a/pkg/ui/workspaces/db-console/BUILD.bazel +++ b/pkg/ui/workspaces/db-console/BUILD.bazel @@ -130,6 +130,7 @@ DEPENDENCIES = [ "@npm//highlight.js", "@npm//lodash", "@npm//long", + "@npm//list.js", "@npm//mini-create-react-context", "@npm//moment", "@npm//nvd3", diff --git a/pkg/ui/workspaces/db-console/webpack.app.js b/pkg/ui/workspaces/db-console/webpack.app.js index b7991bbd5e22..7fdf1c5fc339 100644 --- a/pkg/ui/workspaces/db-console/webpack.app.js +++ b/pkg/ui/workspaces/db-console/webpack.app.js @@ -52,12 +52,11 @@ module.exports = (env, argv) => { new RemoveBrokenDependenciesPlugin(), new CopyWebpackPlugin([ { from: path.resolve(__dirname, "favicon.ico"), to: "favicon.ico" }, - ]), - new CopyWebpackPlugin([ { from: path.resolve( - __dirname, - "../../node_modules/list.js/dist/list.min.js", + !isBazelBuild ? __dirname : "", + !isBazelBuild ? "../.." : "", + "node_modules/list.js/dist/list.min.js", ), to: path.resolve(__dirname, "../../dist_vendor/list.min.js"), },