Skip to content

Commit

Permalink
Merge #59717 #59723
Browse files Browse the repository at this point in the history
59717: util/tracing, sql: expose span's goroutine IDs r=irfansharif a=angelapwen

Resolves #59200. 

This commit adds a goroutine ID to each span's recording, as well
as as a column in the `crdb_internal.node_inflight_trace_spans`
virtual table. The goroutine ID is retrieved upon span creation.

Release note (sql change): adds `goroutine_id` column to the
`crdb_internal.node_inflight_trace_spans` virtual table that
represents the goroutine ID associated with a particular span.

59723: importccl: fix bug when rolling back imports with no timestamp r=dt a=pbardea

If a timestamp for the import was not chosen, there should be no
ingested data to rollback. However, we do want to continue with the
IMPORT cleanup to bring any OFFLINE tables back to PUBLIC.

Fixes #59481.

Release note (bug fix): Fixes a bug where some import failures would
cause tables to stay OFFLINE, when they should have been brough back to
PUBLIC.


Co-authored-by: angelapwen <[email protected]>
Co-authored-by: Paul Bardea <[email protected]>
  • Loading branch information
3 people committed Feb 3, 2021
3 parents 7efe879 + 308dc98 + 0f0dd9e commit a59f0c2
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 66 deletions.
29 changes: 16 additions & 13 deletions pkg/ccl/importccl/import_stmt.go
Original file line number Diff line number Diff line change
Expand Up @@ -1658,19 +1658,22 @@ func (r *importResumer) dropTables(
}
}

// NB: if a revert fails it will abort the rest of this failure txn, which is
// also what brings tables back online. We _could_ change the error handling
// or just move the revert into Resume()'s error return path, however it isn't
// clear that just bringing a table back online with partially imported data
// that may or may not be partially reverted is actually a good idea. It seems
// better to do the revert here so that the table comes back if and only if,
// it was rolled back to its pre-IMPORT state, and instead provide a manual
// admin knob (e.g. ALTER TABLE REVERT TO SYSTEM TIME) if anything goes wrong.
if len(revert) > 0 {
// Sanity check Walltime so it doesn't become a TRUNCATE if there's a bug.
if details.Walltime == 0 {
return errors.Errorf("invalid pre-IMPORT time to rollback")
}
// The walltime can be 0 if there is a failure between publishing the tables
// as OFFLINE and then choosing a ingestion timestamp. This might happen
// while waiting for the descriptor version to propagate across the cluster
// for example.
//
// In this case, we don't want to rollback the data since data ingestion has
// not yet begun (since we have not chosen a timestamp at which to ingest.)
if details.Walltime != 0 && len(revert) > 0 {
// NB: if a revert fails it will abort the rest of this failure txn, which is
// also what brings tables back online. We _could_ change the error handling
// or just move the revert into Resume()'s error return path, however it isn't
// clear that just bringing a table back online with partially imported data
// that may or may not be partially reverted is actually a good idea. It seems
// better to do the revert here so that the table comes back if and only if,
// it was rolled back to its pre-IMPORT state, and instead provide a manual
// admin knob (e.g. ALTER TABLE REVERT TO SYSTEM TIME) if anything goes wrong.
ts := hlc.Timestamp{WallTime: details.Walltime}.Prev()
if err := sql.RevertTables(ctx, txn.DB(), execCfg, revert, ts, sql.RevertTableDefaultBatchSize); err != nil {
return errors.Wrap(err, "rolling back partially completed IMPORT")
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/crdb_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -1142,6 +1142,7 @@ CREATE TABLE crdb_internal.node_inflight_trace_spans (
trace_id INT NOT NULL, -- The trace's ID.
parent_span_id INT NOT NULL, -- The span's parent ID.
span_id INT NOT NULL, -- The span's ID.
goroutine_id INT NOT NULL, -- The ID of the goroutine on which the span was created.
start_time TIMESTAMPTZ, -- The span's start time.
duration INTERVAL, -- The span's duration, measured by time of
-- collection - start time for all in-flight spans.
Expand All @@ -1153,6 +1154,7 @@ CREATE TABLE crdb_internal.node_inflight_trace_spans (
traceID := rec.TraceID
parentSpanID := rec.ParentSpanID
spanID := rec.SpanID
goroutineID := rec.GoroutineID

startTime, err := tree.MakeDTimestampTZ(rec.StartTime, time.Microsecond)
if err != nil {
Expand All @@ -1166,6 +1168,7 @@ CREATE TABLE crdb_internal.node_inflight_trace_spans (
tree.NewDInt(tree.DInt(traceID)),
tree.NewDInt(tree.DInt(parentSpanID)),
tree.NewDInt(tree.DInt(spanID)),
tree.NewDInt(tree.DInt(goroutineID)),
startTime,
tree.NewDInterval(
duration.MakeDuration(spanDuration.Nanoseconds(), 0, 0),
Expand Down
4 changes: 2 additions & 2 deletions pkg/sql/logictest/testdata/logic_test/crdb_internal
Original file line number Diff line number Diff line change
Expand Up @@ -250,10 +250,10 @@ SELECT * FROM crdb_internal.zones WHERE false
zone_id subzone_id target range_name database_name table_name index_name partition_name
raw_config_yaml raw_config_sql raw_config_protobuf full_config_yaml full_config_sql

query IIITTT colnames
query IIIITTT colnames
SELECT * FROM crdb_internal.node_inflight_trace_spans WHERE span_id < 0
----
trace_id parent_span_id span_id start_time duration operation
trace_id parent_span_id span_id goroutine_id start_time duration operation

query ITTTTTTTTTTTTI colnames
SELECT * FROM crdb_internal.ranges WHERE range_id < 0
Expand Down
4 changes: 2 additions & 2 deletions pkg/sql/logictest/testdata/logic_test/crdb_internal_tenant
Original file line number Diff line number Diff line change
Expand Up @@ -261,10 +261,10 @@ SELECT * FROM crdb_internal.zones WHERE false
zone_id subzone_id target range_name database_name table_name index_name partition_name
raw_config_yaml raw_config_sql raw_config_protobuf full_config_yaml full_config_sql

query IIITTT colnames
query IIIITTT colnames
SELECT * FROM crdb_internal.node_inflight_trace_spans WHERE span_id < 0
----
trace_id parent_span_id span_id start_time duration operation
trace_id parent_span_id span_id goroutine_id start_time duration operation

statement error not fully contained in tenant keyspace
SELECT * FROM crdb_internal.ranges WHERE range_id < 0
Expand Down
1 change: 1 addition & 0 deletions pkg/util/tracing/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ go_library(
"@com_github_opentracing_opentracing_go//ext",
"@com_github_opentracing_opentracing_go//log",
"@com_github_openzipkin_contrib_zipkin_go_opentracing//:zipkin-go-opentracing",
"@com_github_petermattis_goid//:goid",
"@org_golang_google_grpc//:go_default_library",
"@org_golang_google_grpc//codes",
"@org_golang_google_grpc//metadata",
Expand Down
2 changes: 2 additions & 0 deletions pkg/util/tracing/crdbspan.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ type crdbSpan struct {
traceID uint64 // probabilistically unique.
spanID uint64 // probabilistically unique.
parentSpanID uint64
goroutineID uint64

operation string
startTime time.Time
Expand Down Expand Up @@ -241,6 +242,7 @@ func (s *crdbSpan) getRecordingLocked(m mode) tracingpb.RecordedSpan {
TraceID: s.traceID,
SpanID: s.spanID,
ParentSpanID: s.parentSpanID,
GoroutineID: s.goroutineID,
Operation: s.operation,
StartTime: s.startTime,
Duration: s.mu.duration,
Expand Down
3 changes: 3 additions & 0 deletions pkg/util/tracing/tracer.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/logtags"
opentracing "github.com/opentracing/opentracing-go"
"github.com/petermattis/goid"
"golang.org/x/net/trace"
)

Expand Down Expand Up @@ -364,6 +365,7 @@ func (t *Tracer) startSpanGeneric(
traceID = uint64(rand.Int63())
}
spanID := uint64(rand.Int63())
goroutineID := uint64(goid.Get())

// Now allocate the main *Span and contained crdbSpan.
// Allocate these together to save on individual allocs.
Expand All @@ -380,6 +382,7 @@ func (t *Tracer) startSpanGeneric(
helper.crdbSpan = crdbSpan{
traceID: traceID,
spanID: spanID,
goroutineID: goroutineID,
operation: opName,
startTime: startTime,
parentSpanID: opts.parentSpanID(),
Expand Down
129 changes: 80 additions & 49 deletions pkg/util/tracing/tracingpb/recorded_span.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pkg/util/tracing/tracingpb/recorded_span.proto
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ message RecordedSpan {
//
// TODO(tbg): rename once DeprecatedStats is removed.
repeated google.protobuf.Any internal_structured = 11;

// The ID of the goroutine on which the span was created.
uint64 goroutine_id = 12 [(gogoproto.customname) = "GoroutineID"];
}

// NormalizedSpan is a representation of a RecordedSpan from a trace with all
Expand Down

0 comments on commit a59f0c2

Please sign in to comment.