Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
107355: cli: enhance diagnosing contention with redacted debug zips r=fqazi a=fqazi

Previously a redacted zip, we would exclude retries and key information for contended keys since they could contain PII data. This patch does the following:

- Adds a new builtin is_system_table_key which allows us to know if a key belongs to a system table
- Modified redacted debug zips to include data for system keys in contention tables conditionally (if they belong to system tables)
- Include retries and last_retry_reason information for queries in cluster insights to help diagnose contention

Fixes: #104593

108454: Revert "kvprober: metamorphically enable / configure kvprober" r=andrewbaptist,joshimhoff a=nvanbenschoten

This reverts (most of) commit 769ba1c.

That commit metamorphically enabled kvprober. This has been observed to be destabliziing to unit tests. When the metamorphic constant is enabled (50% of the time) and when kvprober is fast enough, random ranges will see extra requests that they aren’t expecting. This adds nondeterminism which can trip up tests in any number of different ways.

All of the following flakes have been tracked back to kvprober:

Fixes #107864.
Fixes #108242.
Fixes #108441.
Fixes #108349.
Fixes #108124.
Closes #108366.

Release note: None

Co-authored-by: Faizan Qazi <[email protected]>
Co-authored-by: Nathan VanBenschoten <[email protected]>
  • Loading branch information
3 people committed Aug 9, 2023
3 parents 99d248b + 2d82473 + 1374908 commit b91c919
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 10 deletions.
10 changes: 8 additions & 2 deletions pkg/cli/zip_table_registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,12 @@ func (r DebugZipTableRegistry) GetTables() []string {
var zipInternalTablesPerCluster = DebugZipTableRegistry{
"crdb_internal.cluster_contention_events": {
// `key` column contains the contended key, which may contain sensitive
// row-level data.
// row-level data. So, we will only fetch if the table is under the system
// schema.
nonSensitiveCols: NonSensitiveColumns{
"table_id",
"index_id",
"IF(crdb_internal.is_system_table_key(key), crdb_internal.pretty_key(key, 0) ,'redacted') as pretty_key",
"num_contention_events",
"cumulative_contention_time",
"txn_id",
Expand Down Expand Up @@ -158,6 +160,8 @@ var zipInternalTablesPerCluster = DebugZipTableRegistry{
"exec_node_ids",
"contention",
"index_recommendations",
"retries",
"last_retry_reason",
},
},
"crdb_internal.cluster_locks": {
Expand Down Expand Up @@ -531,14 +535,16 @@ var zipInternalTablesPerCluster = DebugZipTableRegistry{
},
"crdb_internal.transaction_contention_events": {
// `contending_key` column contains the contended key, which may
// contain sensitive row-level data.
// contain sensitive row-level data. So, we will only fetch if the
// table is under the system schema.
nonSensitiveCols: NonSensitiveColumns{
"collection_ts",
"blocking_txn_id",
"blocking_txn_fingerprint_id",
"waiting_txn_id",
"waiting_txn_fingerprint_id",
"contention_duration",
"IF(crdb_internal.is_system_table_key(contending_key), crdb_internal.pretty_key(contending_key, 0) ,'redacted') as contending_pretty_key",
},
},
"crdb_internal.zones": {
Expand Down
1 change: 0 additions & 1 deletion pkg/kv/kvprober/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ go_library(
"//pkg/roachpb",
"//pkg/settings",
"//pkg/settings/cluster",
"//pkg/util",
"//pkg/util/log",
"//pkg/util/log/logcrash",
"//pkg/util/metric",
Expand Down
12 changes: 5 additions & 7 deletions pkg/kv/kvprober/settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,9 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/errors"
)

var defaultEnabled = util.ConstantWithMetamorphicTestBool("kv.prober.*.enabled", false)

// kv.prober.bypass_admission_control controls whether kvprober's requests
// should bypass kv layer's admission control. Setting this value to true
// ensures that kvprober will not be significantly affected if the cluster is
Expand All @@ -30,13 +27,14 @@ var bypassAdmissionControl = settings.RegisterBoolSetting(
"set to bypass admission control queue for kvprober requests; "+
"note that dedicated clusters should have this set as users own capacity planning "+
"but serverless clusters should not have this set as SREs own capacity planning",
util.ConstantWithMetamorphicTestBool("kv.prober.bypass_admission_control.enabled", true))
true,
)

var readEnabled = settings.RegisterBoolSetting(
settings.TenantWritable,
"kv.prober.read.enabled",
"whether the KV read prober is enabled",
defaultEnabled)
false)

// TODO(josh): Another option is for the cluster setting to be a QPS target
// for the cluster as a whole.
Expand Down Expand Up @@ -72,7 +70,7 @@ var writeEnabled = settings.RegisterBoolSetting(
settings.TenantWritable,
"kv.prober.write.enabled",
"whether the KV write prober is enabled",
defaultEnabled)
false)

var writeInterval = settings.RegisterDurationSetting(
settings.TenantWritable,
Expand Down Expand Up @@ -150,7 +148,7 @@ var quarantineWriteEnabled = settings.RegisterBoolSetting(
"quarantine pool holds a separate group of ranges that have previously failed "+
"a probe which are continually probed. This helps determine outages for ranges "+
" with a high level of confidence",
defaultEnabled)
false)

var quarantineWriteInterval = settings.RegisterDurationSetting(
settings.TenantWritable,
Expand Down
5 changes: 5 additions & 0 deletions pkg/sql/faketreeeval/evalctx.go
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,11 @@ func (ep *DummyPrivilegedAccessor) LookupZoneConfigByNamespaceID(
return "", false, errors.WithStack(errEvalPrivileged)
}

// IsSystemTable is part of the tree.PrivilegedAccessor interface.
func (ep *DummyPrivilegedAccessor) IsSystemTable(ctx context.Context, id int64) (bool, error) {
return false, errors.WithStack(errEvalPrivileged)
}

// DummySessionAccessor implements the eval.SessionAccessor interface by returning errors.
type DummySessionAccessor struct{}

Expand Down
10 changes: 10 additions & 0 deletions pkg/sql/privileged_accessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/sql/sem/catid"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
"github.com/cockroachdb/errors"
Expand Down Expand Up @@ -76,6 +77,15 @@ func (p *planner) LookupZoneConfigByNamespaceID(
return tree.DBytes(zc.GetRawBytesInStorage()), true, nil
}

// IsSystemTable implements tree.PrivilegedAccessor.
func (p *planner) IsSystemTable(ctx context.Context, id int64) (bool, error) {
tbl, err := p.Descriptors().ByID(p.Txn()).Get().Table(ctx, catid.DescID(id))
if err != nil {
return false, err
}
return catalog.IsSystemDescriptor(tbl), nil
}

// checkDescriptorPermissions returns nil if the executing user has permissions
// to check the permissions of a descriptor given its ID, or the id given
// is not a descriptor of a table or database.
Expand Down
32 changes: 32 additions & 0 deletions pkg/sql/sem/builtins/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -5728,6 +5728,38 @@ SELECT
Volatility: volatility.Immutable,
},
),
// Return if a key belongs to a system table, which should make it to print
// within redacted output.
"crdb_internal.is_system_table_key": makeBuiltin(
tree.FunctionProperties{
Category: builtinconstants.CategorySystemInfo,
Undocumented: true,
},
tree.Overload{
Types: tree.ParamTypes{
{Name: "raw_key", Typ: types.Bytes},
},
ReturnType: tree.FixedReturnType(types.Bool),
Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
_, tableID, err := evalCtx.Codec.DecodeTablePrefix(roachpb.Key(tree.MustBeDBytes(args[0])))
if err != nil {
// If a key isn't prefixed with a table ID ignore.
//nolint:returnerrcheck
return tree.DBoolFalse, nil
}
isSystemTable, err := evalCtx.PrivilegedAccessor.IsSystemTable(ctx, int64(tableID))
if err != nil {
// If we can't find the descriptor or its not the right type then its
// not a system table.
//nolint:returnerrcheck
return tree.DBoolFalse, nil
}
return tree.MakeDBool(tree.DBool(isSystemTable)), nil
},
Info: "This function is used only by CockroachDB's developers for testing purposes.",
Volatility: volatility.Stable,
},
),

// Return a pretty string for a given span, skipping the specified number of
// fields.
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/sem/builtins/fixed_oids.go
Original file line number Diff line number Diff line change
Expand Up @@ -2439,6 +2439,7 @@ var builtinOidsArray = []string{
2466: `crdb_internal.setup_span_configs_stream(tenant_name: string) -> bytes`,
2467: `crdb_internal.request_statement_bundle(stmtFingerprint: string, planGist: string, samplingProbability: float, minExecutionLatency: interval, expiresAfter: interval) -> bool`,
2468: `crdb_internal.request_statement_bundle(stmtFingerprint: string, planGist: string, antiPlanGist: bool, samplingProbability: float, minExecutionLatency: interval, expiresAfter: interval) -> bool`,
2469: `crdb_internal.is_system_table_key(raw_key: bytes) -> bool`,
}

var builtinOidsBySignature map[string]oid.Oid
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/sem/eval/deps.go
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,9 @@ type PrivilegedAccessor interface {
// Returns the config byte array, a bool representing whether the namespace exists,
// and an error if there is one.
LookupZoneConfigByNamespaceID(ctx context.Context, id int64) (tree.DBytes, bool, error)

// IsSystemTable returns if a given descriptor ID is a system table.s
IsSystemTable(ctx context.Context, id int64) (bool, error)
}

// RegionOperator gives access to the current region, validation for all
Expand Down

0 comments on commit b91c919

Please sign in to comment.