From 0d4c5bca27750801b05ed4d0cb23b38443d130aa Mon Sep 17 00:00:00 2001
From: Arul Ajmani
Date: Thu, 30 Mar 2023 09:57:53 -0400
Subject: [PATCH 1/2] spanconfigs: enable span config bounds by default

We only clamp span configs if the `spanconfig.bounds.enabled` cluster
setting dictates as much. For 23.1 it defaults to off, but going forward
we want it to default to on. Let's also make this cluster setting
public.

Epic: CRDB-8035
Release note: None
---
 docs/generated/settings/settings.html        | 1 +
 pkg/spanconfig/spanconfigstore/store.go      | 4 ++--
 pkg/spanconfig/spanconfigstore/store_test.go | 4 +---
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html
index 01de5764264d..995a792ffdac 100644
--- a/docs/generated/settings/settings.html
+++ b/docs/generated/settings/settings.html
@@ -117,6 +117,7 @@
 server.user_login.upgrade_bcrypt_stored_passwords_to_scram.enabled | boolean | true | if server.user_login.password_encryption=scram-sha-256, this controls whether to automatically re-encode stored passwords using crdb-bcrypt to scram-sha-256
 server.web_session.purge.ttl | duration | 1h0m0s | if nonzero, entries in system.web_sessions older than this duration are periodically purged
 server.web_session_timeout | duration | 168h0m0s | the duration that a newly created web session will be valid
+spanconfig.bounds.enabled | boolean | true | dictates whether span config bounds are consulted when serving span configs for secondary tenants
 sql.auth.change_own_password.enabled | boolean | false | controls whether a user is allowed to change their own password, even if they have no other privileges
 sql.auth.resolve_membership_single_scan.enabled | boolean | true | determines whether to populate the role membership cache with a single scan
 sql.closed_session_cache.capacity | integer | 1000 | the maximum number of sessions in the cache
diff --git a/pkg/spanconfig/spanconfigstore/store.go b/pkg/spanconfig/spanconfigstore/store.go
index 0eda3a8e32c4..c74fa8809fc6 100644
--- a/pkg/spanconfig/spanconfigstore/store.go
+++ b/pkg/spanconfig/spanconfigstore/store.go
@@ -54,8 +54,8 @@ var boundsEnabled = settings.RegisterBoolSetting(
 	settings.SystemOnly,
 	"spanconfig.bounds.enabled",
 	"dictates whether span config bounds are consulted when serving span configs for secondary tenants",
-	false,
-)
+	true,
+).WithPublic()
 
 // Store is an in-memory data structure to store, retrieve, and incrementally
 // update the span configuration state. Internally, it makes use of an interval
diff --git a/pkg/spanconfig/spanconfigstore/store_test.go b/pkg/spanconfig/spanconfigstore/store_test.go
index d17491b06306..97be4b697639 100644
--- a/pkg/spanconfig/spanconfigstore/store_test.go
+++ b/pkg/spanconfig/spanconfigstore/store_test.go
@@ -118,12 +118,10 @@ func TestDataDriven(t *testing.T) {
 	ctx := context.Background()
 	boundsReader := newMockBoundsReader()
-	settings := cluster.MakeTestingClusterSettings()
-	boundsEnabled.Override(ctx, &settings.SV, true)
 
 	datadriven.Walk(t, datapathutils.TestDataPath(t), func(t *testing.T, path string) {
 		store := New(
 			spanconfigtestutils.ParseConfig(t, "FALLBACK"),
-			settings,
+			cluster.MakeClusterSettings(),
 			boundsReader,
 			&spanconfig.TestingKnobs{
 				StoreIgnoreCoalesceAdjacentExceptions: true,

From 5bf79409dd6ee8b1294aedfefe1d05d6a0891571 Mon Sep 17 00:00:00 2001
From: Tommy Reilly
Date: Tue, 28 Mar 2023 18:28:12 -0400
Subject: [PATCH 2/2] sql: deflake and unskip
 TestTenantStatementTimeoutAdmissionQueueCancelation

Make sure we get exactly 4 blockers parked in the admission control
queues and no more, and retry the main query until it gets parked in,
and then booted from, the queue. Previously we expected the main query
to always reach the admission control queue, but that isn't reliable.

Fixes: #78494
---
 pkg/sql/BUILD.bazel         |  1 +
 pkg/sql/run_control_test.go | 52 +++++++++++++++++++++++++++----------
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/pkg/sql/BUILD.bazel b/pkg/sql/BUILD.bazel
index ed72f6450f65..6cda5a08bb10 100644
--- a/pkg/sql/BUILD.bazel
+++ b/pkg/sql/BUILD.bazel
@@ -871,6 +871,7 @@ go_test(
         "@com_github_jackc_pgx_v4//:pgx",
         "@com_github_lib_pq//:pq",
         "@com_github_lib_pq//oid",
+        "@com_github_petermattis_goid//:goid",
         "@com_github_pmezard_go_difflib//difflib",
         "@com_github_stretchr_testify//assert",
        "@com_github_stretchr_testify//require",
diff --git a/pkg/sql/run_control_test.go b/pkg/sql/run_control_test.go
index 34aaea12f5f2..5f6e15bfbbc4 100644
--- a/pkg/sql/run_control_test.go
+++ b/pkg/sql/run_control_test.go
@@ -18,6 +18,7 @@ import (
 	"net/url"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"testing"
 	"time"
 
@@ -40,6 +41,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/cockroachdb/errors"
+	"github.com/petermattis/goid"
 	"github.com/stretchr/testify/require"
 )
 
@@ -836,7 +838,6 @@ func getUserConn(t *testing.T, username string, server serverutils.TestServerInt
 // main statement with a timeout is blocked.
 func TestTenantStatementTimeoutAdmissionQueueCancelation(t *testing.T) {
 	defer leaktest.AfterTest(t)()
-	skip.WithIssue(t, 78494, "flaky test")
 	defer log.Scope(t).Close(t)
 
 	skip.UnderStress(t, "times out under stress")
@@ -845,8 +846,9 @@ func TestTenantStatementTimeoutAdmissionQueueCancelation(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 	tenantID := serverutils.TestTenantID()
-
+	var hitMainQuery uint64
 	numBlockers := 4
+	var matches int64
 
 	// We can't get the tableID programmatically here, checked below with assert.
 	const tableID = 104
@@ -865,9 +867,12 @@ func TestTenantStatementTimeoutAdmissionQueueCancelation(t *testing.T) {
 	matchBatch := func(ctx context.Context, req *kvpb.BatchRequest) bool {
 		tid, ok := roachpb.ClientTenantFromContext(ctx)
 		if ok && tid == tenantID && len(req.Requests) > 0 {
-			scan, ok := req.Requests[0].GetInner().(*kvpb.ScanRequest)
-			if ok && tableSpan.ContainsKey(scan.Key) {
-				return true
+			scan, ok := req.Requests[0].GetInner().(*kvpb.GetRequest)
+			if ok {
+				if tableSpan.ContainsKey(scan.Key) {
+					log.Infof(ctx, "matchBatch %d", goid.Get())
+					return true
+				}
 			}
 		}
 		return false
@@ -885,18 +890,30 @@ func TestTenantStatementTimeoutAdmissionQueueCancelation(t *testing.T) {
 			Store: &kvserver.StoreTestingKnobs{
 				TestingRequestFilter: func(ctx context.Context, req *kvpb.BatchRequest) *kvpb.Error {
 					if matchBatch(ctx, req) {
+						m := atomic.AddInt64(&matches, 1)
+						// If any of the blockers get retried, just ignore them.
+						if m > int64(numBlockers) {
+							log.Infof(ctx, "ignoring extra blocker %d", goid.Get())
+							return nil
+						}
 						// Notify we're blocking.
+						log.Infof(ctx, "blocking %d", goid.Get())
 						unblockClientCh <- struct{}{}
 						<-qBlockersCh
 					}
 					return nil
 				},
 				TestingResponseErrorEvent: func(ctx context.Context, req *kvpb.BatchRequest, err error) {
-					if matchBatch(ctx, req) {
+					tid, ok := roachpb.ClientTenantFromContext(ctx)
+					if ok && tid == tenantID && len(req.Requests) > 0 {
 						scan, ok := req.Requests[0].GetInner().(*kvpb.ScanRequest)
-						if ok && tableSpan.ContainsKey(scan.Key) {
-							cancel()
-							wg.Done()
+						log.Infof(ctx, "%s %d", scan, goid.Get())
+						if ok {
+							if tableSpan.ContainsKey(scan.Key) && atomic.CompareAndSwapUint64(&hitMainQuery, 0, 1) {
+								log.Infof(ctx, "got scan request error %d", goid.Get())
+								cancel()
+								wg.Done()
+							}
 						}
 					}
 				},
@@ -911,8 +928,8 @@ func TestTenantStatementTimeoutAdmissionQueueCancelation(t *testing.T) {
 	defer db.Close()
 
 	r1 := sqlutils.MakeSQLRunner(db)
-	r1.Exec(t, `CREATE TABLE foo (t int)`)
-
+	r1.Exec(t, `SET CLUSTER SETTING sql.stats.automatic_collection.enabled=false`)
+	r1.Exec(t, `CREATE TABLE foo (t int PRIMARY KEY)`)
 	row := r1.QueryRow(t, `SELECT id FROM system.namespace WHERE name = 'foo'`)
 	var id int64
 	row.Scan(&id)
@@ -931,18 +948,25 @@ func TestTenantStatementTimeoutAdmissionQueueCancelation(t *testing.T) {
 	for _, r := range blockers {
 		go func(r *sqlutils.SQLRunner) {
 			defer wg.Done()
-			r.Exec(t, `SELECT * FROM foo`)
+			r.Exec(t, `SELECT * FROM foo WHERE t = 1234`)
 		}(r)
 	}
 	// Wait till all blockers are parked.
 	for i := 0; i < numBlockers; i++ {
 		<-unblockClientCh
 	}
-	client.ExpectErr(t, "timeout", `SELECT * FROM foo`)
-	// Unblock the blockers.
+	log.Infof(ctx, "blockers parked")
+	// Because we don't know when the statement timeout will fire, we have to
+	// retry the main query until one of its requests makes it into the KV layer.
+	for atomic.LoadUint64(&hitMainQuery) == 0 {
+		_, err := client.DB.ExecContext(context.Background(), `SELECT * FROM foo`)
+		require.Error(t, err)
+		log.Infof(ctx, "main req finished: %v", err)
+	}
 	for i := 0; i < numBlockers; i++ {
 		qBlockersCh <- struct{}{}
 	}
+	log.Infof(ctx, "unblocked blockers")
 	wg.Wait()
 	require.ErrorIs(t, ctx.Err(), context.Canceled)
 }
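
A note on the default flip in the first patch: now that `spanconfig.bounds.enabled` is public, it can still be turned off cluster-wide from the system tenant with `SET CLUSTER SETTING spanconfig.bounds.enabled = false` (it is registered as SystemOnly). Within the spanconfigstore package, a test that still wants the old behavior can override the setting explicitly instead of relying on the default, mirroring the override that this patch removes from TestDataDriven. A minimal sketch of that pattern (not part of the patch):

    ctx := context.Background()
    st := cluster.MakeTestingClusterSettings()
    // Opt this test back out of span config bounds clamping; the default is now true.
    boundsEnabled.Override(ctx, &st.SV, false)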