From a37e053173ebf069b12ef6a2c38a03dd984992e2 Mon Sep 17 00:00:00 2001 From: shralex Date: Tue, 7 Feb 2023 09:11:20 -0800 Subject: [PATCH 1/4] kv: increase the minimum range_max_bytes to 64MiB We've seen that small range sizes can be detrimental to various components. This PR prevents users from lowering range_max_bytes below 64MiB (half of the default range_min_bytes); the previous floor was 64KiB. Release Note: Small ranges have been known to cause problems in various CRDB subsystems. This PR prevents setting range_max_bytes below COCKROACH_MIN_RANGE_MAX_BYTES, an environment variable which defaults to 64MiB (half of the default minimum range size). Epic: https://cockroachlabs.atlassian.net/browse/CRDB-24182 Fixes: https://github.com/cockroachdb/cockroach/issues/96549 --- pkg/base/constants.go | 3 -- .../rttanalysis/alter_table_bench_test.go | 2 +- pkg/ccl/backupccl/backup_test.go | 4 +-- pkg/ccl/backupccl/utils_test.go | 17 ++++----- .../testdata/logic_test/distsql_partitioning | 10 +++--- .../logic_test/multi_region_zone_configs | 36 +++++++++---------- pkg/ccl/partitionccl/drop_test.go | 9 +++-- .../testdata/indexes | 10 +++--- .../testdata/tenant/indexes | 32 ++++++++--------- pkg/config/zonepb/BUILD.bazel | 2 +- pkg/config/zonepb/zone.go | 11 ++++-- pkg/config/zonepb/zone_test.go | 7 ++++ .../client_replica_backpressure_test.go | 30 ++++++++-------- pkg/kv/kvserver/client_split_test.go | 8 ++--- pkg/kv/kvserver/replicate_queue_test.go | 32 ++++++++++------- pkg/server/decommission_test.go | 12 +++---- .../logictest/testdata/logic_test/zone_config | 12 +++---- .../logic_test/zone_config_system_tenant | 6 ++-- pkg/sql/zone_config_test.go | 4 +-- 19 files changed, 132 insertions(+), 115 deletions(-) diff --git a/pkg/base/constants.go b/pkg/base/constants.go index e25f6f9d01f1..f59f8884d5ef 100644 --- a/pkg/base/constants.go +++ b/pkg/base/constants.go @@ -50,9 +50,6 @@ const ( // when a job opts in to dumping its execution traces. InflightTraceDir = "inflight_trace_dump" - // MinRangeMaxBytes is the minimum value for range max bytes. - MinRangeMaxBytes = 64 << 10 // 64 KB - // ObsServiceEmbedFlagValue is the special value of the --obsservice-addr flag // configuring the CRDB node to run the Obs Service internally.
ObsServiceEmbedFlagValue = "embed" diff --git a/pkg/bench/rttanalysis/alter_table_bench_test.go b/pkg/bench/rttanalysis/alter_table_bench_test.go index 8fc7581e7706..1945853c3999 100644 --- a/pkg/bench/rttanalysis/alter_table_bench_test.go +++ b/pkg/bench/rttanalysis/alter_table_bench_test.go @@ -207,7 +207,7 @@ func init() { Name: "alter table configure zone ranges", Setup: `CREATE TABLE alter_table(a INT);`, Stmt: "ALTER TABLE alter_table CONFIGURE ZONE USING " + - "range_min_bytes = 0, range_max_bytes = 90000", + "range_min_bytes = 0, range_max_bytes = 500000000", }, }) } diff --git a/pkg/ccl/backupccl/backup_test.go b/pkg/ccl/backupccl/backup_test.go index 25da98979905..c6bc1fb5582f 100644 --- a/pkg/ccl/backupccl/backup_test.go +++ b/pkg/ccl/backupccl/backup_test.go @@ -9522,7 +9522,7 @@ func TestExportRequestBelowGCThresholdOnDataExcludedFromBackup(t *testing.T) { _, err = conn.Exec("SET CLUSTER SETTING kv.closed_timestamp.target_duration = '100ms'") // speeds up the test require.NoError(t, err) - const tableRangeMaxBytes = 1 << 18 + const tableRangeMaxBytes = 100 << 20 _, err = conn.Exec("ALTER TABLE foo CONFIGURE ZONE USING "+ "gc.ttlseconds = 1, range_max_bytes = $1, range_min_bytes = 1<<10;", tableRangeMaxBytes) require.NoError(t, err) @@ -9606,7 +9606,7 @@ func TestExcludeDataFromBackupDoesNotHoldupGC(t *testing.T) { // Exclude the table from backup so that it does not hold up GC. runner.Exec(t, `ALTER TABLE test.foo SET (exclude_data_from_backup = true)`) - const tableRangeMaxBytes = 1 << 18 + const tableRangeMaxBytes = 100 << 20 runner.Exec(t, "ALTER TABLE test.foo CONFIGURE ZONE USING "+ "gc.ttlseconds = 1, range_max_bytes = $1, range_min_bytes = 1<<10;", tableRangeMaxBytes) diff --git a/pkg/ccl/backupccl/utils_test.go b/pkg/ccl/backupccl/utils_test.go index 0136cbe36fbf..334f9d7d6587 100644 --- a/pkg/ccl/backupccl/utils_test.go +++ b/pkg/ccl/backupccl/utils_test.go @@ -52,6 +52,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/workload/workloadsql" "github.com/cockroachdb/errors" "github.com/cockroachdb/logtags" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -629,18 +630,14 @@ func upsertUntilBackpressure( t *testing.T, rRand *rand.Rand, conn *gosql.DB, database, table string, ) { t.Helper() - testutils.SucceedsSoon(t, func() error { + for i := 1; i < 50; i++ { _, err := conn.Exec(fmt.Sprintf("UPSERT INTO %s.%s VALUES (1, $1)", database, table), - randutil.RandBytes(rRand, 1<<15)) - if err == nil { - return errors.New("expected `backpressure` error") - } - - if !testutils.IsError(err, "backpressure") { - return errors.NewAssertionErrorWithWrappedErrf(err, "expected `backpressure` error") + randutil.RandBytes(rRand, 5<<20)) + if testutils.IsError(err, "backpressure") { + return } - return nil - }) + } + assert.Fail(t, "expected `backpressure` error") } // requireRecoveryEvent fetches all available log entries on disk after diff --git a/pkg/ccl/logictestccl/testdata/logic_test/distsql_partitioning b/pkg/ccl/logictestccl/testdata/logic_test/distsql_partitioning index eec2fe5d21ac..2bbbcc174922 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/distsql_partitioning +++ b/pkg/ccl/logictestccl/testdata/logic_test/distsql_partitioning @@ -345,13 +345,13 @@ statement ok ALTER INDEX partitioning.inheritance@inheritance_pkey PARTITION BY LIST (x) ( PARTITION p1 VALUES IN (1)) statement ok -ALTER DATABASE partitioning CONFIGURE ZONE USING range_min_bytes=64000, range_max_bytes=75000 +ALTER DATABASE partitioning CONFIGURE ZONE USING 
range_min_bytes=64000, range_max_bytes=75000000 query TTTTTTTTT SHOW PARTITIONS FROM TABLE partitioning.inheritance ---- partitioning inheritance p1 NULL x inheritance@inheritance_pkey (1) NULL range_min_bytes = 64000, - range_max_bytes = 75000, + range_max_bytes = 75000000, gc.ttlseconds = 14400, num_replicas = 3, constraints = '[]', @@ -364,7 +364,7 @@ query TTTTTTTTT SHOW PARTITIONS FROM TABLE partitioning.inheritance ---- partitioning inheritance p1 NULL x inheritance@inheritance_pkey (1) NULL range_min_bytes = 64000, -range_max_bytes = 75000, +range_max_bytes = 75000000, gc.ttlseconds = 80000, num_replicas = 3, constraints = '[]', @@ -377,7 +377,7 @@ query TTTTTTTTT SHOW PARTITIONS FROM TABLE partitioning.inheritance ---- partitioning inheritance p1 NULL x inheritance@inheritance_pkey (1) NULL range_min_bytes = 64000, -range_max_bytes = 75000, +range_max_bytes = 75000000, gc.ttlseconds = 80000, num_replicas = 5, constraints = '[]', @@ -390,7 +390,7 @@ query TTTTTTTTT SHOW PARTITIONS FROM TABLE partitioning.inheritance ---- partitioning inheritance p1 NULL x inheritance@inheritance_pkey (1) constraints = '[+dc=dc1]' range_min_bytes = 64000, -range_max_bytes = 75000, +range_max_bytes = 75000000, gc.ttlseconds = 80000, num_replicas = 5, constraints = '[+dc=dc1]', diff --git a/pkg/ccl/logictestccl/testdata/logic_test/multi_region_zone_configs b/pkg/ccl/logictestccl/testdata/logic_test/multi_region_zone_configs index c9d737a7f9f0..954c8992b2c0 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/multi_region_zone_configs +++ b/pkg/ccl/logictestccl/testdata/logic_test/multi_region_zone_configs @@ -123,14 +123,14 @@ statement ok ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING gc.ttlseconds = 5 statement ok -ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, range_max_bytes = 100000 +ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, range_max_bytes = 100000000 query TT SHOW ZONE CONFIGURATION FOR DATABASE "mr-zone-configs" ---- DATABASE "mr-zone-configs" ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 5, num_replicas = 5, num_voters = 3, @@ -149,7 +149,7 @@ SHOW ZONE CONFIGURATION FOR DATABASE "mr-zone-configs" ---- DATABASE "mr-zone-configs" ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 5, num_replicas = 5, num_voters = 3, @@ -170,7 +170,7 @@ SHOW ZONE CONFIGURATION FOR DATABASE "mr-zone-configs" ---- DATABASE "mr-zone-configs" ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 5, num_replicas = 5, num_voters = 5, @@ -243,7 +243,7 @@ SHOW ZONE CONFIGURATION FOR DATABASE "mr-zone-configs" ---- DATABASE "mr-zone-configs" ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 5, num_replicas = 5, num_voters = 3, @@ -302,7 +302,7 @@ SHOW ZONE CONFIGURATION FOR DATABASE "mr-zone-configs" ---- DATABASE "mr-zone-configs" ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 100000, num_replicas = 4, num_voters = 3, @@ -323,7 +323,7 @@ SHOW ZONE CONFIGURATION FOR DATABASE "mr-zone-configs" ---- DATABASE "mr-zone-configs" ALTER DATABASE 
"mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 100000, num_replicas = 3, num_voters = 3, @@ -344,7 +344,7 @@ SHOW ZONE CONFIGURATION FOR DATABASE "mr-zone-configs" ---- DATABASE "mr-zone-configs" ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 100000, num_replicas = 3, num_voters = 3, @@ -365,7 +365,7 @@ SHOW ZONE CONFIGURATION FOR DATABASE "mr-zone-configs" ---- DATABASE "mr-zone-configs" ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 100000, num_replicas = 3, constraints = '[]', @@ -443,7 +443,7 @@ SHOW ZONE CONFIGURATION FOR TABLE regional_by_row ---- TABLE regional_by_row ALTER TABLE regional_by_row CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 10, num_replicas = 10, num_voters = 3, @@ -504,7 +504,7 @@ SHOW ZONE CONFIGURATION FOR INDEX regional_by_row@regional_by_row_pkey ---- TABLE regional_by_row ALTER TABLE regional_by_row CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 10, num_replicas = 5, num_voters = 3, @@ -517,7 +517,7 @@ SHOW ZONE CONFIGURATION FOR INDEX regional_by_row@regional_by_row_i_idx ---- TABLE regional_by_row ALTER TABLE regional_by_row CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 10, num_replicas = 5, num_voters = 3, @@ -530,7 +530,7 @@ SHOW ZONE CONFIGURATION FOR TABLE regional_by_row ---- TABLE regional_by_row ALTER TABLE regional_by_row CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 10, num_replicas = 5, num_voters = 3, @@ -598,7 +598,7 @@ SHOW ZONE CONFIGURATION FOR TABLE regional_by_row ---- TABLE regional_by_row ALTER TABLE regional_by_row CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 100000, global_reads = false, num_replicas = 5, @@ -617,7 +617,7 @@ SHOW ZONE CONFIGURATION FOR TABLE regional_by_row ---- TABLE regional_by_row ALTER TABLE regional_by_row CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 100000, global_reads = true, num_replicas = 5, @@ -667,7 +667,7 @@ SHOW ZONE CONFIGURATION FOR INDEX regional_by_row_as@regional_by_row_as_pkey ---- INDEX regional_by_row_as@regional_by_row_as_pkey ALTER INDEX regional_by_row_as@regional_by_row_as_pkey CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 100000, num_replicas = 10, num_voters = 3, @@ -688,7 +688,7 @@ SHOW ZONE CONFIGURATION FOR INDEX regional_by_row_as@regional_by_row_as_pkey ---- DATABASE "mr-zone-configs" ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 100000, num_replicas = 5, num_voters = 3, @@ -737,7 +737,7 @@ SHOW ZONE CONFIGURATION FOR DATABASE "mr-zone-configs" ---- DATABASE "mr-zone-configs" ALTER DATABASE "mr-zone-configs" CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 100000, num_replicas = 5, num_voters = 3, diff --git 
a/pkg/ccl/partitionccl/drop_test.go b/pkg/ccl/partitionccl/drop_test.go index 740e06e8c794..e3c591deaf41 100644 --- a/pkg/ccl/partitionccl/drop_test.go +++ b/pkg/ccl/partitionccl/drop_test.go @@ -217,7 +217,8 @@ SELECT job_id PARTITION c VALUES IN ('c') )`) tdb.Exec(t, `CREATE INDEX idx ON t (e)`) - tdb.Exec(t, `ALTER PARTITION a OF TABLE t CONFIGURE ZONE USING range_min_bytes = 123456, range_max_bytes = 654321`) + tdb.Exec(t, `ALTER PARTITION a OF TABLE t CONFIGURE ZONE USING range_min_bytes = 123456, +range_max_bytes = 654321000`) tdb.Exec(t, `ALTER INDEX t@idx CONFIGURE ZONE USING gc.ttlseconds = 1`) tdb.Exec(t, `DROP INDEX t@idx`) @@ -271,8 +272,10 @@ SELECT job_id PARTITION ci VALUES IN ('c') )`, ) - tdb.Exec(t, `ALTER PARTITION ai OF INDEX t@idx CONFIGURE ZONE USING range_min_bytes = 123456,range_max_bytes = 654321`) - tdb.Exec(t, `ALTER PARTITION a OF TABLE t CONFIGURE ZONE USING range_min_bytes = 123456, range_max_bytes = 654321`) + tdb.Exec(t, `ALTER PARTITION ai OF INDEX t@idx CONFIGURE ZONE USING range_min_bytes = 123456, +range_max_bytes = 654321000`) + tdb.Exec(t, `ALTER PARTITION a OF TABLE t CONFIGURE ZONE USING range_min_bytes = 123456, +range_max_bytes = 654321000`) tdb.Exec(t, `ALTER INDEX t@idx CONFIGURE ZONE USING gc.ttlseconds = 1`) tdb.Exec(t, `DROP INDEX t@idx`) tdb.Exec(t, `DROP TABLE t`) diff --git a/pkg/ccl/spanconfigccl/spanconfigsqltranslatorccl/testdata/indexes b/pkg/ccl/spanconfigccl/spanconfigsqltranslatorccl/testdata/indexes index 2b524704b882..774dd284d021 100644 --- a/pkg/ccl/spanconfigccl/spanconfigsqltranslatorccl/testdata/indexes +++ b/pkg/ccl/spanconfigccl/spanconfigsqltranslatorccl/testdata/indexes @@ -71,7 +71,7 @@ translate database=db table=t # Configure a zone config field on the table, so that it is no longer a # placeholder zone config. exec-sql -ALTER TABLE db.t CONFIGURE ZONE USING range_min_bytes = 1000, range_max_bytes=100000; +ALTER TABLE db.t CONFIGURE ZONE USING range_min_bytes = 1000, range_max_bytes=100000000; ---- query-sql @@ -79,7 +79,7 @@ SHOW ZONE CONFIGURATION FOR INDEX db.t@idx ---- INDEX db.public.t@idx ALTER INDEX db.public.t@idx CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 25, num_replicas = 7, num_voters = 5, @@ -89,6 +89,6 @@ INDEX db.public.t@idx ALTER INDEX db.public.t@idx CONFIGURE ZONE USING translate database=db table=t ---- -/Table/106{-/2} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 -/Table/106/{2-3} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=25 num_replicas=7 num_voters=5 -/Table/10{6/3-7} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 +/Table/106{-/2} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 +/Table/106/{2-3} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=25 num_replicas=7 num_voters=5 +/Table/10{6/3-7} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 diff --git a/pkg/ccl/spanconfigccl/spanconfigsqltranslatorccl/testdata/tenant/indexes b/pkg/ccl/spanconfigccl/spanconfigsqltranslatorccl/testdata/tenant/indexes index 247a337dcffa..c973a2b3276a 100644 --- a/pkg/ccl/spanconfigccl/spanconfigsqltranslatorccl/testdata/tenant/indexes +++ b/pkg/ccl/spanconfigccl/spanconfigsqltranslatorccl/testdata/tenant/indexes @@ -71,7 +71,7 @@ translate database=db table=t # Configure a zone config field on the table, so that it is no longer a # placeholder zone config. 
exec-sql -ALTER TABLE db.t CONFIGURE ZONE USING range_min_bytes = 1000, range_max_bytes=100000; +ALTER TABLE db.t CONFIGURE ZONE USING range_min_bytes = 1000, range_max_bytes=100000000; ---- query-sql @@ -79,7 +79,7 @@ SHOW ZONE CONFIGURATION FOR INDEX db.t@idx ---- INDEX db.public.t@idx ALTER INDEX db.public.t@idx CONFIGURE ZONE USING range_min_bytes = 1000, - range_max_bytes = 100000, + range_max_bytes = 100000000, gc.ttlseconds = 25, num_replicas = 7, num_voters = 5, @@ -89,9 +89,9 @@ INDEX db.public.t@idx ALTER INDEX db.public.t@idx CONFIGURE ZONE USING translate database=db table=t ---- -/Tenant/10/Table/106{-/2} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 -/Tenant/10/Table/106/{2-3} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=25 num_replicas=7 num_voters=5 -/Tenant/10/Table/10{6/3-7} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 +/Tenant/10/Table/106{-/2} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 +/Tenant/10/Table/106/{2-3} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=25 num_replicas=7 num_voters=5 +/Tenant/10/Table/10{6/3-7} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 block-gc-jobs ---- @@ -105,12 +105,12 @@ ALTER INDEX db.t@idx2 CONFIGURE ZONE USING gc.ttlseconds = 1; # Both the newly added index and the temporary index have the configured zone configuration. translate database=db table=t ---- -/Tenant/10/Table/106{-/2} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 -/Tenant/10/Table/106/{2-3} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=25 num_replicas=7 num_voters=5 -/Tenant/10/Table/106/{3-4} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 -/Tenant/10/Table/106/{4-5} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=1 num_replicas=7 -/Tenant/10/Table/106/{5-6} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=1 num_replicas=7 -/Tenant/10/Table/10{6/6-7} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 +/Tenant/10/Table/106{-/2} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 +/Tenant/10/Table/106/{2-3} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=25 num_replicas=7 num_voters=5 +/Tenant/10/Table/106/{3-4} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 +/Tenant/10/Table/106/{4-5} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=1 num_replicas=7 +/Tenant/10/Table/106/{5-6} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=1 num_replicas=7 +/Tenant/10/Table/10{6/6-7} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 unblock-gc-jobs ---- @@ -122,11 +122,11 @@ SHOW JOBS WHEN COMPLETE (SELECT job_id FROM [SHOW JOBS]) # The zone configuration for the temporary index is cleaned up translate database=db table=t ---- -/Tenant/10/Table/106{-/2} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 -/Tenant/10/Table/106/{2-3} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=25 num_replicas=7 num_voters=5 -/Tenant/10/Table/106/{3-4} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 -/Tenant/10/Table/106/{4-5} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=1 num_replicas=7 -/Tenant/10/Table/10{6/5-7} range_max_bytes=100000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 +/Tenant/10/Table/106{-/2} range_max_bytes=100000000 
range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 +/Tenant/10/Table/106/{2-3} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=25 num_replicas=7 num_voters=5 +/Tenant/10/Table/106/{3-4} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 +/Tenant/10/Table/106/{4-5} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=1 num_replicas=7 +/Tenant/10/Table/10{6/5-7} range_max_bytes=100000000 range_min_bytes=1000 ttl_seconds=3600 num_replicas=7 # Create and drop an index inside the same transaction. The related # zone configuration should also be cleaned up. diff --git a/pkg/config/zonepb/BUILD.bazel b/pkg/config/zonepb/BUILD.bazel index bb3b39a137bd..27e4cb79300f 100644 --- a/pkg/config/zonepb/BUILD.bazel +++ b/pkg/config/zonepb/BUILD.bazel @@ -13,11 +13,11 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/config/zonepb", visibility = ["//visibility:public"], deps = [ - "//pkg/base", "//pkg/clusterversion", "//pkg/keys", "//pkg/roachpb", "//pkg/sql/sem/tree", + "//pkg/util/envutil", "//pkg/util/log", "@com_github_cockroachdb_errors//:errors", "@com_github_gogo_protobuf//proto", diff --git a/pkg/config/zonepb/zone.go b/pkg/config/zonepb/zone.go index b41ca403f1d1..91c2f71d378f 100644 --- a/pkg/config/zonepb/zone.go +++ b/pkg/config/zonepb/zone.go @@ -16,11 +16,11 @@ import ( "fmt" "strings" - "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/errors" "github.com/gogo/protobuf/proto" @@ -341,6 +341,11 @@ func (z *ZoneConfig) ValidateTandemFields() error { return nil } +// minRangeMaxBytes is the minimum value for range_max_bytes. +// The default, 64 MiB, is half of the default range_min_bytes. +var minRangeMaxBytes = envutil.EnvOrDefaultInt64("COCKROACH_MIN_RANGE_MAX_BYTES", + 64<<20 /* 64 MiB */) + // Validate returns an error if the ZoneConfig specifies a known-dangerous or // disallowed configuration. func (z *ZoneConfig) Validate() error { @@ -382,9 +387,9 @@ func (z *ZoneConfig) Validate() error { } } - if z.RangeMaxBytes != nil && *z.RangeMaxBytes < base.MinRangeMaxBytes { + if z.RangeMaxBytes != nil && *z.RangeMaxBytes < minRangeMaxBytes { return fmt.Errorf("RangeMaxBytes %d less than minimum allowed %d", - *z.RangeMaxBytes, base.MinRangeMaxBytes) + *z.RangeMaxBytes, minRangeMaxBytes) } if z.RangeMinBytes != nil && *z.RangeMinBytes < 0 { diff --git a/pkg/config/zonepb/zone_test.go b/pkg/config/zonepb/zone_test.go index 3e59516d8654..9ce80265955a 100644 --- a/pkg/config/zonepb/zone_test.go +++ b/pkg/config/zonepb/zone_test.go @@ -61,6 +61,13 @@ func TestZoneConfigValidate(t *testing.T) { }, "RangeMaxBytes 0 less than minimum allowed", }, + { + ZoneConfig{ + NumReplicas: proto.Int32(1), + RangeMaxBytes: proto.Int64(60 << 20), + }, + "RangeMaxBytes 62914560 less than minimum allowed", + }, { ZoneConfig{ NumReplicas: proto.Int32(1), diff --git a/pkg/kv/kvserver/client_replica_backpressure_test.go b/pkg/kv/kvserver/client_replica_backpressure_test.go index a0c3b1c45a97..8a4dfb1cbd61 100644 --- a/pkg/kv/kvserver/client_replica_backpressure_test.go +++ b/pkg/kv/kvserver/client_replica_backpressure_test.go @@ -47,10 +47,12 @@ func TestBackpressureNotAppliedWhenReducingRangeSize(t *testing.T) { // range size parameters.
We want something not too tiny but also not too big // that it takes a while to load. const ( - rowSize = 16 << 10 // 16 KiB - dataSize = 512 << 10 // 512 KiB - numRows = dataSize / rowSize + rowSize = 5 << 20 // 5 MiB + dataSize = 200 << 20 // 200 MiB + numRows = dataSize / rowSize + min_range_max_bytes = 64 << 20 // 64 MiB ) + val := randutil.RandBytes(rRand, rowSize) // setup will set up a testcluster with a table filled with data. All splits // will be blocked until the returned closure is called. @@ -105,7 +107,7 @@ func TestBackpressureNotAppliedWhenReducingRangeSize(t *testing.T) { for i := 0; i < numRows; i++ { tdb.Exec(t, "UPSERT INTO foo VALUES ($1, $2)", - rRand.Intn(numRows), randutil.RandBytes(rRand, rowSize)) + rRand.Intn(numRows), val) } // Block splits and return. @@ -177,19 +179,18 @@ func TestBackpressureNotAppliedWhenReducingRangeSize(t *testing.T) { defer unblockSplits() tdb.Exec(t, "ALTER TABLE foo CONFIGURE ZONE USING "+ - "range_max_bytes = $1, range_min_bytes = $2", dataSize/5, dataSize/10) - waitForSpanConfig(t, tc, tablePrefix, dataSize/5) + "range_max_bytes = $1, range_min_bytes = $2", min_range_max_bytes, dataSize/10) + waitForSpanConfig(t, tc, tablePrefix, min_range_max_bytes) // Don't observe backpressure. tdb.Exec(t, "UPSERT INTO foo VALUES ($1, $2)", - rRand.Intn(10000000), randutil.RandBytes(rRand, rowSize)) + rRand.Intn(10000000), val) }) t.Run("no backpressure when much larger on new node", func(t *testing.T) { tc, args, tdb, tablePrefix, unblockSplits, _ := setup(t, 1) defer tc.Stopper().Stop(ctx) defer unblockSplits() - // We didn't want to have to load too much data into these ranges because // it makes the testing slower so let's lower the threshold at which we'll // consider the range to be way over the backpressure limit from megabytes @@ -197,15 +198,15 @@ func TestBackpressureNotAppliedWhenReducingRangeSize(t *testing.T) { tdb.Exec(t, "SET CLUSTER SETTING kv.range.backpressure_byte_tolerance = '1 KiB'") tdb.Exec(t, "ALTER TABLE foo CONFIGURE ZONE USING "+ - "range_max_bytes = $1, range_min_bytes = $2", dataSize/5, dataSize/10) - waitForSpanConfig(t, tc, tablePrefix, dataSize/5) + "range_max_bytes = $1, range_min_bytes = $2", min_range_max_bytes, dataSize/10) + waitForSpanConfig(t, tc, tablePrefix, min_range_max_bytes) // Then we'll add a new server and move the table there. moveTableToNewStore(t, tc, args, tablePrefix) // Don't observe backpressure. tdb.Exec(t, "UPSERT INTO foo VALUES ($1, $2)", - rRand.Intn(10000000), randutil.RandBytes(rRand, rowSize)) + rRand.Intn(10000000), val) }) t.Run("no backpressure when near limit on existing node", func(t *testing.T) { @@ -224,7 +225,7 @@ func TestBackpressureNotAppliedWhenReducingRangeSize(t *testing.T) { // backpressureByteTolerance. We won't see backpressure because the range // will remember its previous zone config setting. s, repl := getFirstStoreReplica(t, tc.Server(0), tablePrefix.Next()) - newMax := repl.GetMVCCStats().Total()/2 - 32<<10 + newMax := repl.GetMVCCStats().Total()/2 - 32<<20 newMin := newMax / 4 tdb.Exec(t, "ALTER TABLE foo CONFIGURE ZONE USING "+ "range_max_bytes = $1, range_min_bytes = $2", newMax, newMin) @@ -233,7 +234,7 @@ func TestBackpressureNotAppliedWhenReducingRangeSize(t *testing.T) { // Don't observe backpressure because we remember the previous max size on // this node. 
tdb.Exec(t, "UPSERT INTO foo VALUES ($1, $2)", - rRand.Intn(10000000), randutil.RandBytes(rRand, rowSize)) + rRand.Intn(10000000), val) // Allow one split to occur and make sure that the remembered value is // cleared. @@ -247,6 +248,7 @@ func TestBackpressureNotAppliedWhenReducingRangeSize(t *testing.T) { return nil }) }) + // This case is very similar to the above case but differs in that the range // is moved to a new node after the range size is decreased. This new node // never knew about the old, larger range size, and thus will backpressure @@ -260,7 +262,7 @@ func TestBackpressureNotAppliedWhenReducingRangeSize(t *testing.T) { // Now we'll change the range_max_bytes to be half the range size less a // bit. This is the range where we expect to observe backpressure. _, repl := getFirstStoreReplica(t, tc.Server(0), tablePrefix.Next()) - newMax := repl.GetMVCCStats().Total()/2 - 32<<10 + newMax := repl.GetMVCCStats().Total()/2 - 32<<20 newMin := newMax / 4 tdb.Exec(t, "ALTER TABLE foo CONFIGURE ZONE USING "+ "range_max_bytes = $1, range_min_bytes = $2", newMax, newMin) diff --git a/pkg/kv/kvserver/client_split_test.go b/pkg/kv/kvserver/client_split_test.go index 94b2e3ddfc62..8e9df55fef0c 100644 --- a/pkg/kv/kvserver/client_split_test.go +++ b/pkg/kv/kvserver/client_split_test.go @@ -1042,10 +1042,10 @@ func fillRange( return } if key == nil || !singleKey { - key = append(append([]byte(nil), prefix...), randutil.RandBytes(src, 100)...) + key = append(append([]byte(nil), prefix...), randutil.RandBytes(src, 1000)...) key = keys.MakeFamilyKey(key, src.Uint32()) } - val := randutil.RandBytes(src, int(src.Int31n(1<<8))) + val := randutil.RandBytes(src, 200000) pArgs := putArgs(key, val) _, pErr := kv.SendWrappedWith(context.Background(), store, kvpb.Header{ RangeID: rangeID, @@ -1088,7 +1088,7 @@ func TestStoreZoneUpdateAndRangeSplit(t *testing.T) { tdb.Exec(t, "CREATE TABLE t ()") var descID uint32 tdb.QueryRow(t, "SELECT 't'::regclass::int").Scan(&descID) - const maxBytes, minBytes = 1 << 16, 1 << 14 + const maxBytes, minBytes = 100 << 20, 1 << 14 tdb.Exec(t, "ALTER TABLE t CONFIGURE ZONE USING range_max_bytes = $1, range_min_bytes = $2", maxBytes, minBytes) @@ -1164,7 +1164,7 @@ func TestStoreRangeSplitWithMaxBytesUpdate(t *testing.T) { tdb.Exec(t, "CREATE TABLE t ()") var descID uint32 tdb.QueryRow(t, "SELECT 't'::regclass::int").Scan(&descID) - const maxBytes, minBytes = 1 << 16, 1 << 14 + const maxBytes, minBytes = 100 << 20, 1 << 14 tdb.Exec(t, "ALTER TABLE t CONFIGURE ZONE USING range_max_bytes = $1, range_min_bytes = $2", maxBytes, minBytes) diff --git a/pkg/kv/kvserver/replicate_queue_test.go b/pkg/kv/kvserver/replicate_queue_test.go index c8db98cfa3a6..ac94a959237e 100644 --- a/pkg/kv/kvserver/replicate_queue_test.go +++ b/pkg/kv/kvserver/replicate_queue_test.go @@ -1757,8 +1757,7 @@ func TestLargeUnsplittableRangeReplicate(t *testing.T) { skip.UnderDeadlockWithIssue(t, 38565) ctx := context.Background() - // Create a cluster with really small ranges. - const rangeMaxSize = base.MinRangeMaxBytes + const rangeMaxSize = 64 << 20 zcfg := zonepb.DefaultZoneConfig() zcfg.RangeMinBytes = proto.Int64(rangeMaxSize / 2) zcfg.RangeMaxBytes = proto.Int64(rangeMaxSize) @@ -1778,12 +1777,13 @@ func TestLargeUnsplittableRangeReplicate(t *testing.T) { ) defer tc.Stopper().Stop(ctx) - // We're going to create a table with a big row and a small row. We'll split - // the table in between the rows, to produce a large range and a small one. 
- // Then we'll increase the replication factor to 5 and check that both ranges - // behave the same - i.e. they both get up-replicated. For the purposes of - // this test we're only worried about the large one up-replicating, but we - // test the small one as a control so that we don't fool ourselves. + // We're going to create a table with many versions of a big row and a small + // row. We'll split the table in between the rows, to produce a large range + // and a small one. Then we'll increase the replication factor to 5 and check + // that both ranges behave the same - i.e. they both get up-replicated. For + // the purposes of this test we're only worried about the large one + // up-replicating, but we test the small one as a control so that we don't + // fool ourselves. // Disable the queues so they don't mess with our manual relocation. We'll // re-enable them later. @@ -1802,14 +1802,20 @@ func TestLargeUnsplittableRangeReplicate(t *testing.T) { toggleReplicationQueues(tc, true /* active */) toggleSplitQueues(tc, true /* active */) - // We're going to create a row that's larger than range_max_bytes, but not - // large enough that write back-pressuring kicks in and refuses it. + // We're going to create a large row, but not large enough that write + // back-pressuring kicks in and refuses it. var sb strings.Builder - for i := 0; i < 1.5*rangeMaxSize; i++ { + for i := 0; i < rangeMaxSize/8; i++ { sb.WriteRune('a') } - _, err = db.Exec("INSERT INTO t(i,s) VALUES (1, $1)", sb.String()) - require.NoError(t, err) + + // Write 16 versions of the same row. This way the range won't be able to split. + for i := 0; i < 16; i++ { + _, err = db.Exec("UPSERT INTO t(i,s) VALUES (1, $1)", sb.String()) + require.NoError(t, err) + } + + // Write a small row into the second range. _, err = db.Exec("INSERT INTO t(i,s) VALUES (2, 'b')") require.NoError(t, err) diff --git a/pkg/server/decommission_test.go b/pkg/server/decommission_test.go index 005d7769d12d..f8b37cfd7126 100644 --- a/pkg/server/decommission_test.go +++ b/pkg/server/decommission_test.go @@ -120,9 +120,9 @@ func TestDecommissionPreCheckEvaluation(t *testing.T) { runQueries(setupQueries...)
alterQueries := []string{ "ALTER TABLE test.tblA CONFIGURE ZONE USING num_replicas = 5, " + - "range_max_bytes = 500000, range_min_bytes = 100", + "range_max_bytes = 500000000, range_min_bytes = 100", } runQueries(alterQueries...) tblAID, err := firstSvr.admin.queryTableID(ctx, username.RootUserName(), "test", "tblA") @@ -241,7 +241,7 @@ func TestDecommissionPreCheckOddToEven(t *testing.T) { require.NoError(t, err) // Ensure all nodes have the correct span configs for tblA. - waitForSpanConfig(t, tc, rDescA.StartKey, 500000) + waitForSpanConfig(t, tc, rDescA.StartKey, 500000000) // Transfer tblA to all nodes. tc.AddVotersOrFatal(t, startKeyTblA, tc.Target(1), tc.Target(2), tc.Target(3), tc.Target(4)) diff --git a/pkg/sql/logictest/testdata/logic_test/zone_config b/pkg/sql/logictest/testdata/logic_test/zone_config index 6e185ce456ee..5f3dce3673ad 100644 --- a/pkg/sql/logictest/testdata/logic_test/zone_config +++ b/pkg/sql/logictest/testdata/logic_test/zone_config @@ -75,7 +75,7 @@ SELECT zone_id, raw_config_sql FROM [SHOW ZONE CONFIGURATION FOR TABLE a] statement ok ALTER TABLE a CONFIGURE ZONE USING range_min_bytes = 200000 + 1, - range_max_bytes = 300000 + 1, + range_max_bytes = 100000000 + 1, gc.ttlseconds = 3000 + 600, num_replicas = floor(1.2)::int, constraints = '[+region=test]', @@ -103,7 +103,7 @@ SELECT zone_id, raw_config_sql FROM [SHOW ZONE CONFIGURATION FOR TABLE a] ---- 106 ALTER TABLE a CONFIGURE ZONE USING range_min_bytes = 200001, - range_max_bytes = 300001, + range_max_bytes = 100000001, gc.ttlseconds = 3600, num_replicas = 1, constraints = '[+region=test]', @@ -111,14 +111,14 @@ SELECT zone_id, raw_config_sql FROM [SHOW ZONE CONFIGURATION FOR TABLE a] # Check that we can set just one value without altering the others. statement ok -ALTER TABLE a CONFIGURE ZONE USING range_max_bytes = 400000 +ALTER TABLE a CONFIGURE ZONE USING range_max_bytes = 400000000 query IT SELECT zone_id, raw_config_sql FROM [SHOW ZONE CONFIGURATION FOR TABLE a] ---- 106 ALTER TABLE a CONFIGURE ZONE USING range_min_bytes = 200001, - range_max_bytes = 400000, + range_max_bytes = 400000000, gc.ttlseconds = 3600, num_replicas = 1, constraints = '[+region=test]', @@ -154,7 +154,7 @@ SELECT zone_id, raw_config_sql FROM [SHOW ZONE CONFIGURATION FOR TABLE a] ---- 106 ALTER TABLE a CONFIGURE ZONE USING range_min_bytes = 200001, - range_max_bytes = 400000, + range_max_bytes = 400000000, gc.ttlseconds = 3600, num_replicas = 1, constraints = '[+region=test]', @@ -170,7 +170,7 @@ a CREATE TABLE public.a ( ); ALTER TABLE test.public.a CONFIGURE ZONE USING range_min_bytes = 200001, - range_max_bytes = 400000, + range_max_bytes = 400000000, gc.ttlseconds = 3600, num_replicas = 1, constraints = '[+region=test]', diff --git a/pkg/sql/logictest/testdata/logic_test/zone_config_system_tenant b/pkg/sql/logictest/testdata/logic_test/zone_config_system_tenant index 4b421056f8e0..7d09026e5155 100644 --- a/pkg/sql/logictest/testdata/logic_test/zone_config_system_tenant +++ b/pkg/sql/logictest/testdata/logic_test/zone_config_system_tenant @@ -95,7 +95,7 @@ SELECT 'db2.t'::REGCLASS::INT # in either of these tables for id = $t_id. 
statement ok BEGIN; -ALTER TABLE db2.t CONFIGURE ZONE USING range_max_bytes = 1<<24, range_min_bytes = 1<<20; +ALTER TABLE db2.t CONFIGURE ZONE USING range_max_bytes = 64<<20, range_min_bytes = 1<<20; DROP TABLE db2.t; COMMIT; @@ -105,7 +105,7 @@ SELECT crdb_internal.pb_to_json('cockroach.roachpb.SpanConfig', config) FROM system.span_configurations WHERE end_key > (SELECT crdb_internal.table_span($t_id)[1]) ---- -{"gcPolicy": {"ttlSeconds": 14400}, "numReplicas": 3, "rangeMaxBytes": "16777216", "rangeMinBytes": "1048576"} +{"gcPolicy": {"ttlSeconds": 14400}, "numReplicas": 3, "rangeMaxBytes": "67108864", "rangeMinBytes": "1048576"} statement ok CREATE TABLE db2.t2 (i INT PRIMARY KEY); @@ -123,7 +123,7 @@ SELECT crdb_internal.pb_to_json('cockroach.roachpb.SpanConfig', config) FROM system.span_configurations WHERE end_key > (SELECT crdb_internal.table_span($t_id)[1]) ---- -{"gcPolicy": {"ttlSeconds": 90001}, "numReplicas": 3, "rangeMaxBytes": "16777216", "rangeMinBytes": "1048576"} +{"gcPolicy": {"ttlSeconds": 90001}, "numReplicas": 3, "rangeMaxBytes": "67108864", "rangeMinBytes": "1048576"} {"gcPolicy": {"ttlSeconds": 90001}, "numReplicas": 3, "rangeMaxBytes": "1073741824", "rangeMinBytes": "67108864"} # Check that dropped relations can have their GC TTLs altered. diff --git a/pkg/sql/zone_config_test.go b/pkg/sql/zone_config_test.go index 521c68106684..a303f7f67006 100644 --- a/pkg/sql/zone_config_test.go +++ b/pkg/sql/zone_config_test.go @@ -95,7 +95,7 @@ func TestGetZoneConfig(t *testing.T) { defaultZoneConfig := zonepb.DefaultSystemZoneConfig() defaultZoneConfig.NumReplicas = proto.Int32(1) defaultZoneConfig.RangeMinBytes = proto.Int64(1 << 20) - defaultZoneConfig.RangeMaxBytes = proto.Int64(1 << 21) + defaultZoneConfig.RangeMaxBytes = proto.Int64(100 << 20) defaultZoneConfig.GC = &zonepb.GCPolicy{TTLSeconds: 60} require.NoError(t, defaultZoneConfig.Validate()) params.Knobs.Server = &server.TestingKnobs{ @@ -331,7 +331,7 @@ func TestCascadingZoneConfig(t *testing.T) { defaultZoneConfig := zonepb.DefaultZoneConfig() defaultZoneConfig.NumReplicas = proto.Int32(1) defaultZoneConfig.RangeMinBytes = proto.Int64(1 << 20) - defaultZoneConfig.RangeMaxBytes = proto.Int64(1 << 21) + defaultZoneConfig.RangeMaxBytes = proto.Int64(100 << 20) defaultZoneConfig.GC = &zonepb.GCPolicy{TTLSeconds: 60} require.NoError(t, defaultZoneConfig.Validate()) params.Knobs.Server = &server.TestingKnobs{ From 3950caba0de46996eff2acdfc391e8232376a5e2 Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Thu, 23 Mar 2023 09:51:17 -0400 Subject: [PATCH 2/4] roachtest: lower default timeout The previous default of 10 hours is unnecessarily long. As a consequence, if a test has a bug that causes it to hang, it will consume 10 hours' worth of cluster usage, even if it's a test that generally succeeds in half an hour. This should hopefully help with the timeouts we have been seeing in roachtest nightly runs.
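Tests that legitimately need more than the lowered default can still opt in to a longer budget by setting an explicit Timeout on their registry.TestSpec, which is what the changes below do for the known long-running suites. A minimal sketch of such an override (the test name and the no-op body are illustrative, not part of this change):

    r.Add(registry.TestSpec{
        // Hypothetical test that is known to exceed the new 3h default.
        Name:    "example/long-running",
        Owner:   registry.OwnerTestEng,
        Cluster: r.MakeClusterSpec(1),
        // Explicit per-test override of the runner's default timeout.
        Timeout: 5 * time.Hour,
        Run:     func(ctx context.Context, t test.Test, c cluster.Cluster) {},
    })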
Epic: none Release note: None --- pkg/cmd/roachtest/test_runner.go | 2 +- pkg/cmd/roachtest/tests/activerecord.go | 2 ++ pkg/cmd/roachtest/tests/multitenant_upgrade.go | 1 + pkg/cmd/roachtest/tests/ruby_pg.go | 2 ++ pkg/cmd/roachtest/tests/tpcc.go | 2 ++ pkg/cmd/roachtest/tests/tpce.go | 2 +- pkg/cmd/roachtest/tests/version.go | 1 + 7 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index a5fcd8915e0b..014214a33c0e 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -947,7 +947,7 @@ func (r *testRunner) runTest( t.start = timeutil.Now() - timeout := 10 * time.Hour + timeout := 3 * time.Hour if d := s.Timeout; d != 0 { timeout = d } diff --git a/pkg/cmd/roachtest/tests/activerecord.go b/pkg/cmd/roachtest/tests/activerecord.go index bbf0af03f363..950b7a9d2b57 100644 --- a/pkg/cmd/roachtest/tests/activerecord.go +++ b/pkg/cmd/roachtest/tests/activerecord.go @@ -16,6 +16,7 @@ import ( "context" "fmt" "regexp" + "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" @@ -243,6 +244,7 @@ func registerActiveRecord(r registry.Registry) { r.Add(registry.TestSpec{ Name: "activerecord", Owner: registry.OwnerSQLSessions, + Timeout: 5 * time.Hour, Cluster: r.MakeClusterSpec(1), NativeLibs: registry.LibGEOS, Tags: []string{`default`, `orm`}, diff --git a/pkg/cmd/roachtest/tests/multitenant_upgrade.go b/pkg/cmd/roachtest/tests/multitenant_upgrade.go index aac99172f946..c3dcc090cc2c 100644 --- a/pkg/cmd/roachtest/tests/multitenant_upgrade.go +++ b/pkg/cmd/roachtest/tests/multitenant_upgrade.go @@ -30,6 +30,7 @@ import ( func registerMultiTenantUpgrade(r registry.Registry) { r.Add(registry.TestSpec{ Name: "multitenant-upgrade", + Timeout: 1 * time.Hour, Cluster: r.MakeClusterSpec(2), Owner: registry.OwnerMultiTenant, NonReleaseBlocker: false, diff --git a/pkg/cmd/roachtest/tests/ruby_pg.go b/pkg/cmd/roachtest/tests/ruby_pg.go index 58924dd99eff..4b5c0d44c3ef 100644 --- a/pkg/cmd/roachtest/tests/ruby_pg.go +++ b/pkg/cmd/roachtest/tests/ruby_pg.go @@ -17,6 +17,7 @@ import ( "fmt" "regexp" "strconv" + "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" @@ -227,6 +228,7 @@ func registerRubyPG(r registry.Registry) { r.Add(registry.TestSpec{ Name: "ruby-pg", + Timeout: 1 * time.Hour, Owner: registry.OwnerSQLSessions, Cluster: r.MakeClusterSpec(1), NativeLibs: registry.LibGEOS, diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index 9d9650804bde..1fd290e70115 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -536,6 +536,7 @@ func registerTPCC(r registry.Registry) { r.Add(registry.TestSpec{ // run the same mixed-headroom test, but going back two versions Name: "tpcc/mixed-headroom/multiple-upgrades/" + mixedHeadroomSpec.String(), + Timeout: 5 * time.Hour, Owner: registry.OwnerTestEng, Tags: []string{`default`}, Cluster: mixedHeadroomSpec, @@ -1083,6 +1084,7 @@ func registerTPCCBenchSpec(r registry.Registry, b tpccBenchSpec) { Name: name, Owner: owner, Cluster: nodes, + Timeout: 5 * time.Hour, Tags: b.Tags, EncryptionSupport: encryptionSupport, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { diff --git a/pkg/cmd/roachtest/tests/tpce.go b/pkg/cmd/roachtest/tests/tpce.go index 5b04457da2f4..aa4cf04703ef 100644 --- a/pkg/cmd/roachtest/tests/tpce.go +++ 
b/pkg/cmd/roachtest/tests/tpce.go @@ -108,7 +108,7 @@ func registerTPCE(r registry.Registry) { for _, opts := range []tpceOptions{ // Nightly, small scale configurations. - {customers: 5_000, nodes: 3, cpus: 4, ssds: 1}, + {customers: 5_000, nodes: 3, cpus: 4, ssds: 1, timeout: 4 * time.Hour}, // Weekly, large scale configurations. {customers: 100_000, nodes: 5, cpus: 32, ssds: 2, tags: []string{"weekly"}, timeout: 36 * time.Hour}, } { diff --git a/pkg/cmd/roachtest/tests/version.go b/pkg/cmd/roachtest/tests/version.go index 627094ab7bdc..940c5f3125f2 100644 --- a/pkg/cmd/roachtest/tests/version.go +++ b/pkg/cmd/roachtest/tests/version.go @@ -219,6 +219,7 @@ func registerVersion(r registry.Registry) { for _, n := range []int{3, 5} { r.Add(registry.TestSpec{ Name: fmt.Sprintf("version/mixed/nodes=%d", n), + Timeout: 4 * time.Hour, Owner: registry.OwnerTestEng, Cluster: r.MakeClusterSpec(n + 1), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { From b7d6f75a803217061fc3e4dd32a7439b32acc131 Mon Sep 17 00:00:00 2001 From: Michael Butler Date: Wed, 22 Mar 2023 15:48:55 -0400 Subject: [PATCH 3/4] sql/keys introduce key decoders that do not rely on a codec The codec normally used to decode and encode crdb keys requires a tenant prefix upon creation. For callers that want to manipulate keys across tenants, the per tenant codec provides a poor experience. This patch provides a few helper functions to strip key prefixes without a codec. Informs #98570 Release note: None --- pkg/keys/sql.go | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/pkg/keys/sql.go b/pkg/keys/sql.go index 750b764ad07a..9bc5489530a0 100644 --- a/pkg/keys/sql.go +++ b/pkg/keys/sql.go @@ -65,6 +65,54 @@ func DecodeTenantPrefixE(key roachpb.Key) ([]byte, roachpb.TenantID, error) { return rem, id, nil } +// StripTenantPrefix removes the tenant prefix from the provided key. This +// function should be used instead of sqlDecoder.StripTenantPrefix, if the user +// cannot instantiate a codec that operates on a single tenant. +func StripTenantPrefix(key roachpb.Key) ([]byte, error) { + var err error + if len(key) == 0 { // key.Equal(roachpb.RKeyMin) + return nil, nil + } + if key[0] != tenantPrefixByte { + return key, nil + } + key, _, err = encoding.DecodeUvarintAscending(key[1:]) + return key, err +} + +// StripTablePrefix removes the table and tenant prefix from the provided key. +// This function should be used instead of sqlDecoder.DecodeTablePrefix, if the +// user cannot instantiate a codec that operates on a single tenant. +func StripTablePrefix(key roachpb.Key) ([]byte, error) { + var err error + key, err = StripTenantPrefix(key) + if err != nil { + return nil, err + } + if encoding.PeekType(key) != encoding.Int { + return nil, errors.Errorf("invalid table key prefix: %q", key) + } + key, _, err = encoding.DecodeUvarintAscending(key) + return key, err +} + +// StripIndexPrefix removes the index, table and tenant prefix from the provided +// key. This function should be used instead of sqlDecoder.DecodeIndexPrefix, if +// the user cannot instantiate a codec that operates on a single tenant. +func StripIndexPrefix(key roachpb.Key) ([]byte, error) { + var err error + // StripTablePrefix automatically removes the tenant prefix as well. 
+ key, err = StripTablePrefix(key) + if err != nil { + return nil, err + } + if encoding.PeekType(key) != encoding.Int { + return nil, errors.Errorf("invalid index key prefix: %q", key) + } + key, _, err = encoding.DecodeUvarintAscending(key) + return key, err +} + // SQLCodec provides methods for encoding SQL table keys bound to a given // tenant. The generator also provides methods for efficiently decoding keys // previously generated by it. The generated keys are safe to use indefinitely From cf912029095dc4ae105a6635b1770e27b46e4ff9 Mon Sep 17 00:00:00 2001 From: Michael Butler Date: Tue, 14 Mar 2023 12:52:25 -0400 Subject: [PATCH 4/4] backupccl: create stripped crdb_internal.fingerprint overload This patch adds a variant of crdb_internal.fingerprint(), which creates a "stripped" fingerprint of the target span. Namely, `crdb_internal.fingerprint(span,true)` will return a fingerprint that is agnostic to the mvcc timestamps and the index prefix of the key, and considers only the latest mvcc history of the key span. This builtin should only be used over user table space, i.e. on keys with an index prefix. For example, suppose the user fingerprinted a table at some system time, then backed up and restored it to that same system time. The restored table should have the same fingerprint! crdb_internal.fingerprint(span,false) is equivalent to crdb_internal.fingerprint(span,NULL,LATEST). This fingerprint variant is significantly more scalable than SHOW EXPERIMENTAL FINGERPRINT, as it uses export requests rather than a simple scan. Fixes #98570 Release note: None --- docs/generated/sql/functions.md | 2 + pkg/ccl/backupccl/backup_test.go | 7 +- pkg/ccl/backupccl/backuprand/BUILD.bazel | 1 + .../backupccl/backuprand/backup_rand_test.go | 9 +- pkg/kv/kvpb/api.proto | 8 +- pkg/kv/kvserver/batcheval/cmd_export.go | 19 +- pkg/sql/sem/builtins/BUILD.bazel | 1 + pkg/sql/sem/builtins/builtins.go | 191 +++++------------- .../sem/builtins/fingerprint_builtin_test.go | 21 ++ pkg/sql/sem/builtins/fingerprint_builtins.go | 174 ++++++++++++++++ pkg/sql/sem/builtins/fixed_oids.go | 1 + pkg/storage/fingerprint_writer.go | 32 ++- pkg/storage/mvcc.go | 4 + pkg/storage/mvcc_history_test.go | 5 +- .../export_fingerprint_no_timestamps | 77 +++++++ pkg/testutils/sqlutils/BUILD.bazel | 1 + pkg/testutils/sqlutils/fingerprint.go | 30 +++ 17 files changed, 427 insertions(+), 156 deletions(-) create mode 100644 pkg/sql/sem/builtins/fingerprint_builtins.go create mode 100644 pkg/storage/testdata/mvcc_histories/export_fingerprint_no_timestamps create mode 100644 pkg/testutils/sqlutils/fingerprint.go diff --git a/docs/generated/sql/functions.md b/docs/generated/sql/functions.md index e4a8cdc76c12..ef43c0f04b9e 100644 --- a/docs/generated/sql/functions.md +++ b/docs/generated/sql/functions.md @@ -3127,6 +3127,8 @@ may increase either contention or retry errors, or both. (Stable)

 crdb_internal.fingerprint(span: bytes[], start_time: timestamptz, all_revisions: bool) → int
   This function is used only by CockroachDB's developers for testing purposes. (Stable)
+crdb_internal.fingerprint(span: bytes[], stripped: bool) → int
+  This function is used only by CockroachDB's developers for testing purposes. (Stable)
 crdb_internal.force_assertion_error(msg: string) → int
   This function is used only by CockroachDB's developers for testing purposes. (Volatile)
 crdb_internal.force_error(errorCode: string, msg: string) → int
   This function is used only by CockroachDB's developers for testing purposes. (Volatile)
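For reference, a usage sketch of the overload documented above, in the spirit of the sqlutils.FingerprintTable helper this patch adds (the table ID 106 is illustrative; crdb_internal.table_span is the same helper used by the logic tests earlier in this series):

    -- Stripped fingerprint of table 106's latest keys: MVCC timestamps and
    -- the index prefix are ignored, so a table restored from a backup should
    -- fingerprint identically to the table it was backed up from.
    SELECT crdb_internal.fingerprint(crdb_internal.table_span(106), true);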
diff --git a/pkg/ccl/backupccl/backup_test.go b/pkg/ccl/backupccl/backup_test.go index 4354b7d7db06..52300810646b 100644 --- a/pkg/ccl/backupccl/backup_test.go +++ b/pkg/ccl/backupccl/backup_test.go @@ -1080,6 +1080,8 @@ SELECT payload FROM "".crdb_internal.system_jobs ORDER BY created DESC LIMIT 10 sqlDB.Exec(t, incBackupQuery, queryArgs...) } + bankTableID := sqlutils.QueryTableID(t, conn, "data", "public", "bank") + backupTableFingerprint := sqlutils.FingerprintTable(t, sqlDB, bankTableID) sqlDB.Exec(t, `DROP DATABASE data CASCADE`) @@ -1100,7 +1102,7 @@ SELECT payload FROM "".crdb_internal.system_jobs ORDER BY created DESC LIMIT 10 restoreQuery = fmt.Sprintf("%s WITH kms = %s", restoreQuery, kmsURIFmtString) } queryArgs := append(restoreURIArgs, kmsURIArgs...) - verifyRestoreData(t, sqlDB, storageSQLDB, restoreQuery, queryArgs, numAccounts) + verifyRestoreData(t, sqlDB, storageSQLDB, restoreQuery, queryArgs, numAccounts, backupTableFingerprint) } func verifyRestoreData( @@ -1110,6 +1112,7 @@ func verifyRestoreData( restoreQuery string, restoreURIArgs []interface{}, numAccounts int, + bankStrippedFingerprint int, ) { var unused string var restored struct { @@ -1160,6 +1163,8 @@ func verifyRestoreData( t.Fatal("unexpected span start at primary index") } } + restorebankID := sqlutils.QueryTableID(t, sqlDB.DB, "data", "public", "bank") + require.Equal(t, bankStrippedFingerprint, sqlutils.FingerprintTable(t, sqlDB, restorebankID)) } func TestBackupRestoreSystemTables(t *testing.T) { diff --git a/pkg/ccl/backupccl/backuprand/BUILD.bazel b/pkg/ccl/backupccl/backuprand/BUILD.bazel index 5c782131328f..7a1597d20f90 100644 --- a/pkg/ccl/backupccl/backuprand/BUILD.bazel +++ b/pkg/ccl/backupccl/backuprand/BUILD.bazel @@ -30,6 +30,7 @@ go_test( "//pkg/util/log", "//pkg/util/randutil", "@com_github_stretchr_testify//assert", + "@com_github_stretchr_testify//require", ], ) diff --git a/pkg/ccl/backupccl/backuprand/backup_rand_test.go b/pkg/ccl/backupccl/backuprand/backup_rand_test.go index 40f684ea111b..9da5923a656d 100644 --- a/pkg/ccl/backupccl/backuprand/backup_rand_test.go +++ b/pkg/ccl/backupccl/backuprand/backup_rand_test.go @@ -28,6 +28,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/randutil" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) // TestBackupRestoreRandomDataRoundtrips conducts backup/restore roundtrips on @@ -97,12 +98,13 @@ database_name = 'rand' AND schema_name = 'public'`) } expectedCreateTableStmt := make(map[string]string) - expectedData := make(map[string][][]string) + expectedData := make(map[string]int) for _, tableName := range tableNames { expectedCreateTableStmt[tableName] = sqlDB.QueryStr(t, fmt.Sprintf(`SELECT create_statement FROM [SHOW CREATE TABLE %s]`, tree.NameString(tableName)))[0][0] if runSchemaOnlyExtension == "" { - expectedData[tableName] = sqlDB.QueryStr(t, fmt.Sprintf(`SELECT * FROM %s`, tree.NameString(tableName))) + tableID := sqlutils.QueryTableID(t, sqlDB.DB, "rand", "public", tableName) + expectedData[tableName] = sqlutils.FingerprintTable(t, sqlDB, tableID) } } @@ -135,7 +137,8 @@ database_name = 'rand' AND schema_name = 'public'`) assert.Equal(t, expectedCreateTableStmt[tableName], createStmt, "SHOW CREATE %s not equal after RESTORE", tableName) if runSchemaOnlyExtension == "" { - sqlDB.CheckQueryResults(t, fmt.Sprintf(`SELECT * FROM %s`, restoreTable), expectedData[tableName]) + tableID := sqlutils.QueryTableID(t, sqlDB.DB, "restoredb", "public", tableName) + 
require.Equal(t, expectedData[tableName], sqlutils.FingerprintTable(t, sqlDB, tableID)) } else { sqlDB.CheckQueryResults(t, fmt.Sprintf(`SELECT count(*) FROM %s`, restoreTable), [][]string{{"0"}}) diff --git a/pkg/kv/kvpb/api.proto b/pkg/kv/kvpb/api.proto index 303b60458029..625c3fff8cee 100644 --- a/pkg/kv/kvpb/api.proto +++ b/pkg/kv/kvpb/api.proto @@ -1696,7 +1696,13 @@ message ExportRequest { reserved 2, 5, 7, 8, 11; - // Next ID: 15 + FingerprintOptions fingerprint_options = 15 [(gogoproto.nullable) = false]; + + // Next ID: 16 +} + +message FingerprintOptions { + bool strip_index_prefix_and_timestamp = 1; } // BulkOpSummary summarizes the data processed by an operation, counting the diff --git a/pkg/kv/kvserver/batcheval/cmd_export.go b/pkg/kv/kvserver/batcheval/cmd_export.go index 25713c2f7076..a435e4e3d941 100644 --- a/pkg/kv/kvserver/batcheval/cmd_export.go +++ b/pkg/kv/kvserver/batcheval/cmd_export.go @@ -194,8 +194,23 @@ func evalExport( // values before fingerprinting so that the fingerprint is tenant // agnostic. opts.FingerprintOptions = storage.MVCCExportFingerprintOptions{ - StripTenantPrefix: true, - StripValueChecksum: true, + StripTenantPrefix: true, + StripValueChecksum: true, + StripIndexPrefixAndTimestamp: args.FingerprintOptions.StripIndexPrefixAndTimestamp, + } + if opts.FingerprintOptions.StripIndexPrefixAndTimestamp && args.MVCCFilter == kvpb.MVCCFilter_All { + // If a key's value were updated from a to b, the xor hash without + // timestamps of those two mvcc values would look the same if the key + // were updated from b to a. In other words, the order of key value + // updates without timestamps does not affect the xor hash; but this + // order clearly presents different mvcc history, therefore, do not + // strip timestamps if fingerprinting all mvcc history. + return result.Result{}, errors.New("cannot fingerprint without mvcc timestamps and with mvcc history") + } + if opts.FingerprintOptions.StripIndexPrefixAndTimestamp && !args.StartTime.IsEmpty() { + // Supplying a start time only complicates results (e.g. it surfaces + // tombstones), given that only the latest keys are surfaced.
+ return result.Result{}, errors.New("cannot fingerprint without mvcc timestamps and with a start time") } var hasRangeKeys bool summary, resumeInfo, fingerprint, hasRangeKeys, err = storage.MVCCExportFingerprint(ctx, diff --git a/pkg/sql/sem/builtins/BUILD.bazel b/pkg/sql/sem/builtins/BUILD.bazel index 638635796fd1..7ddf32853374 100644 --- a/pkg/sql/sem/builtins/BUILD.bazel +++ b/pkg/sql/sem/builtins/BUILD.bazel @@ -9,6 +9,7 @@ go_library( "all_builtins.go", "builtins.go", "compression.go", + "fingerprint_builtins.go", "fixed_oids.go", "generator_builtins.go", "generator_probe_ranges.go", diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index f7308fec7ef4..0442e386b476 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -42,7 +42,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" - "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/security/password" @@ -77,10 +76,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil" "github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry" "github.com/cockroachdb/cockroach/pkg/sql/types" - "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/util" - "github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb" - "github.com/cockroachdb/cockroach/pkg/util/contextutil" "github.com/cockroachdb/cockroach/pkg/util/duration" "github.com/cockroachdb/cockroach/pkg/util/encoding" "github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented" @@ -7524,158 +7520,61 @@ expires until the statement bundle is collected`, Category: builtinconstants.CategorySystemInfo, }, tree.Overload{ + // If the second arg is set to true, this overload allows the caller to + // execute a "stripped" fingerprint on the latest keys in a span. This + // stripped fingerprint strips each key's timestamp and index prefix + // before hashing, enabling a user to assert that two different tables + // have the same latest keys, for example. Because the index prefix is + // stripped, this option should only get used in the table key space. + // + // If the stripped param is set to false, this overload is equivalent to + // 'crdb_internal.fingerprint(span,NULL,LATEST)' + Types: tree.ParamTypes{ {Name: "span", Typ: types.BytesArray}, - {Name: "start_time", Typ: types.TimestampTZ}, - {Name: "all_revisions", Typ: types.Bool}, + {Name: "stripped", Typ: types.Bool}, // NB: The function can be called with an AOST clause that will be used // as the `end_time` when issuing the ExportRequests for the purposes of // fingerprinting. 
 				},
 				ReturnType: tree.FixedReturnType(types.Int),
 				Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
-					ctx, sp := tracing.ChildSpan(ctx, "crdb_internal.fingerprint")
-					defer sp.Finish()
-
-					if !evalCtx.Settings.Version.IsActive(ctx, clusterversion.V23_1) {
-						return nil, errors.Errorf("cannot use crdb_internal.fingerprint until the cluster version is at least %s",
-							clusterversion.V23_1.String())
+					if len(args) != 2 {
+						return nil, errors.New("argument list must have two elements")
 					}
-
-					isAdmin, err := evalCtx.SessionAccessor.HasAdminRole(ctx)
+					span, err := parseSpan(args[0])
 					if err != nil {
 						return nil, err
 					}
-					if !isAdmin {
-						return nil, errors.New("crdb_internal.fingerprint() requires admin privilege")
+					skipTimestamp := bool(tree.MustBeDBool(args[1]))
+					return fingerprint(ctx, evalCtx, span, hlc.Timestamp{} /* startTime */, false, /* allRevisions */
+						skipTimestamp)
+				},
+				Info:       "This function is used only by CockroachDB's developers for testing purposes.",
+				Volatility: volatility.Stable,
+			},
+			tree.Overload{
+				Types: tree.ParamTypes{
+					{Name: "span", Typ: types.BytesArray},
+					{Name: "start_time", Typ: types.TimestampTZ},
+					{Name: "all_revisions", Typ: types.Bool},
+					// NB: The function can be called with an AOST clause that will be used
+					// as the `end_time` when issuing the ExportRequests for the purposes of
+					// fingerprinting.
+				},
+				ReturnType: tree.FixedReturnType(types.Int),
+				Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
+					if len(args) != 3 {
+						return nil, errors.New("argument list must have three elements")
 					}
-
-					arr := tree.MustBeDArray(args[0])
-					if arr.Len() != 2 {
-						return nil, errors.New("expected an array of two elements")
+					span, err := parseSpan(args[0])
+					if err != nil {
+						return nil, err
 					}
-					startKey := []byte(tree.MustBeDBytes(arr.Array[0]))
-					endKey := []byte(tree.MustBeDBytes(arr.Array[1]))
-					startTime := args[1].(*tree.DTimestampTZ).Time
+					startTime := tree.MustBeDTimestampTZ(args[1]).Time
 					startTimestamp := hlc.Timestamp{WallTime: startTime.UnixNano()}
-					allRevisions := *args[2].(*tree.DBool)
-					filter := kvpb.MVCCFilter_Latest
-					if allRevisions {
-						filter = kvpb.MVCCFilter_All
-					}
-
-					header := kvpb.Header{
-						Timestamp: evalCtx.Txn.ReadTimestamp(),
-						// We set WaitPolicy to Error, so that the export will return an error
-						// to us instead of a blocking wait if it hits any other txns.
-						//
-						// TODO(adityamaru): We might need to handle WriteIntentErrors
-						// specially in the future so as to allow the fingerprint to complete
-						// in the face of intents.
-						WaitPolicy: lock.WaitPolicy_Error,
-						// TODO(ssd): Setting this disables async sending in
-						// DistSender so it likely substantially impacts
-						// performance.
-						ReturnElasticCPUResumeSpans: true,
-					}
-					admissionHeader := kvpb.AdmissionHeader{
-						Priority:                 int32(admissionpb.BulkNormalPri),
-						CreateTime:               timeutil.Now().UnixNano(),
-						Source:                   kvpb.AdmissionHeader_FROM_SQL,
-						NoMemoryReservedAtSource: true,
-					}
-
-					todo := make(chan kvpb.RequestHeader, 1)
-					todo <- kvpb.RequestHeader{Key: startKey, EndKey: endKey}
-
-					ctxDone := ctx.Done()
-					var fingerprint uint64
-					// TODO(adityamaru): Memory monitor this slice of buffered SSTs that
-					// contain range keys across ExportRequests.
- ssts := make([][]byte, 0) - for { - select { - case <-ctxDone: - return nil, ctx.Err() - case reqHeader := <-todo: - req := &kvpb.ExportRequest{ - RequestHeader: reqHeader, - StartTime: startTimestamp, - MVCCFilter: filter, - ExportFingerprint: true, - } - var rawResp kvpb.Response - var pErr *kvpb.Error - exportRequestErr := contextutil.RunWithTimeout(ctx, - fmt.Sprintf("ExportRequest fingerprint for span %s", roachpb.Span{Key: startKey, EndKey: endKey}), - 5*time.Minute, func(ctx context.Context) error { - rawResp, pErr = kv.SendWrappedWithAdmission(ctx, - evalCtx.Txn.DB().NonTransactionalSender(), header, admissionHeader, req) - if pErr != nil { - return pErr.GoError() - } - return nil - }) - if exportRequestErr != nil { - return nil, exportRequestErr - } - - resp := rawResp.(*kvpb.ExportResponse) - for _, file := range resp.Files { - fingerprint = fingerprint ^ file.Fingerprint - - // Aggregate all the range keys that need fingerprinting once all - // ExportRequests have been completed. - if len(file.SST) != 0 { - ssts = append(ssts, file.SST) - } - } - if resp.ResumeSpan != nil { - if !resp.ResumeSpan.Valid() { - return nil, errors.Errorf("invalid resume span: %s", resp.ResumeSpan) - } - todo <- kvpb.RequestHeaderFromSpan(*resp.ResumeSpan) - } - default: - // No ExportRequests left to send. We've aggregated range keys - // across all ExportRequests and can now fingerprint them. - // - // NB: We aggregate rangekeys across ExportRequests and then - // fingerprint them on the client, instead of fingerprinting them as - // part of the ExportRequest command evaluation, because range keys - // do not have a stable, discrete identity. Their fragmentation can - // be influenced by rangekeys outside the time interval that we are - // fingerprinting, or by range splits. So, we need to "defragment" - // all the rangekey stacks we observe such that the fragmentation is - // deterministic on only the data we want to fingerprint in our key - // and time interval. - // - // Egs: - // - // t2 [-----)[----) - // - // t1 [----)[-----) - // a b c d - // - // Assume we have two rangekeys [a, c)@t1 and [b, d)@t2. They will - // fragment as shown in the diagram above. If we wish to fingerprint - // key [a-d) in time interval (t1, t2] the fragmented rangekey - // [a, c)@t1 is outside our time interval and should not influence our - // fingerprint. The iterator in `fingerprintRangekeys` will - // "defragment" the rangekey stacks [b-c)@t2 and [c-d)@t2 and - // fingerprint them as a single rangekey with bounds [b-d)@t2. 
-					rangekeyFingerprint, err := storage.FingerprintRangekeys(ctx, evalCtx.Settings,
-						storage.MVCCExportFingerprintOptions{
-							StripTenantPrefix:  true,
-							StripValueChecksum: true,
-						}, ssts)
-					if err != nil {
-						return nil, err
-					}
-					fingerprint = fingerprint ^ rangekeyFingerprint
-					return tree.NewDInt(tree.DInt(fingerprint)), nil
-					}
-					}
+					allRevisions := bool(tree.MustBeDBool(args[2]))
+					return fingerprint(ctx, evalCtx, span, startTimestamp, allRevisions, false /* stripped */)
 				},
 				Info:       "This function is used only by CockroachDB's developers for testing purposes.",
 				Volatility: volatility.Stable,
@@ -10595,3 +10494,13 @@ func prettyStatement(p tree.PrettyCfg, stmt string) (string, error) {
 	}
 	return formattedStmt.String(), nil
 }
+
+func parseSpan(arg tree.Datum) (roachpb.Span, error) {
+	arr := tree.MustBeDArray(arg)
+	if arr.Len() != 2 {
+		return roachpb.Span{}, errors.New("expected an array of two elements")
+	}
+	startKey := []byte(tree.MustBeDBytes(arr.Array[0]))
+	endKey := []byte(tree.MustBeDBytes(arr.Array[1]))
+	return roachpb.Span{Key: startKey, EndKey: endKey}, nil
+}
diff --git a/pkg/sql/sem/builtins/fingerprint_builtin_test.go b/pkg/sql/sem/builtins/fingerprint_builtin_test.go
index 7fe2451a81c8..8928d103d35f 100644
--- a/pkg/sql/sem/builtins/fingerprint_builtin_test.go
+++ b/pkg/sql/sem/builtins/fingerprint_builtin_test.go
@@ -273,3 +273,24 @@ FROM
 	db.QueryRow(t, fingerprintQuery).Scan(&fingerprint4)
 	require.NotEqual(t, fingerprint1, fingerprint4)
 }
+
+func TestFingerprintStripped(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	ctx := context.Background()
+	s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{})
+	defer s.Stopper().Stop(ctx)
+	db := sqlutils.MakeSQLRunner(sqlDB)
+	db.Exec(t, "CREATE DATABASE IF NOT EXISTS test")
+	db.Exec(t, "CREATE TABLE test.test (k PRIMARY KEY) AS SELECT generate_series(1, 10)")
+
+	// Create the same sql rows in a different table, committed at a different timestamp.
+	db.Exec(t, "CREATE TABLE test.test2 (k PRIMARY KEY) AS SELECT generate_series(1, 10)")
+
+	strippedFingerprint := func(tableName string) int {
+		tableID := sqlutils.QueryTableID(t, sqlDB, "test", "public", tableName)
+		return sqlutils.FingerprintTable(t, db, tableID)
+	}
+	require.Equal(t, strippedFingerprint("test"), strippedFingerprint("test2"))
+}
diff --git a/pkg/sql/sem/builtins/fingerprint_builtins.go b/pkg/sql/sem/builtins/fingerprint_builtins.go
new file mode 100644
index 000000000000..4f9d0c4579e1
--- /dev/null
+++ b/pkg/sql/sem/builtins/fingerprint_builtins.go
@@ -0,0 +1,174 @@
+// Copyright 2023 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+ +package builtins + +import ( + "context" + "fmt" + "time" + + "github.com/cockroachdb/cockroach/pkg/clusterversion" + "github.com/cockroachdb/cockroach/pkg/kv" + "github.com/cockroachdb/cockroach/pkg/kv/kvpb" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb" + "github.com/cockroachdb/cockroach/pkg/util/contextutil" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/errors" +) + +func fingerprint( + ctx context.Context, + evalCtx *eval.Context, + span roachpb.Span, + startTime hlc.Timestamp, + allRevisions, stripped bool, +) (tree.Datum, error) { + ctx, sp := tracing.ChildSpan(ctx, "crdb_internal.fingerprint") + defer sp.Finish() + + if !evalCtx.Settings.Version.IsActive(ctx, clusterversion.V23_1) { + return nil, errors.Errorf("cannot use crdb_internal.fingerprint until the cluster version is at least %s", + clusterversion.V23_1.String()) + } + + isAdmin, err := evalCtx.SessionAccessor.HasAdminRole(ctx) + if err != nil { + return nil, err + } + if !isAdmin { + return nil, errors.New("crdb_internal.fingerprint() requires admin privilege") + } + + filter := kvpb.MVCCFilter_Latest + if allRevisions { + filter = kvpb.MVCCFilter_All + } + header := kvpb.Header{ + Timestamp: evalCtx.Txn.ReadTimestamp(), + // We set WaitPolicy to Error, so that the export will return an error + // to us instead of a blocking wait if it hits any other txns. + // + // TODO(adityamaru): We might need to handle WriteIntentErrors + // specially in the future so as to allow the fingerprint to complete + // in the face of intents. + WaitPolicy: lock.WaitPolicy_Error, + // TODO(ssd): Setting this disables async sending in + // DistSender so it likely substantially impacts + // performance. + ReturnElasticCPUResumeSpans: true, + } + admissionHeader := kvpb.AdmissionHeader{ + Priority: int32(admissionpb.BulkNormalPri), + CreateTime: timeutil.Now().UnixNano(), + Source: kvpb.AdmissionHeader_FROM_SQL, + NoMemoryReservedAtSource: true, + } + + todo := make(chan kvpb.RequestHeader, 1) + todo <- kvpb.RequestHeader{Key: span.Key, EndKey: span.EndKey} + ctxDone := ctx.Done() + var fingerprint uint64 + // TODO(adityamaru): Memory monitor this slice of buffered SSTs that + // contain range keys across ExportRequests. 
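The loop below drives ExportRequests with a one-element buffered channel as its work queue, requeueing any ResumeSpan until the span is exhausted. A stripped-down sketch of that control flow, with hypothetical types standing in for kvpb request headers:

package main

import "fmt"

// span stands in for a kvpb.RequestHeader's key range.
type span struct{ start, end int }

// process stands in for sending one ExportRequest; it returns a non-nil
// remainder when the request stops early, as ExportResponse.ResumeSpan does.
func process(s span) *span {
	if s.end-s.start > 10 {
		return &span{s.start + 10, s.end}
	}
	return nil
}

func main() {
	// Seed the one-element work queue with the full span.
	todo := make(chan span, 1)
	todo <- span{0, 25}
	for {
		select {
		case s := <-todo:
			if resume := process(s); resume != nil {
				todo <- *resume // requeue the remainder
			}
		default:
			// Channel empty: no requests left. This is the point at which
			// the fingerprint loop turns to its buffered rangekey SSTs.
			fmt.Println("done")
			return
		}
	}
}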
+ ssts := make([][]byte, 0) + for { + select { + case <-ctxDone: + return nil, ctx.Err() + case reqHeader := <-todo: + req := &kvpb.ExportRequest{ + RequestHeader: reqHeader, + StartTime: startTime, + MVCCFilter: filter, + ExportFingerprint: true, + FingerprintOptions: kvpb.FingerprintOptions{StripIndexPrefixAndTimestamp: stripped}} + var rawResp kvpb.Response + var pErr *kvpb.Error + exportRequestErr := contextutil.RunWithTimeout(ctx, + fmt.Sprintf("ExportRequest fingerprint for span %s", roachpb.Span{Key: span.Key, + EndKey: span.EndKey}), + 5*time.Minute, func(ctx context.Context) error { + rawResp, pErr = kv.SendWrappedWithAdmission(ctx, + evalCtx.Txn.DB().NonTransactionalSender(), header, admissionHeader, req) + if pErr != nil { + return pErr.GoError() + } + return nil + }) + if exportRequestErr != nil { + return nil, exportRequestErr + } + + resp := rawResp.(*kvpb.ExportResponse) + for _, file := range resp.Files { + fingerprint = fingerprint ^ file.Fingerprint + + // Aggregate all the range keys that need fingerprinting once all + // ExportRequests have been completed. + if len(file.SST) != 0 { + ssts = append(ssts, file.SST) + } + } + if resp.ResumeSpan != nil { + if !resp.ResumeSpan.Valid() { + return nil, errors.Errorf("invalid resume span: %s", resp.ResumeSpan) + } + todo <- kvpb.RequestHeaderFromSpan(*resp.ResumeSpan) + } + default: + // No ExportRequests left to send. We've aggregated range keys + // across all ExportRequests and can now fingerprint them. + // + // NB: We aggregate rangekeys across ExportRequests and then + // fingerprint them on the client, instead of fingerprinting them as + // part of the ExportRequest command evaluation, because range keys + // do not have a stable, discrete identity. Their fragmentation can + // be influenced by rangekeys outside the time interval that we are + // fingerprinting, or by range splits. So, we need to "defragment" + // all the rangekey stacks we observe such that the fragmentation is + // deterministic on only the data we want to fingerprint in our key + // and time interval. + // + // Egs: + // + // t2 [-----)[----) + // + // t1 [----)[-----) + // a b c d + // + // Assume we have two rangekeys [a, c)@t1 and [b, d)@t2. They will + // fragment as shown in the diagram above. If we wish to fingerprint + // key [a-d) in time interval (t1, t2] the fragmented rangekey + // [a, c)@t1 is outside our time interval and should not influence our + // fingerprint. The iterator in `fingerprintRangekeys` will + // "defragment" the rangekey stacks [b-c)@t2 and [c-d)@t2 and + // fingerprint them as a single rangekey with bounds [b-d)@t2. 
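A toy version of the defragmentation the comment above describes, merging only abutting fragments that share a timestamp (hypothetical types; the real work happens over SST rangekey stacks in fingerprintRangekeys), applied to the [b-c)@t2 and [c-d)@t2 stacks from the diagram:

package main

import "fmt"

type frag struct {
	start, end string
	ts         int
}

// defragment merges adjacent fragments that abut and share a timestamp,
// mirroring what the comment above describes for rangekey stacks.
func defragment(frags []frag) []frag {
	var out []frag
	for _, f := range frags {
		if n := len(out); n > 0 && out[n-1].end == f.start && out[n-1].ts == f.ts {
			out[n-1].end = f.end
			continue
		}
		out = append(out, f)
	}
	return out
}

func main() {
	// [b,c)@t2 and [c,d)@t2 fragment only because [a,c)@t1 exists below
	// them; for the interval (t1, t2] they should hash as a single [b,d)@t2.
	fmt.Println(defragment([]frag{{"b", "c", 2}, {"c", "d", 2}})) // [{b d 2}]
}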
+ rangekeyFingerprint, err := storage.FingerprintRangekeys(ctx, evalCtx.Settings, + storage.MVCCExportFingerprintOptions{ + StripTenantPrefix: true, + StripValueChecksum: true, + StripIndexPrefixAndTimestamp: stripped, + }, ssts) + if err != nil { + return nil, err + } + fingerprint = fingerprint ^ rangekeyFingerprint + return tree.NewDInt(tree.DInt(fingerprint)), nil + } + } +} diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index 680d96e4a513..f6578eedeec2 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2377,6 +2377,7 @@ var builtinOidsArray = []string{ 2403: `ts_rank(vector: tsvector, query: tsquery, normalization: int) -> float4`, 2404: `ts_rank(vector: tsvector, query: tsquery) -> float4`, 2405: `ts_rank(weights: float[], vector: tsvector, query: tsquery) -> float4`, + 2406: `crdb_internal.fingerprint(span: bytes[], stripped: bool) -> int`, } var builtinOidsBySignature map[string]oid.Oid diff --git a/pkg/storage/fingerprint_writer.go b/pkg/storage/fingerprint_writer.go index dc79afa324d1..fc24a35d00aa 100644 --- a/pkg/storage/fingerprint_writer.go +++ b/pkg/storage/fingerprint_writer.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/tracing" "github.com/cockroachdb/errors" ) @@ -114,13 +115,11 @@ func (f *fingerprintWriter) PutRawMVCCRangeKey(key MVCCRangeKey, bytes []byte) e // PutRawMVCC implements the Writer interface. func (f *fingerprintWriter) PutRawMVCC(key MVCCKey, value []byte) error { defer f.hasher.Reset() - // Hash the key/timestamp and value of the RawMVCC. 
if err := f.hashKey(key.Key); err != nil { return err } - f.timestampBuf = EncodeMVCCTimestampToBuf(f.timestampBuf, key.Timestamp) - if err := f.hash(f.timestampBuf); err != nil { + if err := f.hashTimestamp(key.Timestamp); err != nil { return err } if err := f.hashValue(value); err != nil { @@ -147,12 +146,26 @@ func (f *fingerprintWriter) PutUnversioned(key roachpb.Key, value []byte) error } func (f *fingerprintWriter) hashKey(key []byte) error { + if f.options.StripIndexPrefixAndTimestamp { + return f.hash(f.stripIndexPrefix(key)) + } if f.options.StripTenantPrefix { return f.hash(f.stripTenantPrefix(key)) } return f.hash(key) } +func (f *fingerprintWriter) hashTimestamp(timestamp hlc.Timestamp) error { + if f.options.StripIndexPrefixAndTimestamp { + return nil + } + f.timestampBuf = EncodeMVCCTimestampToBuf(f.timestampBuf, timestamp) + if err := f.hash(f.timestampBuf); err != nil { + return err + } + return nil +} + func (f *fingerprintWriter) hashValue(value []byte) error { if f.options.StripValueChecksum { return f.hash(f.stripValueChecksum(value)) @@ -177,7 +190,15 @@ func (f *fingerprintWriter) stripValueChecksum(value []byte) []byte { } func (f *fingerprintWriter) stripTenantPrefix(key []byte) []byte { - remainder, _, err := keys.DecodeTenantPrefixE(key) + remainder, err := keys.StripTenantPrefix(key) + if err != nil { + return key + } + return remainder +} + +func (f *fingerprintWriter) stripIndexPrefix(key []byte) []byte { + remainder, err := keys.StripIndexPrefix(key) if err != nil { return key } @@ -247,8 +268,7 @@ func FingerprintRangekeys( return 0, err } for _, v := range stack.Versions { - fw.timestampBuf = EncodeMVCCTimestampToBuf(fw.timestampBuf, v.Timestamp) - if err := fw.hash(fw.timestampBuf); err != nil { + if err := fw.hashTimestamp(v.Timestamp); err != nil { return 0, err } mvccValue, ok, err := tryDecodeSimpleMVCCValue(v.Value) diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index b7035833f9cb..bf324a0f482c 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -6798,6 +6798,10 @@ type MVCCExportFingerprintOptions struct { // If StripValueChecksum is true, checksums are removed from // the value before hashing. StripValueChecksum bool + // If StripIndexPrefixAndTimestamp is true, the key's timestamp and index + // prefix are not hashed. Because the index prefix is stripped, this option + // should only get used in the table key space. + StripIndexPrefixAndTimestamp bool } // MVCCIsSpanEmptyOptions configures the MVCCIsSpanEmpty function. 
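To illustrate why stripping the index prefix makes fingerprints comparable across tables, here is a toy sketch. Real keys are binary-encoded and handled by keys.StripIndexPrefix; the slash-separated pretty-printed form below is an assumption for readability only:

package main

import (
	"fmt"
	"hash/fnv"
	"strings"
)

// stripIndexPrefix drops a pretty-printed "/Table/<id>/<index>/" prefix,
// standing in for keys.StripIndexPrefix on real encoded keys.
func stripIndexPrefix(key string) string {
	parts := strings.SplitN(key, "/", 5) // "", "Table", id, index, suffix
	if len(parts) < 5 {
		return key
	}
	return parts[4]
}

func hashKey(key string) uint64 {
	h := fnv.New64a()
	h.Write([]byte(key))
	return h.Sum64()
}

func main() {
	// The same row in two different tables: the raw keys differ, but once
	// the index prefix is stripped they hash identically.
	a, b := "/Table/104/1/5/0", "/Table/105/1/5/0"
	fmt.Println(hashKey(a) == hashKey(b))                                     // false
	fmt.Println(hashKey(stripIndexPrefix(a)) == hashKey(stripIndexPrefix(b))) // true
}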
diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go
index f36cd8abd3d7..f77c62fd045a 100644
--- a/pkg/storage/mvcc_history_test.go
+++ b/pkg/storage/mvcc_history_test.go
@@ -1381,8 +1381,9 @@ func cmdExport(e *evalCtx) error {
 		ExportAllRevisions: e.hasArg("allRevisions"),
 		StopMidKey:         e.hasArg("stopMidKey"),
 		FingerprintOptions: storage.MVCCExportFingerprintOptions{
-			StripTenantPrefix:  e.hasArg("stripTenantPrefix"),
-			StripValueChecksum: e.hasArg("stripValueChecksum"),
+			StripTenantPrefix:            e.hasArg("stripTenantPrefix"),
+			StripValueChecksum:           e.hasArg("stripValueChecksum"),
+			StripIndexPrefixAndTimestamp: e.hasArg("stripped"),
 		},
 	}
 	if e.hasArg("maxIntents") {
diff --git a/pkg/storage/testdata/mvcc_histories/export_fingerprint_no_timestamps b/pkg/storage/testdata/mvcc_histories/export_fingerprint_no_timestamps
new file mode 100644
index 000000000000..821b22190130
--- /dev/null
+++ b/pkg/storage/testdata/mvcc_histories/export_fingerprint_no_timestamps
@@ -0,0 +1,77 @@
+# Test MVCC fingerprinting when configured to ignore timestamps.
+#
+# This test ensures that when the `stripped` option is applied to a fingerprint export request, the
+# timestamps in the MVCC history of a key span are ignored. To test, a simple history is
+# constructed at ts 1-3 and then reconstructed, shifted up, at ts 4-5; fingerprints taken as of
+# ts=3 and ts=5 must then match, since the two visible states differ only in their timestamps.
+
+# Sets up the following dataset, where x is an MVCC point tombstone and o---o
+# is an MVCC range tombstone. The dashed line separates the original history
+# (ts 1-3) from its copy shifted up to ts 4-5.
+#
+#  6
+#  5          x   o---o
+#  4  a2  b1      o------o
+#  -----------------------------------------
+#  3              o---o
+#  2  a2  x
+#  1      b1      o-------o
+#     a   b   c   d   e
+
+
+run ok
+put k=a ts=2 v=a2
+put k=a ts=4 v=a2
+put k=b ts=1 v=b1
+del k=b ts=2
+put k=b ts=4 v=b1
+del k=b ts=5
+del_range_ts k=c end=e ts=1
+del_range_ts k=c end=d ts=3
+del_range_ts k=c end=e ts=4
+del_range_ts k=c end=d ts=5
+----
+del: "b": found key true
+del: "b": found key true
+>> at end:
+rangekey: {c-d}/[5.000000000,0=/ 4.000000000,0=/ 3.000000000,0=/ 1.000000000,0=/]
+rangekey: {d-e}/[4.000000000,0=/ 1.000000000,0=/]
+data: "a"/4.000000000,0 -> /BYTES/a2
+data: "a"/2.000000000,0 -> /BYTES/a2
+data: "b"/5.000000000,0 -> /
+data: "b"/4.000000000,0 -> /BYTES/b1
+data: "b"/2.000000000,0 -> /
+data: "b"/1.000000000,0 -> /BYTES/b1
+
+# Test that the stripped fingerprints are equal as of ts=3 and ts=5.
+run ok
+export fingerprint k=a end=e ts=5 stripped
+----
+export: data_size:8 fingerprint=true
+fingerprint: 4117139284438927868
+
+run ok
+export fingerprint k=a end=e ts=3 stripped
+----
+export: data_size:8 fingerprint=true
+fingerprint: 4117139284438927868
+
+# Test that the fingerprint differs as of ts=1, where different data is visible.
+run ok
+export fingerprint k=a end=e ts=1 stripped
+----
+export: data_size:8 fingerprint=true
+fingerprint: 8343199592647421771
+
+# Test that rangekey-only spans fingerprint to zero -- rangekeys are not hashed here.
+run ok
+export fingerprint k=c end=e ts=1 stripped
+----
+export: fingerprint=true
+fingerprint: 0
+
+run ok
+export fingerprint k=c end=e ts=5 stripped
+----
+export: fingerprint=true
+fingerprint: 0
diff --git a/pkg/testutils/sqlutils/BUILD.bazel b/pkg/testutils/sqlutils/BUILD.bazel
index 4866f14d94d8..c28688b2f3a8 100644
--- a/pkg/testutils/sqlutils/BUILD.bazel
+++ b/pkg/testutils/sqlutils/BUILD.bazel
@@ -4,6 +4,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
 go_library(
     name = "sqlutils",
     srcs = [
+        "fingerprint.go",
         "inject.go",
        "pg_url.go",
         "pretty.go",
diff --git a/pkg/testutils/sqlutils/fingerprint.go b/pkg/testutils/sqlutils/fingerprint.go
new file mode 100644
index 000000000000..6142275ba462
--- /dev/null
+++ b/pkg/testutils/sqlutils/fingerprint.go
@@ -0,0 +1,33 @@
+// Copyright 2023 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package sqlutils
+
+import (
+	"fmt"
+	"testing"
+)
+
+// FingerprintTable returns the stripped fingerprint of the given table's key
+// span: index prefixes and MVCC timestamps are excluded from the hash, so two
+// tables whose latest rows are identical fingerprint identically.
+func FingerprintTable(t testing.TB, sqlDB *SQLRunner, tableID uint32) int {
+	fingerprintQuery := fmt.Sprintf(`
+SELECT *
+FROM
+	crdb_internal.fingerprint(
+		crdb_internal.table_span(%d),
+		true
+	)`, tableID)
+
+	var fingerprint int
+	sqlDB.QueryRow(t, fingerprintQuery).Scan(&fingerprint)
+	return fingerprint
+}
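A sketch of how the new helper composes with QueryTableID in a backup/restore-style equality check, mirroring the assertion at the top of this patch. The package name, database names, and table names are hypothetical:

package backupccl_test // hypothetical home for this sketch

import (
	gosql "database/sql"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/stretchr/testify/require"
)

// requireTablesMatch asserts that a restored copy of a table fingerprints
// identically to its source. rawDB is the underlying *gosql.DB and runner
// wraps it as a SQLRunner.
func requireTablesMatch(t testing.TB, rawDB *gosql.DB, runner *sqlutils.SQLRunner) {
	srcID := sqlutils.QueryTableID(t, rawDB, "data", "public", "bank")
	dstID := sqlutils.QueryTableID(t, rawDB, "restored", "public", "bank")
	require.Equal(t,
		sqlutils.FingerprintTable(t, runner, srcID),
		sqlutils.FingerprintTable(t, runner, dstID))
}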