From 91f452f879e02577166377925b162dd980a61720 Mon Sep 17 00:00:00 2001 From: j82w Date: Thu, 17 Aug 2023 17:41:45 -0400 Subject: [PATCH] sql: fix sql compaction job full scan The sql-stats-compaction is failing with TransactionRetryError. This is caused by the internal executor uses the zero-values for the settings, rather than the cluster defaults. This causes `SET reorder_joins_limit = 0;` which then causes the `sql-stats-compaction` delete statement to do a full scan. The full scan is causing the query to take a long time causing other queries to conflict with it. Error: `TransactionRetryWithProtoRefreshError: TransactionRetryError: retry txn (RETRY_SERIALIZABLE - failed preemptive refresh due to a conflict: committed value on key` The fix to use the correct default value instead of 0 is made in https://github.com/cockroachdb/cockroach/pull/101486. Solution is to change the query to avoid the join and thus the full scan. Fixes: #108936 Release note (sql change): Optimized the sql-stats-compaction job delete query to avoid full scan. This helps avoid the TransactionRetryError which can cause the job to fail. --- .../testdata/sql_activity_stats_compaction | 136 +++++------------- .../persistedsqlstats/compaction_exec.go | 20 +-- 2 files changed, 40 insertions(+), 116 deletions(-) diff --git a/pkg/sql/opt/exec/execbuilder/testdata/sql_activity_stats_compaction b/pkg/sql/opt/exec/execbuilder/testdata/sql_activity_stats_compaction index 9e837c118008..d95795fab2c5 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/sql_activity_stats_compaction +++ b/pkg/sql/opt/exec/execbuilder/testdata/sql_activity_stats_compaction @@ -219,14 +219,11 @@ ALTER TABLE system.statement_statistics INJECT STATISTICS '[ query T EXPLAIN (VERBOSE) DELETE FROM system.statement_statistics -WHERE (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id) IN ( - SELECT aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id - FROM system.statement_statistics - WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8 = 0 - AND aggregated_ts < $current_agg_ts - ORDER BY aggregated_ts ASC - LIMIT 1024 -) RETURNING aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id +WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8 = 0 + AND aggregated_ts < $current_agg_ts +ORDER BY aggregated_ts ASC +LIMIT 1024 +RETURNING aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id ---- distribution: local vectorized: true @@ -236,7 +233,7 @@ vectorized: true │ └── • delete │ columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8) - │ estimated row count: 0 + │ estimated row count: 1,024 │ from: statement_statistics │ auto commit │ @@ -261,25 +258,12 @@ vectorized: true │ render total_estimated_execution_time: total_estimated_execution_time │ render p99_latency: p99_latency │ - └── • project - │ columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, statistics, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) - │ - └── • project - │ columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, statistics, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency, aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id) - │ - └── • lookup join (inner) - │ columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, statistics, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) - │ estimated row count: 0 - │ table: statement_statistics@primary - │ equality: (crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id) = (crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8,aggregated_ts,fingerprint_id,transaction_fingerprint_id,plan_hash,app_name,node_id) - │ equality cols are key - │ - └── • scan - columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8) - estimated row count: 1,024 (0.10% of the table; stats collected ago) - table: statement_statistics@primary - spans: /0-/0/2022-05-04T15:59:59.999999001Z - limit: 1024 + └── • scan + columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, statistics, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) + estimated row count: 1,024 (0.10% of the table; stats collected ago) + table: statement_statistics@primary + spans: /0-/0/2022-05-04T15:59:59.999999001Z + limit: 1024 statement ok ALTER TABLE system.transaction_statistics INJECT STATISTICS '[ @@ -422,14 +406,11 @@ ALTER TABLE system.transaction_statistics INJECT STATISTICS '[ query T EXPLAIN (VERBOSE) DELETE FROM system.transaction_statistics -WHERE (aggregated_ts, fingerprint_id, app_name, node_id) IN ( - SELECT aggregated_ts, fingerprint_id, app_name, node_id - FROM system.transaction_statistics WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8 = 0 AND aggregated_ts < $current_agg_ts ORDER BY aggregated_ts ASC LIMIT 1024 -) RETURNING aggregated_ts, fingerprint_id, app_name, node_id + RETURNING aggregated_ts, fingerprint_id, app_name, node_id ---- distribution: local vectorized: true @@ -439,7 +420,7 @@ vectorized: true │ └── • delete │ columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8) - │ estimated row count: 0 + │ estimated row count: 1,024 │ from: transaction_statistics │ auto commit │ @@ -461,32 +442,16 @@ vectorized: true │ render total_estimated_execution_time: total_estimated_execution_time │ render p99_latency: p99_latency │ - └── • project - │ columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) - │ - └── • project - │ columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency, aggregated_ts, fingerprint_id, app_name, node_id) - │ - └── • lookup join (inner) - │ columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) - │ estimated row count: 0 - │ table: transaction_statistics@primary - │ equality: (crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, aggregated_ts, fingerprint_id, app_name, node_id) = (crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8,aggregated_ts,fingerprint_id,app_name,node_id) - │ equality cols are key - │ - └── • scan - columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8) - estimated row count: 1,024 (0.10% of the table; stats collected ago) - table: transaction_statistics@primary - spans: /0-/0/2022-05-04T15:59:59.999999001Z - limit: 1024 + └── • scan + columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) + estimated row count: 1,024 (0.10% of the table; stats collected ago) + table: transaction_statistics@primary + spans: /0-/0/2022-05-04T15:59:59.999999001Z + limit: 1024 query T EXPLAIN (VERBOSE) DELETE FROM system.statement_statistics -WHERE (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id) IN ( -SELECT aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id -FROM system.statement_statistics WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8 = 0 AND ( ( @@ -501,7 +466,7 @@ AND ( AND aggregated_ts < $current_agg_ts ORDER BY aggregated_ts ASC LIMIT 1024 -) RETURNING aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id + RETURNING aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id ---- distribution: local vectorized: true @@ -511,7 +476,7 @@ vectorized: true │ └── • delete │ columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8) - │ estimated row count: 0 + │ estimated row count: 1,024 │ from: statement_statistics │ auto commit │ @@ -536,32 +501,16 @@ vectorized: true │ render total_estimated_execution_time: total_estimated_execution_time │ render p99_latency: p99_latency │ - └── • project - │ columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, statistics, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) - │ - └── • project - │ columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, statistics, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency, aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id) - │ - └── • lookup join (inner) - │ columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, statistics, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) - │ estimated row count: 0 - │ table: statement_statistics@primary - │ equality: (crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id) = (crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8,aggregated_ts,fingerprint_id,transaction_fingerprint_id,plan_hash,app_name,node_id) - │ equality cols are key - │ - └── • scan - columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8) - estimated row count: 1,024 (0.10% of the table; stats collected ago) - table: statement_statistics@primary - spans: /0/2022-05-04T14:00:00Z/"123"/"234"/"345"/"test"/1-/0/2022-05-04T15:59:59.999999001Z - limit: 1024 + └── • scan + columns: (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id, statistics, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) + estimated row count: 1,024 (0.10% of the table; stats collected ago) + table: statement_statistics@primary + spans: /0/2022-05-04T14:00:00Z/"123"/"234"/"345"/"test"/1-/0/2022-05-04T15:59:59.999999001Z + limit: 1024 query T EXPLAIN (VERBOSE) DELETE FROM system.transaction_statistics - WHERE (aggregated_ts, fingerprint_id, app_name, node_id) IN ( - SELECT aggregated_ts, fingerprint_id, app_name, node_id - FROM system.transaction_statistics WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8 = 0 AND ( ( @@ -574,7 +523,7 @@ DELETE FROM system.transaction_statistics AND aggregated_ts < $current_agg_ts ORDER BY aggregated_ts ASC LIMIT 1024 - ) RETURNING aggregated_ts, fingerprint_id, app_name, node_id + RETURNING aggregated_ts, fingerprint_id, app_name, node_id ---- distribution: local vectorized: true @@ -584,7 +533,7 @@ vectorized: true │ └── • delete │ columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8) - │ estimated row count: 0 + │ estimated row count: 1,024 │ from: transaction_statistics │ auto commit │ @@ -606,25 +555,12 @@ vectorized: true │ render total_estimated_execution_time: total_estimated_execution_time │ render p99_latency: p99_latency │ - └── • project - │ columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) - │ - └── • project - │ columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency, aggregated_ts, fingerprint_id, app_name, node_id) - │ - └── • lookup join (inner) - │ columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) - │ estimated row count: 0 - │ table: transaction_statistics@primary - │ equality: (crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, aggregated_ts, fingerprint_id, app_name, node_id) = (crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8,aggregated_ts,fingerprint_id,app_name,node_id) - │ equality cols are key - │ - └── • scan - columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8) - estimated row count: 1,024 (0.10% of the table; stats collected ago) - table: transaction_statistics@primary - spans: /0/2022-05-04T14:00:00Z/"123"/"test"/2-/0/2022-05-04T15:59:59.999999001Z - limit: 1024 + └── • scan + columns: (aggregated_ts, fingerprint_id, app_name, node_id, crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8, execution_count, service_latency, cpu_sql_nanos, contention_time, total_estimated_execution_time, p99_latency) + estimated row count: 1,024 (0.10% of the table; stats collected ago) + table: transaction_statistics@primary + spans: /0/2022-05-04T14:00:00Z/"123"/"test"/2-/0/2022-05-04T15:59:59.999999001Z + limit: 1024 statement ok RESET CLUSTER SETTING sql.stats.flush.interval diff --git a/pkg/sql/sqlstats/persistedsqlstats/compaction_exec.go b/pkg/sql/sqlstats/persistedsqlstats/compaction_exec.go index 6fcb0ff8a6b6..1ce40211c701 100644 --- a/pkg/sql/sqlstats/persistedsqlstats/compaction_exec.go +++ b/pkg/sql/sqlstats/persistedsqlstats/compaction_exec.go @@ -284,19 +284,13 @@ var ( WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8 = $1`, unconstrainedDeleteStmt: ` DELETE FROM system.statement_statistics - WHERE (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id) IN ( - SELECT aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id - FROM system.statement_statistics WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8 = $1 AND aggregated_ts < $3 ORDER BY aggregated_ts ASC LIMIT $2 - ) RETURNING aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id`, + RETURNING aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id`, constrainedDeleteStmt: ` DELETE FROM system.statement_statistics - WHERE (aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id) IN ( - SELECT aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id - FROM system.statement_statistics WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_plan_hash_transaction_fingerprint_id_shard_8 = $1 AND ( ( @@ -311,7 +305,7 @@ var ( AND aggregated_ts < $3 ORDER BY aggregated_ts ASC LIMIT $2 - ) RETURNING aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id`, + RETURNING aggregated_ts, fingerprint_id, transaction_fingerprint_id, plan_hash, app_name, node_id`, } txnStatsCleanupOps = &cleanupOperations{ initialScanStmtTemplate: ` @@ -321,19 +315,13 @@ var ( WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8 = $1`, unconstrainedDeleteStmt: ` DELETE FROM system.transaction_statistics - WHERE (aggregated_ts, fingerprint_id, app_name, node_id) IN ( - SELECT aggregated_ts, fingerprint_id, app_name, node_id - FROM system.transaction_statistics WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8 = $1 AND aggregated_ts < $3 ORDER BY aggregated_ts ASC LIMIT $2 - ) RETURNING aggregated_ts, fingerprint_id, app_name, node_id`, + RETURNING aggregated_ts, fingerprint_id, app_name, node_id`, constrainedDeleteStmt: ` DELETE FROM system.transaction_statistics - WHERE (aggregated_ts, fingerprint_id, app_name, node_id) IN ( - SELECT aggregated_ts, fingerprint_id, app_name, node_id - FROM system.transaction_statistics WHERE crdb_internal_aggregated_ts_app_name_fingerprint_id_node_id_shard_8 = $1 AND ( ( @@ -346,7 +334,7 @@ var ( AND aggregated_ts < $3 ORDER BY aggregated_ts ASC LIMIT $2 - ) RETURNING aggregated_ts, fingerprint_id, app_name, node_id`, + RETURNING aggregated_ts, fingerprint_id, app_name, node_id`, } )