From 8372915ccb6a35708436ec72b427ef7b417b589e Mon Sep 17 00:00:00 2001
From: Arul Ajmani <arulajmani@gmail.com>
Date: Wed, 31 May 2023 09:59:44 -0400
Subject: [PATCH 1/2] concurrency: extend testing for
 waitQueueMaxLengthExceeded

This patch adds a test for the rare scenario where a request re-scans
the lock table and finds itself already waiting at a lock. We test the
case where adding a new request to the lock's wait queue would cause
the waitQueueMaxLengthExceeded state to be triggered -- however, this
shouldn't happen, as the request has already been accounted for.

Note that this wasn't broken; we're only adding a test here to ensure
we don't regress this behavior as we go about refactoring
`tryActiveWait`. Tested using the diff below and confirmed the test does
indeed fail:

```
--- a/pkg/kv/kvserver/concurrency/lock_table.go
+++ b/pkg/kv/kvserver/concurrency/lock_table.go
@@ -1706,7 +1706,7 @@ func (l *lockState) tryActiveWait(
        defer g.mu.Unlock()
        if str == lock.Intent {
                var qg *queuedGuard
-               if _, inQueue := g.mu.locks[l]; inQueue {
+               if _, inQueue := g.mu.locks[l]; inQueue && false {
```

Release note: None

Informs: #102210
---
 .../testdata/lock_table/queue_length_exceeded | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/pkg/kv/kvserver/concurrency/testdata/lock_table/queue_length_exceeded b/pkg/kv/kvserver/concurrency/testdata/lock_table/queue_length_exceeded
index f51c5c551a41..7b5de64a8c76 100644
--- a/pkg/kv/kvserver/concurrency/testdata/lock_table/queue_length_exceeded
+++ b/pkg/kv/kvserver/concurrency/testdata/lock_table/queue_length_exceeded
@@ -177,3 +177,76 @@ num=1
     active: true req: 2, txn: 00000000-0000-0000-0000-000000000002
     active: true req: 3, txn: 00000000-0000-0000-0000-000000000003
    distinguished req: 2
+
+# ------------------------------------------------------------------------------
+# Write requests that are already waiting in a lock's wait queue should not
+# throw an error when they rescan if adding one more request to the lock's wait
+# queue puts them above the max-lock-wait-queue threshold. This is because
+# the request has already been accounted for when it's performing a re-scan.
+# ------------------------------------------------------------------------------
+
+new-txn txn=txn6 ts=10 epoch=0
+----
+
+new-request r=req9 txn=txn6 ts=10 spans=intent@b
+----
+
+scan r=req9
+----
+start-waiting: false
+
+acquire r=req9 k=b durability=u
+----
+num=2
+ lock: "a"
+  holder: txn: 00000000-0000-0000-0000-000000000001, ts: 10.000000000,0, info: unrepl epoch: 0, seqs: [0]
+   queued writers:
+    active: true req: 2, txn: 00000000-0000-0000-0000-000000000002
+    active: true req: 3, txn: 00000000-0000-0000-0000-000000000003
+   distinguished req: 2
+ lock: "b"
+  holder: txn: 00000000-0000-0000-0000-000000000006, ts: 10.000000000,0, info: unrepl epoch: 0, seqs: [0]
+
+new-txn txn=txn7 ts=10 epoch=0
+----
+
+new-request r=req10 txn=txn7 ts=10 spans=intent@b max-lock-wait-queue-length=1
+----
+
+scan r=req10
+----
+start-waiting: true
+
+guard-state r=req10
+----
+new: state=waitForDistinguished txn=txn6 key="b" held=true guard-strength=Intent
+
+print
+----
+num=2
+ lock: "a"
+  holder: txn: 00000000-0000-0000-0000-000000000001, ts: 10.000000000,0, info: unrepl epoch: 0, seqs: [0]
+   queued writers:
+    active: true req: 2, txn: 00000000-0000-0000-0000-000000000002
+    active: true req: 3, txn: 00000000-0000-0000-0000-000000000003
+   distinguished req: 2
+ lock: "b"
+  holder: txn: 00000000-0000-0000-0000-000000000006, ts: 10.000000000,0, info: unrepl epoch: 0, seqs: [0]
+   queued writers:
+    active: true req: 10, txn: 00000000-0000-0000-0000-000000000007
+   distinguished req: 10
+
+# Re-scan.
+scan r=req10
+----
+start-waiting: true
+
+# Note that the state here changed from waitForDistinguished to waitFor. This
+# is because we're cheating slightly in this test by calling scan on a request
+# that's already actively waiting at a lock, which is something that cannot
+# happen outside of unit tests. tryActiveWait doesn't expect this, and doesn't
+# handle this state transition -- we could teach it, but it would be just for
+# this contrived test scenario.
+guard-state r=req10
+----
+new: state=waitFor txn=txn6 key="b" held=true guard-strength=Intent

From 9592779070af4a8bc5184f188bedb4855b9e6fcf Mon Sep 17 00:00:00 2001
From: Michael Erickson <michae2@cockroachlabs.com>
Date: Wed, 31 May 2023 16:13:50 -0700
Subject: [PATCH 2/2] sql/stats: forecast for different columnsets at different
 times

Before this change, all statistics forecasts for a table were at the
same future time, determined as:

  time of most recent statistics collection (for any columnset)
  + average time between automatic collections (incl. all columnsets)

This commit changes the formula slightly to:

  time of most recent statistics collection (for **this** columnset)
  + average time between automatic collections (incl. all columnsets)

Meaning columnsets that were _not_ included in the most recent
statistics collection will now have an older forecast time than
columnsets that _were_ included. Columnsets that were included in the
most recent collection will still all have the same forecast time.

This will have two effects on the optimizer:
1. When using the first table statistic to get a row estimate for the
   table, statistics builder will now favor forecasts of columnsets
   included in the most recent statistics collection over forecasts of
   columnsets not included.
2. Forecasts of columnsets not included in the most recent statistics
   collection will be now be more similar to their most recent
   collection, but also potentially more stale.

Fixes: #103958

Release note (bug fix): Fix a rare bug where stale multi-column table
statistics could cause table statistics forecasts to be inaccurate,
leading to unoptimal query plans.
---
 .../opt/exec/execbuilder/testdata/forecast    | 246 ++++++++++++++++++
 pkg/sql/show_stats.go                         |  13 +-
 pkg/sql/stats/forecast.go                     |  34 +--
 pkg/sql/stats/stats_cache.go                  |   8 +-
 4 files changed, 282 insertions(+), 19 deletions(-)

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/forecast b/pkg/sql/opt/exec/execbuilder/testdata/forecast
index a3f33a908287..1f7f17dd7a28 100644
--- a/pkg/sql/opt/exec/execbuilder/testdata/forecast
+++ b/pkg/sql/opt/exec/execbuilder/testdata/forecast
@@ -1603,6 +1603,252 @@ RESET enable_zigzag_join
 statement ok
 RESET optimizer_use_forecasts
 
+# Test for issue #103958.
+
+statement ok
+CREATE TABLE t_103958 (a INT PRIMARY KEY, b INT)
+
+statement ok
+ALTER TABLE t_103958 INJECT STATISTICS '[
+      {
+          "avg_size": 1,
+          "columns": [
+              "a"
+          ],
+          "created_at": "2023-01-01 00:00:00",
+          "distinct_count": 1,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1
+      },
+      {
+          "avg_size": 3,
+          "columns": [
+              "a",
+              "b"
+          ],
+          "created_at": "2023-01-01 00:00:00",
+          "distinct_count": 1,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1
+      },
+      {
+          "avg_size": 2,
+          "columns": [
+              "b"
+          ],
+          "created_at": "2023-01-01 00:00:00",
+          "distinct_count": 1,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1
+      },
+      {
+          "avg_size": 1,
+          "columns": [
+              "a"
+          ],
+          "created_at": "2023-01-02 00:00:00",
+          "distinct_count": 1,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1
+      },
+      {
+          "avg_size": 3,
+          "columns": [
+              "a",
+              "b"
+          ],
+          "created_at": "2023-01-02 00:00:00",
+          "distinct_count": 1,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1
+      },
+      {
+          "avg_size": 2,
+          "columns": [
+              "b"
+          ],
+          "created_at": "2023-01-02 00:00:00",
+          "distinct_count": 1,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1
+      },
+      {
+          "avg_size": 1,
+          "columns": [
+              "a"
+          ],
+          "created_at": "2023-01-03 00:00:00",
+          "distinct_count": 1,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1
+      },
+      {
+          "avg_size": 3,
+          "columns": [
+              "a",
+              "b"
+          ],
+          "created_at": "2023-01-03 00:00:00",
+          "distinct_count": 1,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1
+      },
+      {
+          "avg_size": 2,
+          "columns": [
+              "b"
+          ],
+          "created_at": "2023-01-03 00:00:00",
+          "distinct_count": 1,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1
+      },
+      {
+          "avg_size": 1,
+          "columns": [
+              "a"
+          ],
+          "created_at": "2023-01-04 00:00:00",
+          "distinct_count": 1000,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1000
+      },
+      {
+          "avg_size": 2,
+          "columns": [
+              "b"
+          ],
+          "created_at": "2023-01-04 00:00:00",
+          "distinct_count": 2,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 1000
+      },
+      {
+          "avg_size": 3,
+          "columns": [
+              "a"
+          ],
+          "created_at": "2023-01-05 00:00:00",
+          "distinct_count": 2000,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 2000
+      },
+      {
+          "avg_size": 2,
+          "columns": [
+              "b"
+          ],
+          "created_at": "2023-01-05 00:00:00",
+          "distinct_count": 3,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 2000
+      },
+      {
+          "avg_size": 3,
+          "columns": [
+              "a"
+          ],
+          "created_at": "2023-01-06 00:00:00",
+          "distinct_count": 3000,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 3000
+      },
+      {
+          "avg_size": 2,
+          "columns": [
+              "b"
+          ],
+          "created_at": "2023-01-06 00:00:00",
+          "distinct_count": 4,
+          "histo_col_type": "",
+          "name": "__auto__",
+          "null_count": 0,
+          "row_count": 3000
+      }
+  ]'
+
+query TTTIIII
+SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size
+FROM [SHOW STATISTICS FOR TABLE t_103958 WITH FORECAST]
+----
+__auto__      {a}    2023-01-01 00:00:00 +0000 +0000  1     1     0  1
+__auto__      {a,b}  2023-01-01 00:00:00 +0000 +0000  1     1     0  3
+__auto__      {b}    2023-01-01 00:00:00 +0000 +0000  1     1     0  2
+__auto__      {a}    2023-01-02 00:00:00 +0000 +0000  1     1     0  1
+__auto__      {a,b}  2023-01-02 00:00:00 +0000 +0000  1     1     0  3
+__auto__      {b}    2023-01-02 00:00:00 +0000 +0000  1     1     0  2
+__auto__      {a}    2023-01-03 00:00:00 +0000 +0000  1     1     0  1
+__auto__      {a,b}  2023-01-03 00:00:00 +0000 +0000  1     1     0  3
+__auto__      {b}    2023-01-03 00:00:00 +0000 +0000  1     1     0  2
+__forecast__  {a,b}  2023-01-04 00:00:00 +0000 +0000  1     1     0  3
+__auto__      {a}    2023-01-04 00:00:00 +0000 +0000  1000  1000  0  1
+__auto__      {b}    2023-01-04 00:00:00 +0000 +0000  1000  2     0  2
+__auto__      {a}    2023-01-05 00:00:00 +0000 +0000  2000  2000  0  3
+__auto__      {b}    2023-01-05 00:00:00 +0000 +0000  2000  3     0  2
+__auto__      {a}    2023-01-06 00:00:00 +0000 +0000  3000  3000  0  3
+__auto__      {b}    2023-01-06 00:00:00 +0000 +0000  3000  4     0  2
+
+query T
+SELECT jsonb_pretty(stat)
+FROM (
+  SELECT jsonb_array_elements(statistics) AS stat
+  FROM [SHOW STATISTICS USING JSON FOR TABLE t_103958 WITH FORECAST]
+)
+WHERE stat->>'name' = '__forecast__'
+----
+{
+    "avg_size": 3,
+    "columns": [
+        "a",
+        "b"
+    ],
+    "created_at": "2023-01-04 00:00:00",
+    "distinct_count": 1,
+    "histo_col_type": "",
+    "name": "__forecast__",
+    "null_count": 0,
+    "row_count": 1
+}
+
+query T
+EXPLAIN SELECT * FROM t_103958
+----
+distribution: local
+vectorized: true
+·
+• scan
+  estimated row count: 3,000 (100% of the table; stats collected <hidden> ago)
+  table: t_103958@t_103958_pkey
+  spans: FULL SCAN
+
 # Finally, restore forecasts setting to its previous value.
 statement ok
 SET CLUSTER SETTING sql.stats.forecasts.enabled = $forecastsEnabledPrev
diff --git a/pkg/sql/show_stats.go b/pkg/sql/show_stats.go
index b45c949704fd..704d63bf169c 100644
--- a/pkg/sql/show_stats.go
+++ b/pkg/sql/show_stats.go
@@ -14,6 +14,7 @@ import (
 	"context"
 	encjson "encoding/json"
 	"fmt"
+	"sort"
 
 	"github.com/cockroachdb/cockroach/pkg/clusterversion"
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog"
@@ -236,14 +237,24 @@ func (p *planner) ShowTableStats(ctx context.Context, n *tree.ShowTableStats) (p
 
 				if withForecast {
 					forecasts := stats.ForecastTableStatistics(ctx, statsList)
+					forecastRows := make([]tree.Datums, 0, len(forecasts))
 					// Iterate in reverse order to match the ORDER BY "columnIDs".
 					for i := len(forecasts) - 1; i >= 0; i-- {
 						forecastRow, err := tableStatisticProtoToRow(&forecasts[i].TableStatisticProto, partialStatsVerActive)
 						if err != nil {
 							return nil, err
 						}
-						rows = append(rows, forecastRow)
+						forecastRows = append(forecastRows, forecastRow)
 					}
+					rows = append(forecastRows, rows...)
+					// Some forecasts could have a CreatedAt time before or after some
+					// collected stats, so make sure the list is sorted in ascending
+					// CreatedAt order.
+					sort.SliceStable(rows, func(i, j int) bool {
+						iTime := rows[i][createdAtIdx].(*tree.DTimestamp).Time
+						jTime := rows[j][createdAtIdx].(*tree.DTimestamp).Time
+						return iTime.Before(jTime)
+					})
 				}
 			}
 
diff --git a/pkg/sql/stats/forecast.go b/pkg/sql/stats/forecast.go
index 868da3e2e706..192d30def2e1 100644
--- a/pkg/sql/stats/forecast.go
+++ b/pkg/sql/stats/forecast.go
@@ -53,20 +53,18 @@ const minGoodnessOfFit = 0.95
 // Whether a forecast is produced for a set of columns depends on how well the
 // observed statistics for that set of columns fit a linear regression model.
 // This means a forecast will not necessarily be produced for every set of
-// columns in the table. Any forecasts produced will have the same CreatedAt
-// time, which will be after the latest observed statistics (and could be in the
-// past, present, or future relative to the current time). Any forecasts
-// produced will not necessarily have the same RowCount or be consistent with
-// the other forecasts produced. (For example, DistinctCount in the forecast for
-// columns {a, b} could very well end up less than DistinctCount in the forecast
-// for column {a}.)
+// columns in the table. Any forecasts produced will not necessarily have the
+// same CreatedAt time (and could be in the past, present, or future relative to
+// the current time). Any forecasts produced will not necessarily have the same
+// RowCount or be consistent with the other forecasts produced. (For example,
+// DistinctCount in the forecast for columns {a, b} could very well end up less
+// than DistinctCount in the forecast for column {a}.)
 //
 // ForecastTableStatistics is deterministic: given the same observations it will
 // return the same forecasts.
 func ForecastTableStatistics(ctx context.Context, observed []*TableStatistic) []*TableStatistic {
 	// Group observed statistics by column set, skipping over partial statistics
 	// and statistics with inverted histograms.
-	var latest time.Time
 	var forecastCols []string
 	observedByCols := make(map[string][]*TableStatistic)
 	for _, stat := range observed {
@@ -81,9 +79,6 @@ func ForecastTableStatistics(ctx context.Context, observed []*TableStatistic) []
 			stat.HistogramData.ColumnType.Family() == types.BytesFamily {
 			continue
 		}
-		if latest.IsZero() {
-			latest = stat.CreatedAt
-		}
 		colKey := MakeSortedColStatKey(stat.ColumnIDs)
 		obs, ok := observedByCols[colKey]
 		if !ok {
@@ -92,18 +87,23 @@ func ForecastTableStatistics(ctx context.Context, observed []*TableStatistic) []
 		observedByCols[colKey] = append(obs, stat)
 	}
 
-	// To make forecasts deterministic, we must choose a time to forecast at based
-	// on only the observed statistics. We choose the time of the latest full
-	// statistics + the average time between automatic stats collections, which
-	// should be roughly when the next automatic stats collection will occur.
-	if latest.IsZero() {
+	if len(observedByCols) == 0 {
 		// No suitable stats.
 		return nil
 	}
-	at := latest.Add(avgFullRefreshTime(observed))
+	avgRefresh := avgFullRefreshTime(observed)
 
 	forecasts := make([]*TableStatistic, 0, len(forecastCols))
 	for _, colKey := range forecastCols {
+		// To make forecasts deterministic, we must choose a time to forecast at
+		// based on only the observed statistics. We choose the time of the latest
+		// statistics + the average time between automatic stats collections, which
+		// should be roughly when the next automatic stats collection will
+		// occur. This will be the same time for all columns that had the same
+		// latest collection time.
+		latest := observedByCols[colKey][0].CreatedAt
+		at := latest.Add(avgRefresh)
+
 		forecast, err := forecastColumnStatistics(ctx, observedByCols[colKey], at, minGoodnessOfFit)
 		if err != nil {
 			log.VEventf(
diff --git a/pkg/sql/stats/stats_cache.go b/pkg/sql/stats/stats_cache.go
index 05872b5b3b90..ec5aa8aa9e0d 100644
--- a/pkg/sql/stats/stats_cache.go
+++ b/pkg/sql/stats/stats_cache.go
@@ -14,6 +14,7 @@ import (
 	"context"
 	"fmt"
 	"math"
+	"sort"
 	"sync"
 
 	"github.com/cockroachdb/cockroach/pkg/clusterversion"
@@ -823,7 +824,12 @@ ORDER BY "createdAt" DESC, "columnIDs" DESC, "statisticID" DESC
 
 	if forecast {
 		forecasts := ForecastTableStatistics(ctx, statsList)
-		statsList = append(forecasts, statsList...)
+		statsList = append(statsList, forecasts...)
+		// Some forecasts could have a CreatedAt time before or after some collected
+		// stats, so make sure the list is sorted in descending CreatedAt order.
+		sort.SliceStable(statsList, func(i, j int) bool {
+			return statsList[i].CreatedAt.After(statsList[j].CreatedAt)
+		})
 	}
 
 	return statsList, nil