From fbcd656a2936e25f10643d8a27285542537b534c Mon Sep 17 00:00:00 2001
From: Andy Yang
Date: Wed, 2 Aug 2023 17:36:04 -0400
Subject: [PATCH] sql/sem/builtins: deflake
 TestSerialNormalizationWithUniqueUnorderedID

This patch changes the test to run its main logic multiple times
(currently set to 10) and only fail due to non-uniformity if we see
statistically significant non-uniformity in a majority of the runs.
For context, an occasional failure is not unexpected given that this is
a statistical test, but failures in a majority of the runs would be
more suggestive of an actual problem with the uniformity of generated
IDs.

Release note: None
---
 pkg/sql/sem/builtins/BUILD.bazel      |   2 -
 pkg/sql/sem/builtins/builtins_test.go | 164 ++++++++++++++------------
 2 files changed, 88 insertions(+), 78 deletions(-)

diff --git a/pkg/sql/sem/builtins/BUILD.bazel b/pkg/sql/sem/builtins/BUILD.bazel
index ec9ca5ac2607..5292399d19b0 100644
--- a/pkg/sql/sem/builtins/BUILD.bazel
+++ b/pkg/sql/sem/builtins/BUILD.bazel
@@ -214,10 +214,8 @@ go_test(
         "//pkg/util/log",
         "//pkg/util/mon",
         "//pkg/util/randutil",
-        "//pkg/util/retry",
         "//pkg/util/syncutil",
         "//pkg/util/timeutil",
-        "@com_github_cockroachdb_errors//:errors",
        "@com_github_lib_pq//:pq",
         "@com_github_lib_pq//oid",
         "@com_github_stretchr_testify//assert",
diff --git a/pkg/sql/sem/builtins/builtins_test.go b/pkg/sql/sem/builtins/builtins_test.go
index 4c55e0581108..8517601884c3 100644
--- a/pkg/sql/sem/builtins/builtins_test.go
+++ b/pkg/sql/sem/builtins/builtins_test.go
@@ -14,6 +14,7 @@ import (
 	"bytes"
 	"context"
 	"fmt"
+	"math"
 	"math/bits"
 	"math/rand"
 	"testing"
@@ -32,9 +33,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/duration"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
-	"github.com/cockroachdb/cockroach/pkg/util/retry"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
-	"github.com/cockroachdb/errors"
 	"github.com/lib/pq"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -97,8 +96,8 @@ func TestMapToUniqueUnorderedID(t *testing.T) {
 }
 
 // TestSerialNormalizationWithUniqueUnorderedID makes sure that serial
-// normalization can use unique_unordered_id() and a split in a table followed
-// by insertions guarantees a (somewhat) uniform distribution of the data.
+// normalization can use the unordered_rowid mode and a large number of
+// insertions (80k) results in a (somewhat) uniform distribution of the data.
 func TestSerialNormalizationWithUniqueUnorderedID(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
@@ -107,25 +106,33 @@ func TestSerialNormalizationWithUniqueUnorderedID(t *testing.T) {
 		"assumes large N")
 
 	ctx := context.Background()
-	const attempts = 2
-	// This test can flake when the random data is not distributed evenly.
-	err := retry.WithMaxAttempts(ctx, retry.Options{}, attempts, func() error {
-
-		params := base.TestServerArgs{
-			Knobs: base.TestingKnobs{
-				SQLEvalContext: &eval.TestingKnobs{
-					// We disable the randomization of some batch sizes because
-					// with some low values the test takes much longer.
-					ForceProductionValues: true,
+
+	// Since this is a statistical test, an occasional observation of non-uniformity
+	// is not in and of itself a huge concern. As such, we'll only fail the test if
+	// a majority of our observations show statistically significant amounts of
+	// non-uniformity.
+	const numObservations = 10
+	numNonUniformObservations := 0
+	for i := 0; i < numObservations; i++ {
+		// We use an anonymous function because of the defer in each iteration.
+		func() {
+			t.Logf("starting observation %d", i)
+
+			params := base.TestServerArgs{
+				Knobs: base.TestingKnobs{
+					SQLEvalContext: &eval.TestingKnobs{
+						// We disable the randomization of some batch sizes because
+						// with some low values the test takes much longer.
+						ForceProductionValues: true,
+					},
 				},
-			},
-		}
-		s, db, _ := serverutils.StartServer(t, params)
-		defer s.Stopper().Stop(ctx)
+			}
+			s, db, _ := serverutils.StartServer(t, params)
+			defer s.Stopper().Stop(ctx)
 
-		tdb := sqlutils.MakeSQLRunner(db)
-		// Create a new table with serial primary key i (unordered_rowid) and int j (index).
-		tdb.Exec(t, `
+			tdb := sqlutils.MakeSQLRunner(db)
+			// Create a new table with serial primary key i (unordered_rowid) and int j (index).
+			tdb.Exec(t, `
 SET serial_normalization TO 'unordered_rowid';
 CREATE DATABASE t;
 USE t;
@@ -134,30 +141,29 @@ CREATE TABLE t (
   j INT
 )`)
 
-		numberOfRows := 100000
-
-		// Insert rows.
-		tdb.Exec(t, fmt.Sprintf(`
+			// Insert rows.
+			numberOfRows := 80000
+			tdb.Exec(t, fmt.Sprintf(`
 INSERT INTO t(j) SELECT * FROM generate_series(1, %d);
 `, numberOfRows))
 
-		// Build an equi-width histogram over the key range. The below query will
-		// generate the key bounds for each high-order bit pattern as defined by
-		// prefixBits. For example, if this were 3, we'd get the following groups:
-		//
-		//          low          |         high
-		// ----------------------+----------------------
-		//                     0 |  2305843009213693952
-		//   2305843009213693952 |  4611686018427387904
-		//   4611686018427387904 |  6917529027641081856
-		//   6917529027641081856 |  9223372036854775807
-		//
-		const prefixBits = 4
-		var keyCounts pq.Int64Array
-		tdb.QueryRow(t, `
+			// Build an equi-width histogram over the key range. The below query will
+			// generate the key bounds for each high-order bit pattern as defined by
+			// prefixBits. For example, if this were 3, we'd get the following groups:
+			//
+			//          low          |         high
+			// ----------------------+----------------------
+			//                     0 |  2305843009213693952
+			//   2305843009213693952 |  4611686018427387904
+			//   4611686018427387904 |  6917529027641081856
+			//   6917529027641081856 |  9223372036854775807
+			//
+			const prefixBits = 4
+			var keyCounts pq.Int64Array
+			tdb.QueryRow(t, `
 WITH boundaries AS (
-  SELECT i << (64 - $1) AS p FROM ROWS FROM (generate_series(0, (1<<($1-1)) -1)) AS t (i)
-  UNION ALL SELECT (((1 << 62) - 1) << 1)+1 -- int63 max value
+  SELECT i << (64 - $1) AS p FROM ROWS FROM (generate_series(0, (1 << ($1 - 1)) - 1)) AS t (i)
+  UNION ALL SELECT (((1 << 62) - 1) << 1) + 1 -- int63 max value
 ),
 groups AS (
   SELECT *
@@ -167,55 +173,61 @@ INSERT INTO t(j) SELECT * FROM generate_series(1, %d);
 counts AS (
   SELECT count(i) AS c
     FROM t, groups
-   WHERE low <= i AND high > i
+   WHERE low < i AND i <= high
    GROUP BY (low, high)
 )
 SELECT array_agg(c) FROM counts;`, prefixBits).Scan(&keyCounts)
 
-		t.Log("Key counts in each split range")
-		for i, keyCount := range keyCounts {
-			t.Logf("range %d: %d\n", i, keyCount)
-		}
-		require.Len(t, keyCounts, 1<<(prefixBits-1))
-
-		// To check that the distribution over ranges is not uniform, we use a
-		// chi-square goodness of fit statistic. We'll set our null hypothesis as
-		// 'each range in the distribution should have the same probability of getting
-		// a row inserted' and we'll check if we can reject the null hypothesis if
-		// chi-square is greater than the critical value we currently set as 35.2585,
-		// a deliberate choice that gives us a p-value of 0.00001 according to
-		// https://www.fourmilab.ch/rpkp/experiments/analysis/chiCalc.html. If we are
-		// able to reject the null hypothesis, then the distribution is not uniform,
-		// and we raise an error. This test has 7 degrees of freedom (categories - 1).
-		chiSquared := discreteUniformChiSquared(keyCounts)
-		criticalValue := 35.2585
-		if chiSquared >= criticalValue {
-			return errors.Newf("chiSquared value of %f must be"+
-				" less than criticalVal %f to guarantee distribution is relatively uniform",
-				chiSquared, criticalValue)
-		}
-		return nil
-	})
-	require.NoError(t, err)
+			t.Log("Key counts in each split range")
+			for i, keyCount := range keyCounts {
+				t.Logf("range %d: %d\n", i, keyCount)
+			}
+			require.Len(t, keyCounts, 1<<(prefixBits-1))
+
+			// To check that the distribution over ranges is not uniform, we use a
+			// chi-square goodness of fit statistic. We'll set our null hypothesis as
+			// 'each range in the distribution should have the same probability of getting
+			// a row inserted' and we'll check if we can reject the null hypothesis if
+			// chi-squared is greater than the critical value we currently set as 35.2585,
+			// a deliberate choice that gives us a p-value of 0.00001 according to
+			// https://www.fourmilab.ch/rpkp/experiments/analysis/chiCalc.html. If we are
+			// able to reject the null hypothesis, then the distribution is not uniform,
+			// and we raise an error. This test has 7 degrees of freedom (categories - 1).
+			chiSquared := discreteUniformChiSquared(keyCounts)
+			criticalValue := 35.2585
+			if chiSquared > criticalValue {
+				t.Logf("chiSquared value of %f > criticalVal %f indicates that the distribution "+
+					"is non-uniform (statistically significant, p < 0.00001)",
+					chiSquared, criticalValue)
+				numNonUniformObservations += 1
+			} else {
+				t.Logf("chiSquared value of %f <= criticalVal %f indicates that the distribution "+
+					"is relatively uniform",
+					chiSquared, criticalValue)
+			}
+		}()
+	}
+
+	if numNonUniformObservations >= numObservations/2 {
+		t.Fatalf("a majority of our observations indicate the distribution is non-uniform")
+	}
 }
 
 // discreteUniformChiSquared calculates the chi-squared statistic (ref:
 // https://www.itl.nist.gov/div898/handbook/eda/section3/eda35f.htm) to be used
 // in our hypothesis testing for the distribution of rows among ranges.
-func discreteUniformChiSquared(counts []int64) float64 {
+func discreteUniformChiSquared(buckets []int64) float64 {
 	var n int64
-	for _, c := range counts {
+	for _, c := range buckets {
 		n += c
 	}
-	p := float64(1) / float64(len(counts))
-	var stat float64
-	for _, c := range counts {
-		oSubE := float64(c)/float64(n) - p
-		stat += (oSubE * oSubE) / p
+	expectedPerBucket := float64(n) / float64(len(buckets))
+	var chiSquared float64
+	for _, c := range buckets {
+		chiSquared += math.Pow(float64(c)-expectedPerBucket, 2) / expectedPerBucket
 	}
-	stat *= float64(n)
-	return stat
+	return chiSquared
 }
 
 func TestStringToArrayAndBack(t *testing.T) {
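For reference, below is a small, self-contained Go sketch (not part of the commit) of the per-observation uniformity check the patched test performs: it mirrors the discreteUniformChiSquared helper above and compares the statistic against the same critical value of 35.2585 (p = 0.00001 at 7 degrees of freedom). The per-range counts here are made-up illustration data, not output from the test.

// uniformitycheck_sketch.go: illustrative only; mirrors the helper in
// builtins_test.go under the assumptions stated above.
package main

import (
	"fmt"
	"math"
)

// discreteUniformChiSquared sums (observed - expected)^2 / expected over all
// buckets, where the expected count per bucket assumes a uniform distribution.
func discreteUniformChiSquared(buckets []int64) float64 {
	var n int64
	for _, c := range buckets {
		n += c
	}
	expectedPerBucket := float64(n) / float64(len(buckets))
	var chiSquared float64
	for _, c := range buckets {
		chiSquared += math.Pow(float64(c)-expectedPerBucket, 2) / expectedPerBucket
	}
	return chiSquared
}

func main() {
	// Hypothetical counts for 8 key ranges (80k rows total, roughly uniform).
	counts := []int64{10021, 9987, 10102, 9899, 10010, 9975, 10050, 9956}
	const criticalValue = 35.2585 // p = 0.00001 at 7 degrees of freedom
	chiSquared := discreteUniformChiSquared(counts)
	if chiSquared > criticalValue {
		fmt.Printf("chi-squared %.4f > %.4f: statistically significant non-uniformity\n",
			chiSquared, criticalValue)
	} else {
		fmt.Printf("chi-squared %.4f <= %.4f: consistent with a uniform distribution\n",
			chiSquared, criticalValue)
	}
}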