opt: support BYTES for histogram range calculations

Fixes #68346

Release note (performance improvement): The accuracy of histogram calculations for BYTES types has been improved. As a result, the optimizer should generate more efficient query plans in some cases.
cockroachdb · Aug 11, 2021 · e2ae5b2 · e2ae5b2
1 parent 6314f90
commit e2ae5b2
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 10 deletions.
diff --git a/pkg/sql/opt/constraint/testutils.go b/pkg/sql/opt/constraint/testutils.go
@@ -151,6 +151,8 @@ func parseDatumPath(evalCtx *tree.EvalContext, str string, typs []types.Family)
 			val, _, err = tree.ParseDTimestampTZ(evalCtx, valStr, time.Microsecond)
 		case types.StringFamily:
 			val = tree.NewDString(valStr)
+		case types.BytesFamily:
+			val = tree.NewDBytes(tree.DBytes(valStr))
 		case types.OidFamily:
 			dInt, err := tree.ParseDInt(valStr)
 			if err == nil {

diff --git a/pkg/sql/opt/props/histogram.go b/pkg/sql/opt/props/histogram.go
@@ -800,9 +800,10 @@ func getRangesBeforeAndAfter(
 	) (rngBefore, rngAfter float64, ok bool) {
 
 		// Utilizes an array to simplify number of repetitive calls.
-		boundArr := []tree.Datum{lowerBoundBefore, upperBoundBefore, lowerBoundAfter,
-			upperBoundAfter}
-		boundArrByte := make([][]byte, 4)
+		boundArr := [4]tree.Datum{
+			lowerBoundBefore, upperBoundBefore, lowerBoundAfter, upperBoundAfter,
+		}
+		var boundArrByte [4][]byte
 
 		for i := range boundArr {
 			var err error
@@ -857,7 +858,7 @@ func isDiscrete(typ *types.T) bool {
 // cockroach db.
 func isNonNumeric(typ *types.T) bool {
 	switch typ.Family() {
-	case types.StringFamily, types.UuidFamily, types.INetFamily:
+	case types.StringFamily, types.BytesFamily, types.UuidFamily, types.INetFamily:
 		return true
 	}
 	return false
@@ -866,12 +867,7 @@ func isNonNumeric(typ *types.T) bool {
 // getCommonPrefix returns the first index where the value at said index differs
 // across all byte arrays in byteArr. byteArr must contain at least one element
 // to compute a common prefix.
-func getCommonPrefix(byteArr [][]byte) int {
-
-	if len(byteArr) <= 0 {
-		panic(errors.AssertionFailedf("byteArr must have at least one element"))
-	}
-
+func getCommonPrefix(byteArr [4][]byte) int {
 	// Checks if the current value at index is the same between all byte arrays.
 	currIndMatching := func(ind int) bool {
 		for i := 0; i < len(byteArr); i++ {

diff --git a/pkg/sql/opt/props/histogram_test.go b/pkg/sql/opt/props/histogram_test.go
@@ -707,6 +707,69 @@ func TestFilterBucket(t *testing.T) {
 		runTest(h3, t3, types.StringFamily)
 	})
 
+	t.Run("bytes", func(t *testing.T) {
+		h1 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{
+			{NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(tree.NewDBytes("bear"))},
+			{NumEq: 5, NumRange: 10, DistinctRange: 10, UpperBound: tree.NewDBytes("bobcat")},
+		}}
+		h2 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{
+			{NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(tree.NewDBytes("a"))},
+			{NumEq: 5, NumRange: 10, DistinctRange: 10, UpperBound: tree.NewDBytes("c")},
+		}}
+		h3 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{
+			{NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(tree.NewDBytes("aaaaaaaaaaaa"))},
+			{NumEq: 5, NumRange: 10, DistinctRange: 10, UpperBound: tree.NewDBytes("cccccccccccc")},
+		}}
+
+		t1 := []testCase{
+			{
+				span:     "[/bluejay - /boar]",
+				expected: &cat.HistogramBucket{NumEq: 0, NumRange: 2.92, DistinctRange: 2.92, UpperBound: tree.NewDBytes("boar")},
+			},
+			{
+				span:     "[/beer - /bobcat]",
+				expected: &cat.HistogramBucket{NumEq: 5, NumRange: 9.98, DistinctRange: 9.98, UpperBound: tree.NewDBytes("bobcat")},
+			},
+		}
+
+		t2 := []testCase{
+			// Within the CRDB encoding, every null byte is followed by an escape byte
+			// (255), which is left in for the rangeAfter calculations. For this
+			// reason, the resulting NumRange is slightly lower than expected: 4.99
+			// instead of 5.
+			{
+				span:     "[/a\x00 - /b]",
+				expected: &cat.HistogramBucket{NumEq: 0, NumRange: 4.99, DistinctRange: 4.99, UpperBound: tree.NewDBytes("b")},
+			},
+			{
+				span:     "[/as - /b]",
+				expected: &cat.HistogramBucket{NumEq: 0, NumRange: 2.76, DistinctRange: 2.76, UpperBound: tree.NewDBytes("b")},
+			},
+			{
+				span:     "[/as - /c]",
+				expected: &cat.HistogramBucket{NumEq: 5, NumRange: 7.77, DistinctRange: 7.77, UpperBound: tree.NewDBytes("c")},
+			},
+			{
+				span:     "[/bs - /c]",
+				expected: &cat.HistogramBucket{NumEq: 5, NumRange: 2.76, DistinctRange: 2.76, UpperBound: tree.NewDBytes("c")},
+			},
+		}
+
+		// The first 8 bytes of the span's lowerBound and upperBound are the same.
+		// Hence, the resulting NumRange/DistinctRange should be 0, as rangeAfter
+		// only considers the first 8 bytes of the bounds.
+		t3 := []testCase{
+			{
+				span:     "[/aaaaaaaabbbb - /aaaaaaaacccc]",
+				expected: &cat.HistogramBucket{NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: tree.NewDBytes("aaaaaaaacccc")},
+			},
+		}
+
+		runTest(h1, t1, types.BytesFamily)
+		runTest(h2, t2, types.BytesFamily)
+		runTest(h3, t3, types.BytesFamily)
+	})
+
 	t.Run("uuid", func(t *testing.T) {
 		l1, err := tree.ParseDUuidFromString("2189ad07-52f2-4d60-83e8-4a8347fef718")
 		if err != nil {