opt: use histograms for inverted JSON/ARRAY scan statistics

Fixes cockroachdb#56870. Release note (performance improvement): The optimizer now uses collected histogram statistics to better estimate the cost of JSON and ARRAY inverted index scans, which may lead to more efficient query plans.
mgartner · Jan 22, 2021 · 11a6a69 · 11a6a69
1 parent 5d96c92
commit 11a6a69
Show file tree

Hide file tree

Showing 8 changed files with 756 additions and 321 deletions.
diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
@@ -15,7 +15,6 @@ import (
 	"reflect"
 
 	"github.com/cockroachdb/cockroach/pkg/geo/geoindex"
-	"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt/constraint"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt/props"
@@ -735,12 +734,12 @@ func (sb *statisticsBuilder) constrainScan(
 		// want.
 		invertedConstrainedCol := scan.Table.ColumnID(idx.VirtualInvertedColumn().Ordinal())
 		constrainedCols.Add(invertedConstrainedCol)
-		colSet := opt.MakeColSet(invertedConstrainedCol)
-		if sb.shouldUseHistogram(relProps, colSet) {
+		if sb.shouldUseHistogram(relProps) {
 			// TODO(mjibson): set distinctCount to something correct. Max is
 			// fine for now because ensureColStat takes the minimum of the
 			// passed value and colSet's distinct count.
 			const distinctCount = math.MaxFloat64
+			colSet := opt.MakeColSet(invertedConstrainedCol)
 			sb.ensureColStat(colSet, distinctCount, scan, s)
 
 			inputStat, _ := sb.colStatFromInput(colSet, scan)
@@ -835,7 +834,7 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro
 	inputColStat := sb.colStatTable(scan.Table, colSet)
 	colStat := sb.copyColStat(colSet, s, inputColStat)
 
-	if sb.shouldUseHistogram(relProps, colSet) {
+	if sb.shouldUseHistogram(relProps) {
 		colStat.Histogram = inputColStat.Histogram
 	}
 
@@ -2580,27 +2579,14 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts(
 	}
 }
 
-func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational, cols opt.ColSet) bool {
+func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational) bool {
 	// If we know that the cardinality is below a certain threshold (e.g., due to
 	// a constraint on a key column), don't bother adding the overhead of
 	// creating a histogram.
 	if relProps.Cardinality.Max < minCardinalityForHistogram {
 		return false
 	}
-	allowHist := true
-	cols.ForEach(func(col opt.ColumnID) {
-		colTyp := sb.md.ColumnMeta(col).Type
-		switch colTyp {
-		case types.Geometry, types.Geography:
-			// Special case these since ColumnTypeIsInvertedIndexable returns true for
-			// them, but they are supported in histograms now.
-		default:
-			if colinfo.ColumnTypeIsInvertedIndexable(colTyp) {
-				allowHist = false
-			}
-		}
-	})
-	return allowHist
+	return true
 }
 
 // rowsProcessed calculates and returns the number of rows processed by the
@@ -3050,7 +3036,7 @@ func (sb *statisticsBuilder) applyIndexConstraint(
 		sb.updateDistinctCountFromUnappliedConjuncts(col, e, s, numConjuncts, lowerBound)
 	}
 
-	if !sb.shouldUseHistogram(relProps, constrainedCols) {
+	if !sb.shouldUseHistogram(relProps) {
 		return constrainedCols, histCols
 	}
 
@@ -3105,12 +3091,12 @@ func (sb *statisticsBuilder) applyConstraintSet(
 			continue
 		}
 
-		cols := opt.MakeColSet(col)
-		if !sb.shouldUseHistogram(relProps, cols) {
+		if !sb.shouldUseHistogram(relProps) {
 			continue
 		}
 
 		// Calculate histogram.
+		cols := opt.MakeColSet(col)
 		if sb.updateHistogram(c, cols, e, s) {
 			histCols.UnionWith(cols)
 		}

diff --git a/pkg/sql/opt/memo/testdata/stats/inverted-array b/pkg/sql/opt/memo/testdata/stats/inverted-array
@@ -0,0 +1,130 @@
+exec-ddl
+CREATE TABLE t (
+  k INT PRIMARY KEY,
+  a INT[],
+  INVERTED INDEX a_idx (a)
+)
+----
+
+# Histogram boundaries are for ARRAY values `{}`, `{1}`, `{2}`, `{3}`. The
+# row_count is lower than the sum of the histogram buckets' num_eq values
+# because some rows can have multiple inverted index entries, for example
+# `{1, 2}`. There
+# are:
+#
+#   - 1000 rows total
+#   - 10 empty arrays
+#   - 990 arrays encoded into 1010 index entries
+#
+exec-ddl
+ALTER TABLE t INJECT STATISTICS '[
+  {
+    "columns": ["a"],
+    "created_at": "2018-01-01 1:00:00.00000+00:00",
+    "row_count": 1000,
+    "distinct_count": 3,
+    "null_count": 0,
+    "histo_col_type": "BYTES",
+    "histo_buckets": [
+      {
+        "distinct_range": 0,
+        "num_eq": 10,
+        "num_range": 0,
+        "upper_bound": "\\x43"
+      },
+      {
+        "distinct_range": 0,
+        "num_eq": 990,
+        "num_range": 0,
+        "upper_bound": "\\x89"
+      },
+      {
+        "distinct_range": 0,
+        "num_eq": 10,
+        "num_range": 0,
+        "upper_bound": "\\x8a"
+      },
+      {
+        "distinct_range": 0,
+        "num_eq": 10,
+        "num_range": 0,
+        "upper_bound": "\\x8b"
+      }
+    ]
+  }
+]'
+----
+
+# Containment of an empty array requires a scan over all array entries.
+opt
+SELECT * FROM t@a_idx WHERE a @> '{}'
+----
+index-join t
+ ├── columns: k:1(int!null) a:2(int[]!null)
+ ├── immutable
+ ├── stats: [rows=333.333333]
+ ├── key: (1)
+ ├── fd: (1)-->(2)
+ └── inverted-filter
+      ├── columns: k:1(int!null)
+      ├── inverted expression: /4
+      │    ├── tight: true, unique: false
+      │    └── union spans: ["", ""]
+      ├── stats: [rows=1020]
+      ├── key: (1)
+      └── scan t@a_idx
+           ├── columns: k:1(int!null) a_inverted_key:4(int[]!null)
+           ├── inverted constraint: /4/1
+           │    └── spans: ["", ""]
+           ├── flags: force-index=a_idx
+           ├── stats: [rows=1020, distinct(1)=1000, null(1)=0, distinct(4)=4, null(4)=0]
+           │   histogram(4)=  0    10    0   990    0    10    0    10
+           │                <--- '\x43' --- '\x89' --- '\x8a' --- '\x8b'
+           ├── key: (1)
+           └── fd: (1)-->(4)
+
+# An inverted index scan is preferred for a more selective filter.
+opt
+SELECT * FROM t WHERE a @> '{2}'
+----
+index-join t
+ ├── columns: k:1(int!null) a:2(int[]!null)
+ ├── immutable
+ ├── stats: [rows=111.111111]
+ ├── key: (1)
+ ├── fd: (1)-->(2)
+ └── scan t@a_idx
+      ├── columns: k:1(int!null)
+      ├── inverted constraint: /4/1
+      │    └── spans: ["\x8a", "\x8a"]
+      ├── stats: [rows=10, distinct(4)=1, null(4)=0]
+      │   histogram(4)=  0    10    0    0
+      │                <--- '\x8a' --- '\x8b'
+      └── key: (1)
+
+# A disjunction requires scanning all entries that match either the left or the
+# right.
+opt
+SELECT * FROM t WHERE a @> '{2}' OR a @> '{3}'
+----
+index-join t
+ ├── columns: k:1(int!null) a:2(int[]!null)
+ ├── immutable
+ ├── stats: [rows=333.333333, distinct(2)=3, null(2)=0]
+ ├── key: (1)
+ ├── fd: (1)-->(2)
+ └── inverted-filter
+      ├── columns: k:1(int!null)
+      ├── inverted expression: /4
+      │    ├── tight: true, unique: false
+      │    └── union spans: ["\x8a", "\x8c")
+      ├── stats: [rows=20]
+      ├── key: (1)
+      └── scan t@a_idx
+           ├── columns: k:1(int!null) a_inverted_key:4(int[]!null)
+           ├── inverted constraint: /4/1
+           │    └── spans: ["\x8a", "\x8c")
+           ├── stats: [rows=20, distinct(1)=19.6078431, null(1)=0, distinct(4)=2, null(4)=0]
+           │   histogram(4)=  0    10    0    10
+           │                <--- '\x8a' --- '\x8b'
+           ├── key: (1)
+           └── fd: (1)-->(4)