Merge #59326

59326: opt: use histograms for inverted JSON/ARRAY scan statistics r=rytaft a=mgartner #### memo: simplify statisticsBuilder.constrainScan A `ScanExpr`, which includes an `InvertedConstraint`, is always passed to `statisticsBuilder.constrainScan`, therefore there is no need to pass the `InvertedConstraint` separately. Release note: None #### memo: rename multi-column-inverted-geo test file to inverted-geo-multi-column Release note: None #### opt: use histograms for inverted JSON/ARRAY scan statistics Fixes #56870 Release note (performance improvement): The optimizer now uses collected histogram statistics to better estimate the cost of JSON and ARRAY inverted index scans, which may lead to more efficient query plans. Co-authored-by: Marcus Gartner <[email protected]>
cockroachdb · Jan 26, 2021 · d86781c · d86781c
2 parents 6e26eb5 + a0f98f2
commit d86781c
Show file tree

Hide file tree

Showing 9 changed files with 765 additions and 332 deletions.
diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
@@ -15,10 +15,8 @@ import (
 	"reflect"
 
 	"github.com/cockroachdb/cockroach/pkg/geo/geoindex"
-	"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt/constraint"
-	"github.com/cockroachdb/cockroach/pkg/sql/opt/invertedexpr"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt/props"
 	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
 	"github.com/cockroachdb/cockroach/pkg/sql/types"
@@ -657,7 +655,7 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa
 	// selectivity, the inverted constraint selectivity, and the partial index
 	// predicate (if they exist) to the underlying table stats.
 	if scan.Constraint == nil || scan.Constraint.Spans.Count() < 2 {
-		sb.constrainScan(scan, scan.Constraint, scan.InvertedConstraint, pred, relProps, s)
+		sb.constrainScan(scan, scan.Constraint, pred, relProps, s)
 		sb.finalizeFromCardinality(relProps)
 		return
 	}
@@ -681,17 +679,17 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa
 
 	// Get the stats for each span and union them together.
 	c.InitSingleSpan(&keyCtx, scan.Constraint.Spans.Get(0))
-	sb.constrainScan(scan, &c, scan.InvertedConstraint, pred, relProps, &spanStatsUnion)
+	sb.constrainScan(scan, &c, pred, relProps, &spanStatsUnion)
 	for i, n := 1, scan.Constraint.Spans.Count(); i < n; i++ {
 		spanStats.CopyFrom(s)
 		c.InitSingleSpan(&keyCtx, scan.Constraint.Spans.Get(i))
-		sb.constrainScan(scan, &c, scan.InvertedConstraint, pred, relProps, &spanStats)
+		sb.constrainScan(scan, &c, pred, relProps, &spanStats)
 		spanStatsUnion.UnionWith(&spanStats)
 	}
 
 	// Now that we have the correct row count, use the combined spans and the
 	// partial index predicate (if it exists) to get the correct column stats.
-	sb.constrainScan(scan, scan.Constraint, scan.InvertedConstraint, pred, relProps, s)
+	sb.constrainScan(scan, scan.Constraint, pred, relProps, s)
 
 	// Copy in the row count and selectivity that were calculated above, if
 	// less than the values calculated from the combined spans.
@@ -720,7 +718,6 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa
 func (sb *statisticsBuilder) constrainScan(
 	scan *ScanExpr,
 	constraint *constraint.Constraint,
-	invertedConstraint invertedexpr.InvertedSpans,
 	pred FiltersExpr,
 	relProps *props.Relational,
 	s *props.Statistics,
@@ -731,18 +728,18 @@ func (sb *statisticsBuilder) constrainScan(
 
 	// Calculate distinct counts and histograms for inverted constrained columns
 	// -------------------------------------------------------------------------
-	if invertedConstraint != nil {
+	if scan.InvertedConstraint != nil {
 		// The constrained column is the virtual inverted column in the inverted
 		// index. Using scan.Cols here would also include the PK, which we don't
 		// want.
 		invertedConstrainedCol := scan.Table.ColumnID(idx.VirtualInvertedColumn().Ordinal())
 		constrainedCols.Add(invertedConstrainedCol)
-		colSet := opt.MakeColSet(invertedConstrainedCol)
-		if sb.shouldUseHistogram(relProps, colSet) {
+		if sb.shouldUseHistogram(relProps) {
 			// TODO(mjibson): set distinctCount to something correct. Max is
 			// fine for now because ensureColStat takes the minimum of the
 			// passed value and colSet's distinct count.
 			const distinctCount = math.MaxFloat64
+			colSet := opt.MakeColSet(invertedConstrainedCol)
 			sb.ensureColStat(colSet, distinctCount, scan, s)
 
 			inputStat, _ := sb.colStatFromInput(colSet, scan)
@@ -783,7 +780,7 @@ func (sb *statisticsBuilder) constrainScan(
 		// TODO(mgartner): Remove this special case for JSON and ARRAY inverted
 		// indexes that are constrained by scan.Constraint once they are instead
 		// constrained by scan.InvertedConstraint.
-		if idx.IsInverted() && invertedConstraint == nil {
+		if idx.IsInverted() && scan.InvertedConstraint == nil {
 			for i, n := 0, constraint.ConstrainedColumns(sb.evalCtx); i < n; i++ {
 				numUnappliedConjuncts += sb.numConjunctsInConstraint(constraint, i)
 			}
@@ -837,7 +834,7 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro
 	inputColStat := sb.colStatTable(scan.Table, colSet)
 	colStat := sb.copyColStat(colSet, s, inputColStat)
 
-	if sb.shouldUseHistogram(relProps, colSet) {
+	if sb.shouldUseHistogram(relProps) {
 		colStat.Histogram = inputColStat.Histogram
 	}
 
@@ -2582,27 +2579,11 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts(
 	}
 }
 
-func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational, cols opt.ColSet) bool {
+func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational) bool {
 	// If we know that the cardinality is below a certain threshold (e.g., due to
 	// a constraint on a key column), don't bother adding the overhead of
 	// creating a histogram.
-	if relProps.Cardinality.Max < minCardinalityForHistogram {
-		return false
-	}
-	allowHist := true
-	cols.ForEach(func(col opt.ColumnID) {
-		colTyp := sb.md.ColumnMeta(col).Type
-		switch colTyp {
-		case types.Geometry, types.Geography:
-			// Special case these since ColumnTypeIsInvertedIndexable returns true for
-			// them, but they are supported in histograms now.
-		default:
-			if colinfo.ColumnTypeIsInvertedIndexable(colTyp) {
-				allowHist = false
-			}
-		}
-	})
-	return allowHist
+	return relProps.Cardinality.Max >= minCardinalityForHistogram
 }
 
 // rowsProcessed calculates and returns the number of rows processed by the
@@ -3052,7 +3033,7 @@ func (sb *statisticsBuilder) applyIndexConstraint(
 		sb.updateDistinctCountFromUnappliedConjuncts(col, e, s, numConjuncts, lowerBound)
 	}
 
-	if !sb.shouldUseHistogram(relProps, constrainedCols) {
+	if !sb.shouldUseHistogram(relProps) {
 		return constrainedCols, histCols
 	}
 
@@ -3107,12 +3088,12 @@ func (sb *statisticsBuilder) applyConstraintSet(
 			continue
 		}
 
-		cols := opt.MakeColSet(col)
-		if !sb.shouldUseHistogram(relProps, cols) {
+		if !sb.shouldUseHistogram(relProps) {
 			continue
 		}
 
 		// Calculate histogram.
+		cols := opt.MakeColSet(col)
 		if sb.updateHistogram(c, cols, e, s) {
 			histCols.UnionWith(cols)
 		}

diff --git a/pkg/sql/opt/memo/testdata/stats/inverted-array b/pkg/sql/opt/memo/testdata/stats/inverted-array
@@ -0,0 +1,130 @@
+exec-ddl
+CREATE TABLE t (
+  k INT PRIMARY KEY,
+  a INT[],
+  INVERTED INDEX a_idx (a)
+)
+----
+
+# Histogram boundaries are for ARRAY values `{}`, `{1}`, `{2}`, `{3}`. The
+# row_count is lower than the sum of the histogram buckets' num_eq values
+# because some rows can have multiple inverted index entries, for example
+# `{1, 2}`. There are:
+#
+#   - 1000 rows total
+#   - 10 empty arrays
+#   - 990 arrays encoded into 1010 index entries
+#
+exec-ddl
+ALTER TABLE t INJECT STATISTICS '[
+  {
+    "columns": ["a"],
+    "created_at": "2018-01-01 1:00:00.00000+00:00",
+    "row_count": 1000,
+    "distinct_count": 3,
+    "null_count": 0,
+    "histo_col_type": "BYTES",
+    "histo_buckets": [
+      {
+        "distinct_range": 0,
+        "num_eq": 10,
+        "num_range": 0,
+        "upper_bound": "\\x43"
+      },
+      {
+        "distinct_range": 0,
+        "num_eq": 990,
+        "num_range": 0,
+        "upper_bound": "\\x89"
+      },
+      {
+        "distinct_range": 0,
+        "num_eq": 10,
+        "num_range": 0,
+        "upper_bound": "\\x8a"
+      },
+      {
+        "distinct_range": 0,
+        "num_eq": 10,
+        "num_range": 0,
+        "upper_bound": "\\x8b"
+      }
+    ]
+  }
+]'
+----
+
+# Containment of an empty array requires a scan over all array entries.
+opt
+SELECT * FROM t@a_idx WHERE a @> '{}'
+----
+index-join t
+ ├── columns: k:1(int!null) a:2(int[]!null)
+ ├── immutable
+ ├── stats: [rows=333.333333]
+ ├── key: (1)
+ ├── fd: (1)-->(2)
+ └── inverted-filter
+      ├── columns: k:1(int!null)
+      ├── inverted expression: /4
+      │    ├── tight: true, unique: false
+      │    └── union spans: ["", ""]
+      ├── stats: [rows=1020]
+      ├── key: (1)
+      └── scan t@a_idx
+           ├── columns: k:1(int!null) a_inverted_key:4(int[]!null)
+           ├── inverted constraint: /4/1
+           │    └── spans: ["", ""]
+           ├── flags: force-index=a_idx
+           ├── stats: [rows=1020, distinct(1)=1000, null(1)=0, distinct(4)=4, null(4)=0]
+           │   histogram(4)=  0    10    0   990    0    10    0    10
+           │                <--- '\x43' --- '\x89' --- '\x8a' --- '\x8b'
+           ├── key: (1)
+           └── fd: (1)-->(4)
+
+# An inverted index scan is preferred for a more selective filter.
+opt
+SELECT * FROM t WHERE a @> '{2}'
+----
+index-join t
+ ├── columns: k:1(int!null) a:2(int[]!null)
+ ├── immutable
+ ├── stats: [rows=111.111111]
+ ├── key: (1)
+ ├── fd: (1)-->(2)
+ └── scan t@a_idx
+      ├── columns: k:1(int!null)
+      ├── inverted constraint: /4/1
+      │    └── spans: ["\x8a", "\x8a"]
+      ├── stats: [rows=10, distinct(4)=1, null(4)=0]
+      │   histogram(4)=  0    10    0    0
+      │                <--- '\x8a' --- '\x8b'
+      └── key: (1)
+
+# A disjunction requires scanning all entries that match either the left or the
+# right.
+opt
+SELECT * FROM t WHERE a @> '{2}' OR a @> '{3}'
+----
+index-join t
+ ├── columns: k:1(int!null) a:2(int[]!null)
+ ├── immutable
+ ├── stats: [rows=333.333333, distinct(2)=3, null(2)=0]
+ ├── key: (1)
+ ├── fd: (1)-->(2)
+ └── inverted-filter
+      ├── columns: k:1(int!null)
+      ├── inverted expression: /4
+      │    ├── tight: true, unique: false
+      │    └── union spans: ["\x8a", "\x8c")
+      ├── stats: [rows=20]
+      ├── key: (1)
+      └── scan t@a_idx
+           ├── columns: k:1(int!null) a_inverted_key:4(int[]!null)
+           ├── inverted constraint: /4/1
+           │    └── spans: ["\x8a", "\x8c")
+           ├── stats: [rows=20, distinct(1)=19.6078431, null(1)=0, distinct(4)=2, null(4)=0]
+           │   histogram(4)=  0    10    0    10
+           │                <--- '\x8a' --- '\x8b'
+           ├── key: (1)
+           └── fd: (1)-->(4)
diff --git a/.../testdata/stats/multi-column-inverted-geo → .../testdata/stats/inverted-geo-multi-column b/.../testdata/stats/multi-column-inverted-geo → .../testdata/stats/inverted-geo-multi-column