Skip to content

Commit

Permalink
Merge #59326
Browse files Browse the repository at this point in the history
59326: opt: use histograms for inverted JSON/ARRAY scan statistics r=rytaft a=mgartner

#### memo: simplify statisticsBuilder.constrainScan

A `ScanExpr`, which includes an `InvertedConstraint`, is always passed
to `statisticsBuilder.constrainScan`, so there is no need to pass
the `InvertedConstraint` separately.

Release note: None

#### memo: rename multi-column-inverted-geo test file to inverted-geo-multi-column

Release note: None

#### opt: use histograms for inverted JSON/ARRAY scan statistics

Fixes #56870

Release note (performance improvement): The optimizer now uses collected
histogram statistics to better estimate the cost of JSON and ARRAY
inverted index scans, which may lead to more efficient query plans.


Co-authored-by: Marcus Gartner <[email protected]>
  • Loading branch information
craig[bot] and mgartner committed Jan 26, 2021
2 parents 6e26eb5 + a0f98f2 commit d86781c
Show file tree
Hide file tree
Showing 9 changed files with 765 additions and 332 deletions.
47 changes: 14 additions & 33 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@ import (
"reflect"

"github.com/cockroachdb/cockroach/pkg/geo/geoindex"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/sql/opt/constraint"
"github.com/cockroachdb/cockroach/pkg/sql/opt/invertedexpr"
"github.com/cockroachdb/cockroach/pkg/sql/opt/props"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/types"
Expand Down Expand Up @@ -657,7 +655,7 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa
// selectivity, the inverted constraint selectivity, and the partial index
// predicate (if they exist) to the underlying table stats.
if scan.Constraint == nil || scan.Constraint.Spans.Count() < 2 {
sb.constrainScan(scan, scan.Constraint, scan.InvertedConstraint, pred, relProps, s)
sb.constrainScan(scan, scan.Constraint, pred, relProps, s)
sb.finalizeFromCardinality(relProps)
return
}
Expand All @@ -681,17 +679,17 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa

// Get the stats for each span and union them together.
c.InitSingleSpan(&keyCtx, scan.Constraint.Spans.Get(0))
sb.constrainScan(scan, &c, scan.InvertedConstraint, pred, relProps, &spanStatsUnion)
sb.constrainScan(scan, &c, pred, relProps, &spanStatsUnion)
for i, n := 1, scan.Constraint.Spans.Count(); i < n; i++ {
spanStats.CopyFrom(s)
c.InitSingleSpan(&keyCtx, scan.Constraint.Spans.Get(i))
sb.constrainScan(scan, &c, scan.InvertedConstraint, pred, relProps, &spanStats)
sb.constrainScan(scan, &c, pred, relProps, &spanStats)
spanStatsUnion.UnionWith(&spanStats)
}

// Now that we have the correct row count, use the combined spans and the
// partial index predicate (if it exists) to get the correct column stats.
sb.constrainScan(scan, scan.Constraint, scan.InvertedConstraint, pred, relProps, s)
sb.constrainScan(scan, scan.Constraint, pred, relProps, s)

// Copy in the row count and selectivity that were calculated above, if
// less than the values calculated from the combined spans.
Expand Down Expand Up @@ -720,7 +718,6 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa
func (sb *statisticsBuilder) constrainScan(
scan *ScanExpr,
constraint *constraint.Constraint,
invertedConstraint invertedexpr.InvertedSpans,
pred FiltersExpr,
relProps *props.Relational,
s *props.Statistics,
Expand All @@ -731,18 +728,18 @@ func (sb *statisticsBuilder) constrainScan(

// Calculate distinct counts and histograms for inverted constrained columns
// -------------------------------------------------------------------------
if invertedConstraint != nil {
if scan.InvertedConstraint != nil {
// The constrained column is the virtual inverted column in the inverted
// index. Using scan.Cols here would also include the PK, which we don't
// want.
invertedConstrainedCol := scan.Table.ColumnID(idx.VirtualInvertedColumn().Ordinal())
constrainedCols.Add(invertedConstrainedCol)
colSet := opt.MakeColSet(invertedConstrainedCol)
if sb.shouldUseHistogram(relProps, colSet) {
if sb.shouldUseHistogram(relProps) {
// TODO(mjibson): set distinctCount to something correct. Max is
// fine for now because ensureColStat takes the minimum of the
// passed value and colSet's distinct count.
const distinctCount = math.MaxFloat64
colSet := opt.MakeColSet(invertedConstrainedCol)
sb.ensureColStat(colSet, distinctCount, scan, s)

inputStat, _ := sb.colStatFromInput(colSet, scan)
Expand Down Expand Up @@ -783,7 +780,7 @@ func (sb *statisticsBuilder) constrainScan(
// TODO(mgartner): Remove this special case for JSON and ARRAY inverted
// indexes that are constrained by scan.Constraint once they are instead
// constrained by scan.InvertedConstraint.
if idx.IsInverted() && invertedConstraint == nil {
if idx.IsInverted() && scan.InvertedConstraint == nil {
for i, n := 0, constraint.ConstrainedColumns(sb.evalCtx); i < n; i++ {
numUnappliedConjuncts += sb.numConjunctsInConstraint(constraint, i)
}
Expand Down Expand Up @@ -837,7 +834,7 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro
inputColStat := sb.colStatTable(scan.Table, colSet)
colStat := sb.copyColStat(colSet, s, inputColStat)

if sb.shouldUseHistogram(relProps, colSet) {
if sb.shouldUseHistogram(relProps) {
colStat.Histogram = inputColStat.Histogram
}

Expand Down Expand Up @@ -2582,27 +2579,11 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts(
}
}

func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational, cols opt.ColSet) bool {
func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational) bool {
// If we know that the cardinality is below a certain threshold (e.g., due to
// a constraint on a key column), don't bother adding the overhead of
// creating a histogram.
if relProps.Cardinality.Max < minCardinalityForHistogram {
return false
}
allowHist := true
cols.ForEach(func(col opt.ColumnID) {
colTyp := sb.md.ColumnMeta(col).Type
switch colTyp {
case types.Geometry, types.Geography:
// Special case these since ColumnTypeIsInvertedIndexable returns true for
// them, but they are supported in histograms now.
default:
if colinfo.ColumnTypeIsInvertedIndexable(colTyp) {
allowHist = false
}
}
})
return allowHist
return relProps.Cardinality.Max >= minCardinalityForHistogram
}

// rowsProcessed calculates and returns the number of rows processed by the
Expand Down Expand Up @@ -3052,7 +3033,7 @@ func (sb *statisticsBuilder) applyIndexConstraint(
sb.updateDistinctCountFromUnappliedConjuncts(col, e, s, numConjuncts, lowerBound)
}

if !sb.shouldUseHistogram(relProps, constrainedCols) {
if !sb.shouldUseHistogram(relProps) {
return constrainedCols, histCols
}

Expand Down Expand Up @@ -3107,12 +3088,12 @@ func (sb *statisticsBuilder) applyConstraintSet(
continue
}

cols := opt.MakeColSet(col)
if !sb.shouldUseHistogram(relProps, cols) {
if !sb.shouldUseHistogram(relProps) {
continue
}

// Calculate histogram.
cols := opt.MakeColSet(col)
if sb.updateHistogram(c, cols, e, s) {
histCols.UnionWith(cols)
}
Expand Down
130 changes: 130 additions & 0 deletions pkg/sql/opt/memo/testdata/stats/inverted-array
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
exec-ddl
CREATE TABLE t (
k INT PRIMARY KEY,
a INT[],
INVERTED INDEX a_idx (a)
)
----

# Histogram boundaries are for array values `{}`, `{1}`, `{2}`, `{3}`. The
# row_count is lower than the sum of the histogram buckets num_eq's because some
# rows can have multiple inverted index entries, for example `{1, 2}`. There
# are:
#
# - 1000 rows total
# - 10 empty arrays
# - 990 arrays encoded into 1010 index entries
#
exec-ddl
ALTER TABLE t INJECT STATISTICS '[
{
"columns": ["a"],
"created_at": "2018-01-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 3,
"null_count": 0,
"histo_col_type": "BYTES",
"histo_buckets": [
{
"distinct_range": 0,
"num_eq": 10,
"num_range": 0,
"upper_bound": "\\x43"
},
{
"distinct_range": 0,
"num_eq": 990,
"num_range": 0,
"upper_bound": "\\x89"
},
{
"distinct_range": 0,
"num_eq": 10,
"num_range": 0,
"upper_bound": "\\x8a"
},
{
"distinct_range": 0,
"num_eq": 10,
"num_range": 0,
"upper_bound": "\\x8b"
}
]
}
]'
----

# Containment of an empty array requires a scan over all array entries.
opt
SELECT * FROM t@a_idx WHERE a @> '{}'
----
index-join t
├── columns: k:1(int!null) a:2(int[]!null)
├── immutable
├── stats: [rows=333.333333]
├── key: (1)
├── fd: (1)-->(2)
└── inverted-filter
├── columns: k:1(int!null)
├── inverted expression: /4
│ ├── tight: true, unique: false
│ └── union spans: ["", ""]
├── stats: [rows=1020]
├── key: (1)
└── scan t@a_idx
├── columns: k:1(int!null) a_inverted_key:4(int[]!null)
├── inverted constraint: /4/1
│ └── spans: ["", ""]
├── flags: force-index=a_idx
├── stats: [rows=1020, distinct(1)=1000, null(1)=0, distinct(4)=4, null(4)=0]
│ histogram(4)= 0 10 0 990 0 10 0 10
│ <--- '\x43' --- '\x89' --- '\x8a' --- '\x8b'
├── key: (1)
└── fd: (1)-->(4)

# An inverted index scan is preferred for a more selective filter.
opt
SELECT * FROM t WHERE a @> '{2}'
----
index-join t
├── columns: k:1(int!null) a:2(int[]!null)
├── immutable
├── stats: [rows=111.111111]
├── key: (1)
├── fd: (1)-->(2)
└── scan t@a_idx
├── columns: k:1(int!null)
├── inverted constraint: /4/1
│ └── spans: ["\x8a", "\x8a"]
├── stats: [rows=10, distinct(4)=1, null(4)=0]
│ histogram(4)= 0 10 0 0
│ <--- '\x8a' --- '\x8b'
└── key: (1)

# A disjunction requires scanning all entries that match either the left or the
# right.
opt
SELECT * FROM t WHERE a @> '{2}' OR a @> '{3}'
----
index-join t
├── columns: k:1(int!null) a:2(int[]!null)
├── immutable
├── stats: [rows=333.333333, distinct(2)=3, null(2)=0]
├── key: (1)
├── fd: (1)-->(2)
└── inverted-filter
├── columns: k:1(int!null)
├── inverted expression: /4
│ ├── tight: true, unique: false
│ └── union spans: ["\x8a", "\x8c")
├── stats: [rows=20]
├── key: (1)
└── scan t@a_idx
├── columns: k:1(int!null) a_inverted_key:4(int[]!null)
├── inverted constraint: /4/1
│ └── spans: ["\x8a", "\x8c")
├── stats: [rows=20, distinct(1)=19.6078431, null(1)=0, distinct(4)=2, null(4)=0]
│ histogram(4)= 0 10 0 10
│ <--- '\x8a' --- '\x8b'
├── key: (1)
└── fd: (1)-->(4)
Loading

0 comments on commit d86781c

Please sign in to comment.