Skip to content

Commit

Permalink
opt: use histograms for inverted JSON/ARRAY scan statistics
Browse files Browse the repository at this point in the history
Fixes cockroachdb#56870

Release note (performance improvement): The optimizer now uses collected
histogram statistics to better estimate the cost of JSON and ARRAY
inverted index scans, which may lead to more efficient query plans.
  • Loading branch information
mgartner committed Jan 22, 2021
1 parent 5d96c92 commit 11a6a69
Show file tree
Hide file tree
Showing 8 changed files with 756 additions and 321 deletions.
30 changes: 8 additions & 22 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ import (
"reflect"

"github.com/cockroachdb/cockroach/pkg/geo/geoindex"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/sql/opt/constraint"
"github.com/cockroachdb/cockroach/pkg/sql/opt/props"
Expand Down Expand Up @@ -735,12 +734,12 @@ func (sb *statisticsBuilder) constrainScan(
// want.
invertedConstrainedCol := scan.Table.ColumnID(idx.VirtualInvertedColumn().Ordinal())
constrainedCols.Add(invertedConstrainedCol)
colSet := opt.MakeColSet(invertedConstrainedCol)
if sb.shouldUseHistogram(relProps, colSet) {
if sb.shouldUseHistogram(relProps) {
// TODO(mjibson): set distinctCount to something correct. Max is
// fine for now because ensureColStat takes the minimum of the
// passed value and colSet's distinct count.
const distinctCount = math.MaxFloat64
colSet := opt.MakeColSet(invertedConstrainedCol)
sb.ensureColStat(colSet, distinctCount, scan, s)

inputStat, _ := sb.colStatFromInput(colSet, scan)
Expand Down Expand Up @@ -835,7 +834,7 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro
inputColStat := sb.colStatTable(scan.Table, colSet)
colStat := sb.copyColStat(colSet, s, inputColStat)

if sb.shouldUseHistogram(relProps, colSet) {
if sb.shouldUseHistogram(relProps) {
colStat.Histogram = inputColStat.Histogram
}

Expand Down Expand Up @@ -2580,27 +2579,14 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts(
}
}

func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational, cols opt.ColSet) bool {
func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational) bool {
// If we know that the cardinality is below a certain threshold (e.g., due to
// a constraint on a key column), don't bother adding the overhead of
// creating a histogram.
if relProps.Cardinality.Max < minCardinalityForHistogram {
return false
}
allowHist := true
cols.ForEach(func(col opt.ColumnID) {
colTyp := sb.md.ColumnMeta(col).Type
switch colTyp {
case types.Geometry, types.Geography:
// Special case these since ColumnTypeIsInvertedIndexable returns true for
// them, but they are supported in histograms now.
default:
if colinfo.ColumnTypeIsInvertedIndexable(colTyp) {
allowHist = false
}
}
})
return allowHist
return true
}

// rowsProcessed calculates and returns the number of rows processed by the
Expand Down Expand Up @@ -3050,7 +3036,7 @@ func (sb *statisticsBuilder) applyIndexConstraint(
sb.updateDistinctCountFromUnappliedConjuncts(col, e, s, numConjuncts, lowerBound)
}

if !sb.shouldUseHistogram(relProps, constrainedCols) {
if !sb.shouldUseHistogram(relProps) {
return constrainedCols, histCols
}

Expand Down Expand Up @@ -3105,12 +3091,12 @@ func (sb *statisticsBuilder) applyConstraintSet(
continue
}

cols := opt.MakeColSet(col)
if !sb.shouldUseHistogram(relProps, cols) {
if !sb.shouldUseHistogram(relProps) {
continue
}

// Calculate histogram.
cols := opt.MakeColSet(col)
if sb.updateHistogram(c, cols, e, s) {
histCols.UnionWith(cols)
}
Expand Down
130 changes: 130 additions & 0 deletions pkg/sql/opt/memo/testdata/stats/inverted-array
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
exec-ddl
CREATE TABLE t (
k INT PRIMARY KEY,
a INT[],
INVERTED INDEX a_idx (a)
)
----

# Histogram boundaries are for ARRAY values `{}`, `{1}`, `{2}`, `{3}`. The
# row_count is lower than the sum of the histogram buckets' num_eq values because
# some rows can have multiple inverted index entries, for example `{1, 2}`. There
# are:
#
# - 1000 rows total
# - 10 empty arrays
# - 990 arrays encoded into 1010 index entries
#
exec-ddl
ALTER TABLE t INJECT STATISTICS '[
{
"columns": ["a"],
"created_at": "2018-01-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 3,
"null_count": 0,
"histo_col_type": "BYTES",
"histo_buckets": [
{
"distinct_range": 0,
"num_eq": 10,
"num_range": 0,
"upper_bound": "\\x43"
},
{
"distinct_range": 0,
"num_eq": 990,
"num_range": 0,
"upper_bound": "\\x89"
},
{
"distinct_range": 0,
"num_eq": 10,
"num_range": 0,
"upper_bound": "\\x8a"
},
{
"distinct_range": 0,
"num_eq": 10,
"num_range": 0,
"upper_bound": "\\x8b"
}
]
}
]'
----

# Containment of an empty array requires a scan over all array entries.
opt
SELECT * FROM t@a_idx WHERE a @> '{}'
----
index-join t
├── columns: k:1(int!null) a:2(int[]!null)
├── immutable
├── stats: [rows=333.333333]
├── key: (1)
├── fd: (1)-->(2)
└── inverted-filter
├── columns: k:1(int!null)
├── inverted expression: /4
│ ├── tight: true, unique: false
│ └── union spans: ["", ""]
├── stats: [rows=1020]
├── key: (1)
└── scan t@a_idx
├── columns: k:1(int!null) a_inverted_key:4(int[]!null)
├── inverted constraint: /4/1
│ └── spans: ["", ""]
├── flags: force-index=a_idx
├── stats: [rows=1020, distinct(1)=1000, null(1)=0, distinct(4)=4, null(4)=0]
│ histogram(4)= 0 10 0 990 0 10 0 10
│ <--- '\x43' --- '\x89' --- '\x8a' --- '\x8b'
├── key: (1)
└── fd: (1)-->(4)

# An inverted index scan is preferred for a more selective filter.
opt
SELECT * FROM t WHERE a @> '{2}'
----
index-join t
├── columns: k:1(int!null) a:2(int[]!null)
├── immutable
├── stats: [rows=111.111111]
├── key: (1)
├── fd: (1)-->(2)
└── scan t@a_idx
├── columns: k:1(int!null)
├── inverted constraint: /4/1
│ └── spans: ["\x8a", "\x8a"]
├── stats: [rows=10, distinct(4)=1, null(4)=0]
│ histogram(4)= 0 10 0 0
│ <--- '\x8a' --- '\x8b'
└── key: (1)

# A disjunction requires scanning all entries that match either the left or the
# right.
opt
SELECT * FROM t WHERE a @> '{2}' OR a @> '{3}'
----
index-join t
├── columns: k:1(int!null) a:2(int[]!null)
├── immutable
├── stats: [rows=333.333333, distinct(2)=3, null(2)=0]
├── key: (1)
├── fd: (1)-->(2)
└── inverted-filter
├── columns: k:1(int!null)
├── inverted expression: /4
│ ├── tight: true, unique: false
│ └── union spans: ["\x8a", "\x8c")
├── stats: [rows=20]
├── key: (1)
└── scan t@a_idx
├── columns: k:1(int!null) a_inverted_key:4(int[]!null)
├── inverted constraint: /4/1
│ └── spans: ["\x8a", "\x8c")
├── stats: [rows=20, distinct(1)=19.6078431, null(1)=0, distinct(4)=2, null(4)=0]
│ histogram(4)= 0 10 0 10
│ <--- '\x8a' --- '\x8b'
├── key: (1)
└── fd: (1)-->(4)
Loading

0 comments on commit 11a6a69

Please sign in to comment.