Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

opt: use histograms for inverted JSON/ARRAY scan statistics #59326

Merged
merged 3 commits into from
Jan 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 14 additions & 33 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@ import (
"reflect"

"github.com/cockroachdb/cockroach/pkg/geo/geoindex"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/sql/opt/constraint"
"github.com/cockroachdb/cockroach/pkg/sql/opt/invertedexpr"
"github.com/cockroachdb/cockroach/pkg/sql/opt/props"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/types"
Expand Down Expand Up @@ -657,7 +655,7 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa
// selectivity, the inverted constraint selectivity, and the partial index
// predicate (if they exist) to the underlying table stats.
if scan.Constraint == nil || scan.Constraint.Spans.Count() < 2 {
sb.constrainScan(scan, scan.Constraint, scan.InvertedConstraint, pred, relProps, s)
sb.constrainScan(scan, scan.Constraint, pred, relProps, s)
sb.finalizeFromCardinality(relProps)
return
}
Expand All @@ -681,17 +679,17 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa

// Get the stats for each span and union them together.
c.InitSingleSpan(&keyCtx, scan.Constraint.Spans.Get(0))
sb.constrainScan(scan, &c, scan.InvertedConstraint, pred, relProps, &spanStatsUnion)
sb.constrainScan(scan, &c, pred, relProps, &spanStatsUnion)
for i, n := 1, scan.Constraint.Spans.Count(); i < n; i++ {
spanStats.CopyFrom(s)
c.InitSingleSpan(&keyCtx, scan.Constraint.Spans.Get(i))
sb.constrainScan(scan, &c, scan.InvertedConstraint, pred, relProps, &spanStats)
sb.constrainScan(scan, &c, pred, relProps, &spanStats)
spanStatsUnion.UnionWith(&spanStats)
}

// Now that we have the correct row count, use the combined spans and the
// partial index predicate (if it exists) to get the correct column stats.
sb.constrainScan(scan, scan.Constraint, scan.InvertedConstraint, pred, relProps, s)
sb.constrainScan(scan, scan.Constraint, pred, relProps, s)

// Copy in the row count and selectivity that were calculated above, if
// less than the values calculated from the combined spans.
Expand Down Expand Up @@ -720,7 +718,6 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa
func (sb *statisticsBuilder) constrainScan(
scan *ScanExpr,
constraint *constraint.Constraint,
invertedConstraint invertedexpr.InvertedSpans,
pred FiltersExpr,
relProps *props.Relational,
s *props.Statistics,
Expand All @@ -731,18 +728,18 @@ func (sb *statisticsBuilder) constrainScan(

// Calculate distinct counts and histograms for inverted constrained columns
// -------------------------------------------------------------------------
if invertedConstraint != nil {
if scan.InvertedConstraint != nil {
// The constrained column is the virtual inverted column in the inverted
// index. Using scan.Cols here would also include the PK, which we don't
// want.
invertedConstrainedCol := scan.Table.ColumnID(idx.VirtualInvertedColumn().Ordinal())
constrainedCols.Add(invertedConstrainedCol)
colSet := opt.MakeColSet(invertedConstrainedCol)
if sb.shouldUseHistogram(relProps, colSet) {
if sb.shouldUseHistogram(relProps) {
// TODO(mjibson): set distinctCount to something correct. Max is
// fine for now because ensureColStat takes the minimum of the
// passed value and colSet's distinct count.
const distinctCount = math.MaxFloat64
colSet := opt.MakeColSet(invertedConstrainedCol)
sb.ensureColStat(colSet, distinctCount, scan, s)

inputStat, _ := sb.colStatFromInput(colSet, scan)
Expand Down Expand Up @@ -783,7 +780,7 @@ func (sb *statisticsBuilder) constrainScan(
// TODO(mgartner): Remove this special case for JSON and ARRAY inverted
// indexes that are constrained by scan.Constraint once they are instead
// constrained by scan.InvertedConstraint.
if idx.IsInverted() && invertedConstraint == nil {
if idx.IsInverted() && scan.InvertedConstraint == nil {
for i, n := 0, constraint.ConstrainedColumns(sb.evalCtx); i < n; i++ {
numUnappliedConjuncts += sb.numConjunctsInConstraint(constraint, i)
}
Expand Down Expand Up @@ -837,7 +834,7 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro
inputColStat := sb.colStatTable(scan.Table, colSet)
colStat := sb.copyColStat(colSet, s, inputColStat)

if sb.shouldUseHistogram(relProps, colSet) {
if sb.shouldUseHistogram(relProps) {
colStat.Histogram = inputColStat.Histogram
}

Expand Down Expand Up @@ -2582,27 +2579,11 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts(
}
}

func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational, cols opt.ColSet) bool {
func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational) bool {
// If we know that the cardinality is below a certain threshold (e.g., due to
// a constraint on a key column), don't bother adding the overhead of
// creating a histogram.
if relProps.Cardinality.Max < minCardinalityForHistogram {
return false
}
allowHist := true
cols.ForEach(func(col opt.ColumnID) {
colTyp := sb.md.ColumnMeta(col).Type
switch colTyp {
case types.Geometry, types.Geography:
// Special case these since ColumnTypeIsInvertedIndexable returns true for
// them, but they are supported in histograms now.
default:
if colinfo.ColumnTypeIsInvertedIndexable(colTyp) {
allowHist = false
}
}
})
return allowHist
return relProps.Cardinality.Max >= minCardinalityForHistogram
}

// rowsProcessed calculates and returns the number of rows processed by the
Expand Down Expand Up @@ -3052,7 +3033,7 @@ func (sb *statisticsBuilder) applyIndexConstraint(
sb.updateDistinctCountFromUnappliedConjuncts(col, e, s, numConjuncts, lowerBound)
}

if !sb.shouldUseHistogram(relProps, constrainedCols) {
if !sb.shouldUseHistogram(relProps) {
return constrainedCols, histCols
}

Expand Down Expand Up @@ -3107,12 +3088,12 @@ func (sb *statisticsBuilder) applyConstraintSet(
continue
}

cols := opt.MakeColSet(col)
if !sb.shouldUseHistogram(relProps, cols) {
if !sb.shouldUseHistogram(relProps) {
continue
}

// Calculate histogram.
cols := opt.MakeColSet(col)
if sb.updateHistogram(c, cols, e, s) {
histCols.UnionWith(cols)
}
Expand Down
130 changes: 130 additions & 0 deletions pkg/sql/opt/memo/testdata/stats/inverted-array
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
exec-ddl
CREATE TABLE t (
k INT PRIMARY KEY,
a INT[],
INVERTED INDEX a_idx (a)
)
----

# Histogram boundaries are for ARRAY values `{}`, `{1}`, `{2}`, `{3}`. The
# row_count is lower than the sum of the histogram buckets' num_eq values
# because some rows can have multiple inverted index entries, for example
# `{1, 2}`. There are:
#
# - 1000 rows total
# - 10 empty arrays
# - 990 arrays encoded into 1010 index entries
#
exec-ddl
ALTER TABLE t INJECT STATISTICS '[
{
"columns": ["a"],
"created_at": "2018-01-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 3,
"null_count": 0,
"histo_col_type": "BYTES",
"histo_buckets": [
{
"distinct_range": 0,
"num_eq": 10,
"num_range": 0,
"upper_bound": "\\x43"
},
{
"distinct_range": 0,
"num_eq": 990,
"num_range": 0,
"upper_bound": "\\x89"
},
{
"distinct_range": 0,
"num_eq": 10,
"num_range": 0,
"upper_bound": "\\x8a"
},
{
"distinct_range": 0,
"num_eq": 10,
"num_range": 0,
"upper_bound": "\\x8b"
}
]
}
]'
----

# Containment of an empty array requires a scan over all array entries.
opt
SELECT * FROM t@a_idx WHERE a @> '{}'
----
index-join t
├── columns: k:1(int!null) a:2(int[]!null)
├── immutable
├── stats: [rows=333.333333]
├── key: (1)
├── fd: (1)-->(2)
└── inverted-filter
├── columns: k:1(int!null)
├── inverted expression: /4
│ ├── tight: true, unique: false
│ └── union spans: ["", ""]
├── stats: [rows=1020]
├── key: (1)
└── scan t@a_idx
├── columns: k:1(int!null) a_inverted_key:4(int[]!null)
├── inverted constraint: /4/1
│ └── spans: ["", ""]
├── flags: force-index=a_idx
├── stats: [rows=1020, distinct(1)=1000, null(1)=0, distinct(4)=4, null(4)=0]
│ histogram(4)= 0 10 0 990 0 10 0 10
│ <--- '\x43' --- '\x89' --- '\x8a' --- '\x8b'
├── key: (1)
└── fd: (1)-->(4)

# An inverted index scan is preferred for a more selective filter.
opt
SELECT * FROM t WHERE a @> '{2}'
----
index-join t
├── columns: k:1(int!null) a:2(int[]!null)
├── immutable
├── stats: [rows=111.111111]
├── key: (1)
├── fd: (1)-->(2)
└── scan t@a_idx
├── columns: k:1(int!null)
├── inverted constraint: /4/1
│ └── spans: ["\x8a", "\x8a"]
├── stats: [rows=10, distinct(4)=1, null(4)=0]
│ histogram(4)= 0 10 0 0
│ <--- '\x8a' --- '\x8b'
└── key: (1)

# A disjunction requires scanning all entries that match either the left or the
# right.
opt
SELECT * FROM t WHERE a @> '{2}' OR a @> '{3}'
----
index-join t
├── columns: k:1(int!null) a:2(int[]!null)
├── immutable
├── stats: [rows=333.333333, distinct(2)=3, null(2)=0]
├── key: (1)
├── fd: (1)-->(2)
└── inverted-filter
├── columns: k:1(int!null)
├── inverted expression: /4
│ ├── tight: true, unique: false
│ └── union spans: ["\x8a", "\x8c")
├── stats: [rows=20]
├── key: (1)
└── scan t@a_idx
├── columns: k:1(int!null) a_inverted_key:4(int[]!null)
├── inverted constraint: /4/1
│ └── spans: ["\x8a", "\x8c")
├── stats: [rows=20, distinct(1)=19.6078431, null(1)=0, distinct(4)=2, null(4)=0]
│ histogram(4)= 0 10 0 10
│ <--- '\x8a' --- '\x8b'
├── key: (1)
└── fd: (1)-->(4)
Loading