Skip to content

Commit

Permalink
opt: permit coexistance of inv, forward histograms
Browse files Browse the repository at this point in the history
Previously, when trying to use persisted histograms to filter an
inverted column, if the histogram read from disk was a "forward
histogram", the operation would fail with an internal error.

This is because inverted histograms and forward histograms use different
datatypes on disk, and it's not possible to compare a forward histogram
(represented as the datatype of the column) with the inverted constraint
(represented as DBytes).

Now, this problem is corrected by making sure that we only store forward
histogram data in forward histogram column sets, and likewise for
inverted histogram data being only stored in inverted histogram column
sets.

Release note: None
  • Loading branch information
jordanlewis committed Jul 29, 2022
1 parent 293c43c commit 963deb8
Show file tree
Hide file tree
Showing 8 changed files with 557 additions and 41 deletions.
51 changes: 51 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/trigram_indexes
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,54 @@ INSERT INTO pkt VALUES (1, 'abcd'), (2, 'bcde')

statement error primary key column b cannot be present in an inverted index
ALTER TABLE pkt ALTER PRIMARY KEY USING COLUMNS (b)

# Ensure that it's okay to perform an inverted filter on a table with a trigram
# inverted index that only has a forward statistic collected on the inverted
# column.

statement ok
CREATE TABLE b (a) AS SELECT encode(set_byte('foobar ',1,g), 'escape') || g::text FROM generate_series(1,1000) g(g)

statement ok
ANALYZE b

statement ok
CREATE INVERTED INDEX ON b(a gin_trgm_ops)

query T rowsort
SELECT * FROM b WHERE a LIKE '%foo%'
----
foobar 111
foobar 367
foobar 623
foobar 879

# Ensure that scans still work after we re-analyze.

statement ok
ANALYZE b

query T rowsort
SELECT * FROM b WHERE a LIKE '%foo%'
----
foobar 111
foobar 367
foobar 623
foobar 879

statement ok
CREATE INDEX on b(a);
ANALYZE b

query T rowsort
SELECT * FROM b WHERE a LIKE '%foo%'
----
foobar 111
foobar 367
foobar 623
foobar 879

query T
SELECT * FROM b WHERE a = 'foobar 367'
----
foobar 367
5 changes: 5 additions & 0 deletions pkg/sql/opt/cat/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/types"
)

// Table is an interface to a database table, exposing only the information
Expand Down Expand Up @@ -196,6 +197,10 @@ type TableStatistic interface {
// and it represents the distribution of values for that column.
// See HistogramBucket for more details.
Histogram() []HistogramBucket

// HistogramType returns the type that the histogram was created on. For
// inverted index histograms, this will always return types.Bytes.
HistogramType() *types.T
}

// HistogramBucket contains the data for a single histogram bucket. Note
Expand Down
119 changes: 79 additions & 40 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/json"
Expand Down Expand Up @@ -616,15 +617,17 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
// stats from a statistic's column to any associated inverted columns.
// TODO(mgartner): It might be simpler to iterate over all the table columns
// looking for inverted columns.
invertedIndexCols := make(map[int][]int)
invertedIndexCols := make(map[int]invertedIndexColInfo)
for indexI, indexN := 0, tab.IndexCount(); indexI < indexN; indexI++ {
index := tab.Index(indexI)
if !index.IsInverted() {
continue
}
col := index.InvertedColumn()
srcOrd := col.InvertedSourceColumnOrdinal()
invertedIndexCols[srcOrd] = append(invertedIndexCols[srcOrd], col.Ordinal())
info := invertedIndexCols[srcOrd]
info.invIdxColOrds = append(info.invIdxColOrds, col.Ordinal())
invertedIndexCols[srcOrd] = info
}

// Make now and annotate the metadata table with it for next time.
Expand Down Expand Up @@ -656,49 +659,47 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
cols.Add(tabID.ColumnID(stat.ColumnOrdinal(i)))
}

if colStat, ok := stats.ColStats.Add(cols); ok {
needHistogram := cols.Len() == 1 && stat.Histogram() != nil &&
sb.evalCtx.SessionData().OptimizerUseHistograms
seenInvertedStat := false
invertedStatistic := false
var invertedColOrds []int
if needHistogram {
info := invertedIndexCols[stat.ColumnOrdinal(0)]
invertedColOrds = info.invIdxColOrds
seenInvertedStat = info.foundInvertedHistogram
// If some of the columns are inverted and the statistics is of type
// BYTES, it means we have an inverted statistic on this column set.
invertedStatistic = len(invertedColOrds) > 0 && stat.HistogramType().Family() == types.BytesFamily
}

colStat, ok := stats.ColStats.Add(cols)
if ok || (colStat.Histogram == nil && !invertedStatistic && seenInvertedStat) {
// Set the statistic if either:
// 1. We have no statistic for the current colset at all
// 2. All of the following conditions hold:
// a. The previously found statistic for the colset has no histogram
// b. the current statistic is not inverted
// c. the previously found statistic for this colset was inverted
// If these conditions hold, it means that the previous histogram
// we found for the current colset was derived from an inverted
// histogram, and therefore the existing forward statistic doesn't have
// a histogram at all, and the new statistic we just found has a
// non-inverted histogram that we should be using instead.
colStat.DistinctCount = float64(stat.DistinctCount())
colStat.NullCount = float64(stat.NullCount())
colStat.AvgSize = float64(stat.AvgSize())
if cols.Len() == 1 && stat.Histogram() != nil &&
sb.evalCtx.SessionData().OptimizerUseHistograms {
col, _ := cols.Next(0)

// If this column is invertable, the histogram describes the inverted index
// entries, and we need to create a new stat for it, and not apply a histogram
// to the source column.
invertedColOrds := invertedIndexCols[stat.ColumnOrdinal(0)]
if len(invertedColOrds) == 0 {
colStat.Histogram = &props.Histogram{}
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram())
} else {
for _, invertedColOrd := range invertedColOrds {
invCol := tabID.ColumnID(invertedColOrd)
invCols := opt.MakeColSet(invCol)
if invColStat, ok := stats.ColStats.Add(invCols); ok {
invColStat.Histogram = &props.Histogram{}
invColStat.Histogram.Init(sb.evalCtx, invCol, stat.Histogram())
// Set inverted entry counts from the histogram. Make sure the
// distinct count is at least 1, for the same reason as the row
// count above.
invColStat.DistinctCount = max(invColStat.Histogram.DistinctValuesCount(), 1)
// Inverted indexes don't have nulls.
invColStat.NullCount = 0
if stat.AvgSize() == 0 {
invColStat.AvgSize = defaultColSize
} else {
invColStat.AvgSize = float64(stat.AvgSize())
}
}
}
}
if needHistogram && !invertedStatistic {
// A statistic is inverted if the column is invertible and its
// histogram contains buckets of types BYTES.
// NOTE: this leaves an ambiguity which would surface if we ever
// permitted an inverted index on BYTES-type columns. A deeper fix
// is tracked here: https://github.com/cockroachdb/cockroach/issues/50655
col := cols.SingleColumn()
colStat.Histogram = &props.Histogram{}
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram())
}

// Fetch the colStat again since it may now have a different address due
// to calling stats.ColStats.Add() on any inverted column statistics
// created above.
colStat, _ = stats.ColStats.Lookup(cols)

// Make sure the distinct count is at least 1, for the same reason as
// the row count above.
colStat.DistinctCount = max(colStat.DistinctCount, 1)
Expand All @@ -708,12 +709,50 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
// count).
sb.finalizeFromRowCountAndDistinctCounts(colStat, stats)
}

// Add inverted histograms if necessary.
if needHistogram && invertedStatistic {
// Record the fact that we are adding an inverted statistic to this
// column set.
info := invertedIndexCols[stat.ColumnOrdinal(0)]
info.foundInvertedHistogram = true
invertedIndexCols[stat.ColumnOrdinal(0)] = info
for _, invertedColOrd := range invertedColOrds {
invCol := tabID.ColumnID(invertedColOrd)
invCols := opt.MakeColSet(invCol)
if invColStat, ok := stats.ColStats.Add(invCols); ok {
invColStat.Histogram = &props.Histogram{}
invColStat.Histogram.Init(sb.evalCtx, invCol, stat.Histogram())
// Set inverted entry counts from the histogram. Make sure the
// distinct count is at least 1, for the same reason as the row
// count above.
invColStat.DistinctCount = max(invColStat.Histogram.DistinctValuesCount(), 1)
// Inverted indexes don't have nulls.
invColStat.NullCount = 0
if stat.AvgSize() == 0 {
invColStat.AvgSize = defaultColSize
} else {
invColStat.AvgSize = float64(stat.AvgSize())
}
}
}
}
}
}
sb.md.SetTableAnnotation(tabID, statsAnnID, stats)
return stats
}

// invertedIndexColInfo is used to store information about an inverted column.
type invertedIndexColInfo struct {
// invIdxColOrds is the list of inverted index column ordinals for a given
// inverted column.
invIdxColOrds []int
// foundInvertedHistogram is set to true if we've found an inverted histogram
// for a given inverted column.
foundInvertedHistogram bool
}

func (sb *statisticsBuilder) colStatTable(
tabID opt.TableID, colSet opt.ColSet,
) *props.ColumnStatistic {
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/opt/memo/testdata/stats/inverted-geo
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,7 @@ ALTER TABLE t62289 INJECT STATISTICS e'[
"upper_bound": "0102000020E61000000300000005D8E086BB6365C03F9E5737DD1A53C0C04ECDED673B55C06711C00C7C0240C0B8EABD96072856404A9D2C529FC74EC0"
}
],
"histo_col_type": "GEOGRAPHY",
"histo_col_type": "BYTES",
"name": "__auto__",
"null_count": 0,
"row_count": 0
Expand Down
Loading

0 comments on commit 963deb8

Please sign in to comment.