From 0f1ce331f19d9bd9fa25ce8c8b0bb36562231f73 Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Fri, 8 Apr 2022 21:07:52 -0400 Subject: [PATCH 1/4] sql: implement trigram inverted indexes This commit implements trigram inverted indexes that are searchable using the =, LIKE, ILIKE and % (similarity) operators. This is a partial implementation of the gin_trgm_ops opclass from Postgres. gin_trgm_ops also supports regex matches in Postgres. Release note (sql change): string columns now support inverted indexes using trigrams. These indexes can be searched using the =, LIKE, ILIKE and % (similarity) predicates. --- pkg/internal/sqlsmith/alter.go | 2 +- pkg/sql/catalog/colinfo/col_type_info.go | 24 +- pkg/sql/create_stats.go | 6 +- .../testdata/logic_test/trigram_indexes | 109 +++++++ .../exec/execbuilder/testdata/trigram_index | 272 ++++++++++++++++++ pkg/sql/opt/invertedidx/BUILD.bazel | 2 + .../opt/invertedidx/inverted_index_expr.go | 18 +- pkg/sql/opt/invertedidx/trigram.go | 103 +++++++ pkg/sql/opt/memo/statistics_builder.go | 2 +- pkg/sql/randgen/mutator.go | 4 +- pkg/sql/randgen/schema.go | 2 +- pkg/sql/rowenc/BUILD.bazel | 3 + pkg/sql/rowenc/index_encoding.go | 83 ++++++ pkg/sql/rowenc/index_encoding_test.go | 204 +++++++++++++ pkg/util/trigram/trigram.go | 4 + 15 files changed, 821 insertions(+), 17 deletions(-) create mode 100644 pkg/sql/logictest/testdata/logic_test/trigram_indexes create mode 100644 pkg/sql/opt/exec/execbuilder/testdata/trigram_index create mode 100644 pkg/sql/opt/invertedidx/trigram.go diff --git a/pkg/internal/sqlsmith/alter.go b/pkg/internal/sqlsmith/alter.go index 5d4713b39cc4..322726f567b8 100644 --- a/pkg/internal/sqlsmith/alter.go +++ b/pkg/internal/sqlsmith/alter.go @@ -317,7 +317,7 @@ func makeCreateIndex(s *Smither) (tree.Statement, bool) { seen[col.Name] = true // If this is the first column and it's invertible (i.e., JSONB), make an inverted index. if len(cols) == 0 && - colinfo.ColumnTypeIsInvertedIndexable(tree.MustBeStaticallyKnownType(col.Type)) { + colinfo.ColumnTypeIsOnlyInvertedIndexable(tree.MustBeStaticallyKnownType(col.Type)) { inverted = true unique = false cols = append(cols, tree.IndexElem{ diff --git a/pkg/sql/catalog/colinfo/col_type_info.go b/pkg/sql/catalog/colinfo/col_type_info.go index 74f4eaf5d3ee..4fe54bc7c2c8 100644 --- a/pkg/sql/catalog/colinfo/col_type_info.go +++ b/pkg/sql/catalog/colinfo/col_type_info.go @@ -123,18 +123,34 @@ func ColumnTypeIsIndexable(t *types.T) bool { } // Some inverted index types also have a key encoding, but we don't // want to support those yet. See #50659. - return !MustBeValueEncoded(t) && !ColumnTypeIsInvertedIndexable(t) + return !MustBeValueEncoded(t) && !ColumnTypeIsOnlyInvertedIndexable(t) } // ColumnTypeIsInvertedIndexable returns whether the type t is valid to be indexed // using an inverted index. func ColumnTypeIsInvertedIndexable(t *types.T) bool { + switch t.Family() { + case types.StringFamily: + return true + } + return ColumnTypeIsOnlyInvertedIndexable(t) +} + +// ColumnTypeIsOnlyInvertedIndexable returns true if the type t is only +// indexable via an inverted index. +func ColumnTypeIsOnlyInvertedIndexable(t *types.T) bool { if t.IsAmbiguous() || t.Family() == types.TupleFamily { return false } - family := t.Family() - return family == types.JsonFamily || family == types.ArrayFamily || - family == types.GeographyFamily || family == types.GeometryFamily + switch t.Family() { + case types.JsonFamily: + case types.ArrayFamily: + case types.GeographyFamily: + case types.GeometryFamily: + default: + return false + } + return true } // MustBeValueEncoded returns true if columns of the given kind can only be value diff --git a/pkg/sql/create_stats.go b/pkg/sql/create_stats.go index 6c9ef4b34ebd..ceb3f600228c 100644 --- a/pkg/sql/create_stats.go +++ b/pkg/sql/create_stats.go @@ -287,7 +287,7 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro if err != nil { return nil, err } - isInvIndex := colinfo.ColumnTypeIsInvertedIndexable(col.GetType()) + isInvIndex := colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType()) colStats = []jobspb.CreateStatsDetails_ColStat{{ ColumnIDs: columnIDs, // By default, create histograms on all explicitly requested column stats @@ -524,7 +524,7 @@ func createStatsDefaultColumns( if err != nil { return nil, err } - isInverted := colinfo.ColumnTypeIsInvertedIndexable(col.GetType()) + isInverted := colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType()) if err := addIndexColumnStatsIfNotExists(colID, isInverted); err != nil { return nil, err } @@ -558,7 +558,7 @@ func createStatsDefaultColumns( } colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{ ColumnIDs: colList, - HasHistogram: !colinfo.ColumnTypeIsInvertedIndexable(col.GetType()), + HasHistogram: !colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType()), HistogramMaxBuckets: maxHistBuckets, }) nonIdxCols++ diff --git a/pkg/sql/logictest/testdata/logic_test/trigram_indexes b/pkg/sql/logictest/testdata/logic_test/trigram_indexes new file mode 100644 index 000000000000..b77aaaf6a0b8 --- /dev/null +++ b/pkg/sql/logictest/testdata/logic_test/trigram_indexes @@ -0,0 +1,109 @@ +statement ok +CREATE TABLE a (a INT PRIMARY KEY, t TEXT) + +statement ok +CREATE INVERTED INDEX ON a(t) + +statement ok +INSERT INTO a VALUES (1, 'foozoopa'), + (2, 'Foo'), + (3, 'blah') + +query IT +SELECT * FROM a@a_t_idx WHERE t ILIKE '%Foo%' +---- +1 foozoopa +2 Foo + +query IT +SELECT * FROM a@a_t_idx WHERE t LIKE '%Foo%' +---- +2 Foo + +query IT +SELECT * FROM a@a_t_idx WHERE t LIKE 'Foo%' +---- +2 Foo + +query IT +SELECT * FROM a@a_t_idx WHERE t LIKE '%Foo' +---- +2 Foo + +query IT +SELECT * FROM a@a_t_idx WHERE t LIKE '%foo%oop%' +---- +1 foozoopa + +query IT +SELECT * FROM a@a_t_idx WHERE t LIKE '%fooz%' +---- +1 foozoopa + +query IT +SELECT * FROM a@a_t_idx WHERE t LIKE '%foo%oop' +---- + +query IT +SELECT * FROM a@a_t_idx WHERE t LIKE 'zoo' +---- + +query IT +SELECT * FROM a@a_t_idx WHERE t LIKE '%foo%oop%' OR t ILIKE 'blah' ORDER BY a +---- +1 foozoopa +3 blah + +query IT +SELECT * FROM a@a_t_idx WHERE t LIKE 'blahf' +---- + +query IT +SELECT * FROM a@a_t_idx WHERE t LIKE 'fblah' +---- + +# Test the acceleration of the % similarity operator. +# By default, the threshold for searching is .3. +query FIT +SELECT similarity(t, 'blar'), * FROM a@a_t_idx WHERE t % 'blar' +---- +0.428571428571429 3 blah + +query FIT +SELECT similarity(t, 'byar'), * FROM a@a_t_idx WHERE t % 'byar' +---- + +query FIT +SELECT similarity(t, 'fooz'), * FROM a@a_t_idx WHERE t % 'fooz' ORDER BY a +---- +0.4 1 foozoopa +0.5 2 Foo + +statement ok +SET pg_trgm.similarity_threshold=.45 + +query FIT +SELECT similarity(t, 'fooz'), * FROM a@a_t_idx WHERE t % 'fooz' +---- +0.5 2 Foo + +# Test the acceleration of the equality operator. +query IT +SELECT * FROM a@a_t_idx WHERE t = 'Foo' +---- +2 Foo + +query IT +SELECT * FROM a@a_t_idx WHERE t = 'foo' +---- + +query IT +SELECT * FROM a@a_t_idx WHERE t = 'foozoopa' +---- +1 foozoopa + +query IT +SELECT * FROM a@a_t_idx WHERE t = 'foozoopa' OR t = 'Foo' ORDER BY a +---- +1 foozoopa +2 Foo diff --git a/pkg/sql/opt/exec/execbuilder/testdata/trigram_index b/pkg/sql/opt/exec/execbuilder/testdata/trigram_index new file mode 100644 index 000000000000..9dc58ff19220 --- /dev/null +++ b/pkg/sql/opt/exec/execbuilder/testdata/trigram_index @@ -0,0 +1,272 @@ +# LogicTest: local + +statement ok +CREATE TABLE a ( + a INT PRIMARY KEY, + b TEXT, + FAMILY (a,b), + INVERTED INDEX(b) +) + +query T +EXPLAIN SELECT * FROM a WHERE b LIKE '%foo%' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b LIKE '%foo%' +│ +└── • index join + │ table: a@a_pkey + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +query T +EXPLAIN SELECT * FROM a WHERE b ILIKE '%foo%' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b ILIKE '%foo%' +│ +└── • index join + │ table: a@a_pkey + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +query T +EXPLAIN SELECT * FROM a WHERE b LIKE '%foo%' OR b ILIKE '%bar%' +---- +distribution: local +vectorized: true +· +• filter +│ filter: (b LIKE '%foo%') OR (b ILIKE '%bar%') +│ +└── • index join + │ table: a@a_pkey + │ + └── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +query T +EXPLAIN SELECT * FROM a WHERE b LIKE '%foo%' OR b ILIKE '%bar%' +---- +distribution: local +vectorized: true +· +• filter +│ filter: (b LIKE '%foo%') OR (b ILIKE '%bar%') +│ +└── • index join + │ table: a@a_pkey + │ + └── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +query T +EXPLAIN SELECT * FROM a WHERE b LIKE '%foo%zoo%' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b LIKE '%foo%zoo%' +│ +└── • index join + │ table: a@a_pkey + │ + └── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +# Test that trigram index can't support searches with fewer than 3 characters. +query T +EXPLAIN SELECT * FROM a WHERE b LIKE '%fo' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b LIKE '%fo' +│ +└── • scan + missing stats + table: a@a_pkey + spans: FULL SCAN + +# Test that trigram indexes can't support searches with no constant args. +# columns. +query T +EXPLAIN SELECT * FROM a WHERE b LIKE b +---- +distribution: local +vectorized: true +· +• filter +│ filter: b LIKE b +│ +└── • scan + missing stats + table: a@a_pkey + spans: FULL SCAN + +# Test that trigram indexes accelerate the % operator. +query T +EXPLAIN SELECT * FROM a WHERE b % 'foo' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b % 'foo' +│ +└── • index join + │ table: a@a_pkey + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +# Test that trigram indexes accelerate the % operator with an OR if the +# constant has more than one trigram. +query T +EXPLAIN SELECT * FROM a WHERE b % 'foob' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b % 'foob' +│ +└── • index join + │ table: a@a_pkey + │ + └── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +# Test that trigram indexes can't accelerate the % operator if there are fewer +# than 3 characters in the constant. +query T +EXPLAIN SELECT * FROM a WHERE b % 'fo' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b % 'fo' +│ +└── • scan + missing stats + table: a@a_pkey + spans: FULL SCAN + +# Test that trigram indexes can accelerate the % operator in reverse order. +query T +EXPLAIN SELECT * FROM a WHERE 'blah' % b +---- +distribution: local +vectorized: true +· +• filter +│ filter: 'blah' % b +│ +└── • index join + │ table: a@a_pkey + │ + └── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +# Test that trigram indexes can't accelerate the % operator with no constant +# columns. +query T +EXPLAIN SELECT * FROM a WHERE b % b +---- +distribution: local +vectorized: true +· +• filter +│ filter: b % b +│ +└── • scan + missing stats + table: a@a_pkey + spans: FULL SCAN + + +# Test that trigram indexes can accelerate the equality operator. +query T +EXPLAIN SELECT * FROM a WHERE b = 'foobar' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b = 'foobar' +│ +└── • index join + │ table: a@a_pkey + │ + └── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 4 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 4 spans + +query T +EXPLAIN SELECT * FROM a WHERE b = 'foo' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b = 'foo' +│ +└── • index join + │ table: a@a_pkey + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 1 span diff --git a/pkg/sql/opt/invertedidx/BUILD.bazel b/pkg/sql/opt/invertedidx/BUILD.bazel index faeabb48150b..869a1bc67ef4 100644 --- a/pkg/sql/opt/invertedidx/BUILD.bazel +++ b/pkg/sql/opt/invertedidx/BUILD.bazel @@ -6,6 +6,7 @@ go_library( "geo.go", "inverted_index_expr.go", "json_array.go", + "trigram.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx", visibility = ["//visibility:public"], @@ -46,6 +47,7 @@ go_test( srcs = [ "geo_test.go", "json_array_test.go", + "trigram_test.go", ], deps = [ ":invertedidx", diff --git a/pkg/sql/opt/invertedidx/inverted_index_expr.go b/pkg/sql/opt/invertedidx/inverted_index_expr.go index 7d13a9f5940b..54e7fcc7fabf 100644 --- a/pkg/sql/opt/invertedidx/inverted_index_expr.go +++ b/pkg/sql/opt/invertedidx/inverted_index_expr.go @@ -109,13 +109,21 @@ func TryFilterInvertedIndex( } typ = types.Geometry } else { - filterPlanner = &jsonOrArrayFilterPlanner{ - tabID: tabID, - index: index, - computedColumns: computedColumns, - } col := index.InvertedColumn().InvertedSourceColumnOrdinal() typ = factory.Metadata().Table(tabID).Column(col).DatumType() + if typ.Family() == types.StringFamily { + filterPlanner = &trigramFilterPlanner{ + tabID: tabID, + index: index, + computedColumns: computedColumns, + } + } else { + filterPlanner = &jsonOrArrayFilterPlanner{ + tabID: tabID, + index: index, + computedColumns: computedColumns, + } + } } var invertedExpr inverted.Expression diff --git a/pkg/sql/opt/invertedidx/trigram.go b/pkg/sql/opt/invertedidx/trigram.go new file mode 100644 index 000000000000..756aff4e96d4 --- /dev/null +++ b/pkg/sql/opt/invertedidx/trigram.go @@ -0,0 +1,103 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package invertedidx + +import ( + "github.com/cockroachdb/cockroach/pkg/sql/inverted" + "github.com/cockroachdb/cockroach/pkg/sql/opt" + "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" + "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedexpr" + "github.com/cockroachdb/cockroach/pkg/sql/opt/memo" + "github.com/cockroachdb/cockroach/pkg/sql/rowenc" + "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/types" + "github.com/cockroachdb/errors" +) + +type trigramFilterPlanner struct { + tabID opt.TableID + index cat.Index + computedColumns map[opt.ColumnID]opt.ScalarExpr +} + +var _ invertedFilterPlanner = &trigramFilterPlanner{} + +// extractInvertedFilterConditionFromLeaf implements the invertedFilterPlanner +// interface. +func (t *trigramFilterPlanner) extractInvertedFilterConditionFromLeaf( + _ *eval.Context, expr opt.ScalarExpr, +) ( + invertedExpr inverted.Expression, + remainingFilters opt.ScalarExpr, + _ *invertedexpr.PreFiltererStateForInvertedFilterer, +) { + var constantVal opt.ScalarExpr + var left, right opt.ScalarExpr + var allMustMatch bool + switch e := expr.(type) { + // Both ILIKE and LIKE are supported because the index entries are always + // downcased. We re-check the condition no matter what later. + case *memo.ILikeExpr: + left, right = e.Left, e.Right + // If we're doing a LIKE (or ILIKE) expression, we need to construct an AND + // out of all of the spans: we need to find results that match every single + // one of the trigrams in the constant datum. + allMustMatch = true + case *memo.LikeExpr: + left, right = e.Left, e.Right + allMustMatch = true + case *memo.EqExpr: + left, right = e.Left, e.Right + allMustMatch = true + case *memo.ModExpr: + // If we're doing a % expression (similarity threshold), we need to + // construct an OR out of the spans: we need to find results that match any + // of the trigrams in the constant datum, and we'll filter the results + // further afterwards. + left, right = e.Left, e.Right + allMustMatch = false + default: + // Only the above types are supported. + return inverted.NonInvertedColExpression{}, expr, nil + } + if isIndexColumn(t.tabID, t.index, left, t.computedColumns) && memo.CanExtractConstDatum(right) { + constantVal = right + } else if isIndexColumn(t.tabID, t.index, right, t.computedColumns) && memo.CanExtractConstDatum(left) { + constantVal = left + } else { + // Can only accelerate with a single constant value. + return inverted.NonInvertedColExpression{}, expr, nil + } + d := memo.ExtractConstDatum(constantVal) + if d.ResolvedType() != types.String { + panic(errors.AssertionFailedf( + "trying to apply inverted index to unsupported type %s", d.ResolvedType(), + )) + } + s := string(*d.(*tree.DString)) + var err error + invertedExpr, err = rowenc.EncodeTrigramSpans(s, allMustMatch) + if err != nil { + // An inverted expression could not be extracted. + return inverted.NonInvertedColExpression{}, expr, nil + } + + // If the extracted inverted expression is not tight then remaining filters + // must be applied after the inverted index scan. + if !invertedExpr.IsTight() { + remainingFilters = expr + } + + // We do not currently support pre-filtering for JSON and Array indexes, so + // the returned pre-filter state is nil. + return invertedExpr, remainingFilters, nil +} diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go index 603f2c36c878..88166afb23c7 100644 --- a/pkg/sql/opt/memo/statistics_builder.go +++ b/pkg/sql/opt/memo/statistics_builder.go @@ -4583,7 +4583,7 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints( var hasNullValue, ok bool var values tree.Datums var distinctVals uint64 - invertedIndexableColumnType := colinfo.ColumnTypeIsInvertedIndexable(colType) + invertedIndexableColumnType := colinfo.ColumnTypeIsOnlyInvertedIndexable(colType) if distinctVals, ok = filterConstraint.CalculateMaxResults(sb.evalCtx, cols, cols); ok { // If the number of values is excessive, don't spend too much time building the histogram, // as it may slow down the query. diff --git a/pkg/sql/randgen/mutator.go b/pkg/sql/randgen/mutator.go index af87a30d658a..1ca9a6a79239 100644 --- a/pkg/sql/randgen/mutator.go +++ b/pkg/sql/randgen/mutator.go @@ -287,7 +287,7 @@ func statisticsMutator( // bounds are byte-encoded inverted index keys. func randHistogram(rng *rand.Rand, colType *types.T) stats.HistogramData { histogramColType := colType - if colinfo.ColumnTypeIsInvertedIndexable(colType) { + if colinfo.ColumnTypeIsOnlyInvertedIndexable(colType) { histogramColType = types.Bytes } h := stats.HistogramData{ @@ -298,7 +298,7 @@ func randHistogram(rng *rand.Rand, colType *types.T) stats.HistogramData { var encodedUpperBounds [][]byte for i, numDatums := 0, rng.Intn(10); i < numDatums; i++ { upper := RandDatum(rng, colType, false /* nullOk */) - if colinfo.ColumnTypeIsInvertedIndexable(colType) { + if colinfo.ColumnTypeIsOnlyInvertedIndexable(colType) { encs := encodeInvertedIndexHistogramUpperBounds(colType, upper) encodedUpperBounds = append(encodedUpperBounds, encs...) } else { diff --git a/pkg/sql/randgen/schema.go b/pkg/sql/randgen/schema.go index ca6f29cf71c0..266f787019e3 100644 --- a/pkg/sql/randgen/schema.go +++ b/pkg/sql/randgen/schema.go @@ -491,7 +491,7 @@ func randIndexTableDefFromCols( // The last index column can be inverted-indexable, which makes the // index an inverted index. - if colinfo.ColumnTypeIsInvertedIndexable(semType) { + if colinfo.ColumnTypeIsOnlyInvertedIndexable(semType) { def.Inverted = true stopPrefix = true } diff --git a/pkg/sql/rowenc/BUILD.bazel b/pkg/sql/rowenc/BUILD.bazel index e17e093e7a63..0e006b40d8ab 100644 --- a/pkg/sql/rowenc/BUILD.bazel +++ b/pkg/sql/rowenc/BUILD.bazel @@ -34,6 +34,7 @@ go_library( "//pkg/util/json", "//pkg/util/mon", "//pkg/util/protoutil", + "//pkg/util/trigram", "//pkg/util/unique", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_redact//:redact", @@ -67,6 +68,7 @@ go_test( "//pkg/sql/catalog/desctestutils", "//pkg/sql/catalog/tabledesc", "//pkg/sql/inverted", + "//pkg/sql/parser", "//pkg/sql/randgen", "//pkg/sql/rowenc/valueside", "//pkg/sql/sem/eval", @@ -78,6 +80,7 @@ go_test( "//pkg/util/json", "//pkg/util/leaktest", "//pkg/util/randutil", + "//pkg/util/trigram", "//pkg/util/uuid", "@com_github_cockroachdb_apd_v3//:apd", "@com_github_cockroachdb_datadriven//:datadriven", diff --git a/pkg/sql/rowenc/index_encoding.go b/pkg/sql/rowenc/index_encoding.go index c03bc2f007d8..1e80a26deeed 100644 --- a/pkg/sql/rowenc/index_encoding.go +++ b/pkg/sql/rowenc/index_encoding.go @@ -34,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/json" "github.com/cockroachdb/cockroach/pkg/util/mon" "github.com/cockroachdb/cockroach/pkg/util/protoutil" + "github.com/cockroachdb/cockroach/pkg/util/trigram" "github.com/cockroachdb/cockroach/pkg/util/unique" "github.com/cockroachdb/errors" ) @@ -578,6 +579,13 @@ func EncodeInvertedIndexTableKeys( return json.EncodeInvertedIndexKeys(inKey, val.(*tree.DJSON).JSON) case types.ArrayFamily: return encodeArrayInvertedIndexTableKeys(val.(*tree.DArray), inKey, version, false /* excludeNulls */) + case types.StringFamily: + // TODO(jordan): Right now, this is just trigram inverted indexes. What if + // we want to support different types of inverted indexes on strings? We'll + // need to pass in the index type to this function. + // We pad the keys when writing them to the index. + // TODO(jordan): why are we doing this padding at all? Postgres does it. + return encodeTrigramInvertedIndexTableKeys(string(*val.(*tree.DString)), inKey, version, true /* pad */) } return nil, errors.AssertionFailedf("trying to apply inverted index to unsupported type %s", datum.ResolvedType()) } @@ -881,6 +889,59 @@ func encodeOverlapsArrayInvertedIndexSpans( return invertedExpr, nil } +// EncodeTrigramSpans returns the spans that must be scanned to look up trigrams +// present in the input string. If allMustMatch is true, the resultant inverted +// expression must match every trigram in the input. Otherwise, it will match +// any trigram in the input. +func EncodeTrigramSpans(s string, allMustMatch bool) (inverted.Expression, error) { + // We do not pad the trigrams when searching the index. To see why, observe + // the keys that we insert for a string "zfooz": + // + // " z", " zf", "zfo", "foo", "foz", "oz " + // + // If we were then searching for the string %foo%, and we padded the output + // keys as well, we'd be searching for the key " f", which doesn't exist + // in the index for zfooz, even though zfooz is like %foo%. + keys, err := encodeTrigramInvertedIndexTableKeys(s, nil, /* inKey */ + descpb.LatestIndexDescriptorVersion, false /* pad */) + if err != nil { + return nil, err + } + if len(keys) == 0 { + return nil, errors.New("no trigrams available to search with") + } + + var ret inverted.Expression + for _, key := range keys { + spanExpr := inverted.ExprForSpan(inverted.MakeSingleValSpan(key), false /* tight */) + if ret == nil { + // The first trigram (and only the first trigram) is unique. + // TODO(jordan): we *could* make this first expression tight if we knew + // for sure that the expression is something like `LIKE '%foo%'`. In this + // case, we're sure that the returned row will pass the predicate because + // the LIKE operator has wildcards on either side of the trigram. But + // this is such a marginal case that it doesn't seem worth it to plumb + // in this special case. For all other single-trigram cases, such as + // `LIKE '%foo'` or `= 'foo'`, we don't have a tight span. + spanExpr.Unique = true + ret = spanExpr + } else { + // As soon as we have more than one trigram to search for, we no longer + // have a unique expression, since two separate trigrams could both + // point at a single row. We also no longer have a tight expression, + // because the trigrams that we're checking don't necessarily have to + // be in the right order within the string to guarantee that just because + // both trigrams match, the strings pass the LIKE or % test. + if allMustMatch { + ret = inverted.And(ret, spanExpr) + } else { + ret = inverted.Or(ret, spanExpr) + } + } + } + return ret, nil +} + // EncodeGeoInvertedIndexTableKeys is the equivalent of EncodeInvertedIndexTableKeys // for Geography and Geometry. func EncodeGeoInvertedIndexTableKeys( @@ -929,6 +990,28 @@ func encodeGeoKeys( return keys, nil } +// encodeTrigramInvertedIndexTableKeys produces the trigram index table keys for +// an input string. If pad is true, the returned table keys will include 3 extra +// trigrams produced by padding the string with 2 spaces at the front and 1 at +// the end. +func encodeTrigramInvertedIndexTableKeys( + val string, inKey []byte, _ descpb.IndexDescriptorVersion, pad bool, +) ([][]byte, error) { + trigrams := trigram.MakeTrigrams(val, pad) + outKeys := make([][]byte, len(trigrams)) + for i := range trigrams { + // Make sure to copy inKey into a new byte slice to avoid aliasing. + inKeyLen := len(inKey) + // Pre-size the outkey - we know we're going to encode the trigram plus 2 + // extra bytes for the prefix and terminator. + outKey := make([]byte, inKeyLen, inKeyLen+len(trigrams[i])+2) + copy(outKey, inKey) + newKey := encoding.EncodeStringAscending(outKey, trigrams[i]) + outKeys[i] = newKey + } + return outKeys, nil +} + // EncodePrimaryIndex constructs a list of k/v pairs for a // row encoded as a primary index. This function mirrors the encoding // logic in prepareInsertOrUpdateBatch in pkg/sql/row/writer.go. diff --git a/pkg/sql/rowenc/index_encoding_test.go b/pkg/sql/rowenc/index_encoding_test.go index 45b70847aadc..fdf6527c9eea 100644 --- a/pkg/sql/rowenc/index_encoding_test.go +++ b/pkg/sql/rowenc/index_encoding_test.go @@ -14,6 +14,7 @@ import ( "bytes" "context" "fmt" + "sort" "testing" "github.com/cockroachdb/cockroach/pkg/keys" @@ -25,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog/tabledesc" "github.com/cockroachdb/cockroach/pkg/sql/inverted" + "github.com/cockroachdb/cockroach/pkg/sql/parser" "github.com/cockroachdb/cockroach/pkg/sql/randgen" . "github.com/cockroachdb/cockroach/pkg/sql/rowenc" "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" @@ -32,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/types" "github.com/cockroachdb/cockroach/pkg/util/json" "github.com/cockroachdb/cockroach/pkg/util/randutil" + "github.com/cockroachdb/cockroach/pkg/util/trigram" "github.com/stretchr/testify/require" ) @@ -891,3 +894,204 @@ func containsNonNullUniqueElement(ctx *eval.Context, valArr *tree.DArray) bool { } return lastVal != tree.DNull } + +type trigramSearchType int + +const ( + like trigramSearchType = iota + similar + eq +) + +func TestEncodeTrigramInvertedIndexSpans(t *testing.T) { + testCases := []struct { + // The value that's being indexed in the trigram index. + indexedValue string + // The value that's being turned into spans to search with. + value string + // Whether we're using LIKE or % operator for the search. + searchType trigramSearchType + // Whether we expect that the spans should contain all of the keys produced + // by indexing the indexedValue. + containsKeys bool + // Whether we expect that the indexed value should evaluate as matching + // the LIKE or % expression that we're testing. + expected bool + unique bool + }{ + + // This test uses EncodeInvertedIndexTableKeys and EncodeTrigramSpans + // to determine if the spans produced from the second string value will + // correctly include or exclude the first value. + + {`foobarbaz`, `%oob%baz`, like, true, true, false}, + {`foobarbaz`, `%oob%`, like, true, true, true}, + // Test that the order of the trigrams doesn't matter for containment, but + // does matter for evaluation. + {`staticcheck`, `%check%static%`, like, true, false, false}, + // Make sure that we can satisfy a query that includes a chunk that is too + // short to produce any trigrams at all. + {`test`, `%a%bar`, like, false, false, true}, + + // "Reverse order" trigrams shouldn't match. + {`test`, `tse`, like, false, false, true}, + + // Similarity (%) queries. + {`staticcheck`, `staricheck`, similar, true, true, false}, + {`staticcheck`, `blevicchlrk`, similar, true, false, false}, + {`staticcheck`, `che`, similar, true, false, true}, + {`staticcheck`, `xxx`, similar, false, false, true}, + {`staticcheck`, `xxxyyy`, similar, false, false, false}, + + // Equality queries. + {`staticcheck`, `staticcheck`, eq, true, true, false}, + {`staticcheck`, `staticcheckz`, eq, false, false, false}, + {`staticcheck`, `zstaticcheck`, eq, false, false, false}, + {`baba`, `abab`, eq, true, false, false}, + {`foo`, `foo`, eq, true, true, true}, + {`foo`, `bar`, eq, false, false, true}, + + {`eabc`, `eabd`, eq, false, false, false}, + } + + evalCtx := eval.MakeTestingEvalContext(cluster.MakeTestingClusterSettings()) + evalCtx.SessionData().TrigramSimilarityThreshold = .3 + + runTest := func(indexedValue, value string, searchType trigramSearchType, + expectContainsKeys, expected, expectUnique bool) { + t.Logf("test case: %s %s %v %t %t %t", indexedValue, value, searchType, expectContainsKeys, expected, expectUnique) + keys, err := EncodeInvertedIndexTableKeys(tree.NewDString(indexedValue), nil, descpb.LatestIndexDescriptorVersion) + require.NoError(t, err) + + typedExpr := makeTrigramBinOp(t, indexedValue, value, searchType) + invertedExpr, err := EncodeTrigramSpans(value, searchType != similar) + require.NoError(t, err) + + spanExpr, ok := invertedExpr.(*inverted.SpanExpression) + if !ok { + t.Fatalf("invertedExpr %v is not a SpanExpression", invertedExpr) + } + + if spanExpr.Tight { + // We never expect the inverted expressions for trigrams to be tight. + t.Fatalf("unexpectedly found a tight expression") + } + require.Equal(t, expectUnique, spanExpr.Unique, "%s, %s: unexpected unique attribute", indexedValue, value) + + // Check if the indexedValue is included by the spans. + containsKeys, err := spanExpr.ContainsKeys(keys) + require.NoError(t, err) + + require.Equal(t, expectContainsKeys, containsKeys, "%s, %s: expected containsKeys", indexedValue, value) + + // Since the spans are never tight, apply an additional filter to determine + // if the result is contained. + datum, err := eval.Expr(&evalCtx, typedExpr) + require.NoError(t, err) + actual := bool(*datum.(*tree.DBool)) + require.Equal(t, expected, actual, "%s, %s: expected evaluation result to match", indexedValue, value) + } + + // Run pre-defined test cases from above. + for _, c := range testCases { + runTest(c.indexedValue, c.value, c.searchType, c.containsKeys, c.expected, c.unique) + } + + // Run some random test cases. + + rng, _ := randutil.NewTestRand() + for i := 0; i < 100; i++ { + const alphabet = "abcdefg" + + // Generate two random strings and evaluate left % right, left LIKE right, + // and left = right both via eval and via the span comparisons. + left := randgen.RandString(rng, 15, alphabet) + length := 3 + rng.Intn(5) + right := randgen.RandString(rng, length, alphabet+"%") + + for _, searchType := range []trigramSearchType{like, eq, similar} { + expr := makeTrigramBinOp(t, left, right, searchType) + lTrigrams := trigram.MakeTrigrams(left, false /* pad */) + // Check for intersection. We're looking for a non-zero intersection + // for similar, and complete containment of the right trigrams in the left + // for eq and like. + any := false + all := true + rTrigrams := trigram.MakeTrigrams(right, false /* pad */) + for _, trigram := range rTrigrams { + idx := sort.Search(len(lTrigrams), func(i int) bool { + return lTrigrams[i] >= trigram + }) + if idx < len(lTrigrams) && lTrigrams[idx] == trigram { + any = true + } else { + all = false + } + } + var expectedContainsKeys bool + if searchType == similar { + expectedContainsKeys = any + } else { + expectedContainsKeys = all + } + + d, err := eval.Expr(&evalCtx, expr) + require.NoError(t, err) + expected := bool(*d.(*tree.DBool)) + trigrams := trigram.MakeTrigrams(right, false /* pad */) + nTrigrams := len(trigrams) + valid := nTrigrams > 0 + unique := nTrigrams == 1 + if !valid { + _, err := EncodeTrigramSpans(right, true /* allMustMatch */) + require.Error(t, err) + continue + } + runTest(left, right, searchType, expectedContainsKeys, expected, unique) + } + } +} + +func makeTrigramBinOp( + t *testing.T, indexedValue string, value string, searchType trigramSearchType, +) (typedExpr tree.TypedExpr) { + var opstr string + switch searchType { + case like: + opstr = "LIKE" + case eq: + opstr = "=" + case similar: + opstr = "%" + default: + panic("no such searchtype") + } + expr, err := parser.ParseExpr(fmt.Sprintf("'%s' %s '%s'", indexedValue, opstr, value)) + require.NoError(t, err) + + semaContext := tree.MakeSemaContext() + typedExpr, err = tree.TypeCheck(context.Background(), expr, &semaContext, types.Bool) + require.NoError(t, err) + return typedExpr +} + +func TestEncodeTrigramInvertedIndexSpansError(t *testing.T) { + // Make sure that any input with a chunk with fewer than 3 characters returns + // an error, since we can't produce trigrams from strings that don't meet a + // minimum of 3 characters. + inputs := []string{ + "fo", + "a", + "", + // Non-alpha characters don't count against the limit. + "fo ", + "%fo%", + "#$(*", + } + for _, input := range inputs { + _, err := EncodeTrigramSpans(input, true /* allMustMatch */) + require.Error(t, err) + _, err = EncodeTrigramSpans(input, false /* allMustMatch */) + require.Error(t, err) + } +} diff --git a/pkg/util/trigram/trigram.go b/pkg/util/trigram/trigram.go index bb41e9599ec2..e3e8de7a8e5a 100644 --- a/pkg/util/trigram/trigram.go +++ b/pkg/util/trigram/trigram.go @@ -52,6 +52,10 @@ func MakeTrigrams(s string, pad bool) []string { } } + if len(output) == 0 { + return output + } + // Sort the array and deduplicate. sort.Strings(output) From c100e371b0f5329ca7d7fe5526a6cfef35631eff Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Fri, 6 May 2022 18:16:41 -0400 Subject: [PATCH 2/4] sql: permit opclasses for inverted indexes Previously, it was not permitted to use opclass syntax for index creation. For example, the `blah_ops` in `CREATE INDEX ON t USING GIN (col blah_ops))`. Now, this syntax is legal when the opclass is supported for a given type. Release note (sql change): permit usage of jsonb_ops, array_ops, gin_trgm_ops, and gist_trgm_ops as an opclass in inverted index creation. --- pkg/sql/catalog/catpb/catalog.proto | 20 +++++ pkg/sql/catalog/descpb/structured.proto | 7 +- pkg/sql/catalog/table_elements.go | 4 + pkg/sql/catalog/tabledesc/index.go | 17 ++++ pkg/sql/catalog/tabledesc/structured.go | 24 +++--- pkg/sql/catalog/tabledesc/validate_test.go | 3 +- pkg/sql/create_index.go | 84 ++++++++++++++++--- pkg/sql/create_table.go | 12 +-- .../testdata/logic_test/create_index | 13 +++ .../testdata/logic_test/inverted_index | 26 ++++++ .../logic_test/inverted_index_geospatial | 8 ++ .../testdata/logic_test/trigram_indexes | 18 +++- .../exec/execbuilder/testdata/trigram_index | 2 +- pkg/sql/opt/memo/statistics_builder.go | 6 +- pkg/sql/parser/parse_test.go | 3 - pkg/sql/parser/sql.y | 8 +- pkg/sql/rowenc/index_encoding.go | 6 +- pkg/sql/sem/builtins/builtins.go | 8 +- pkg/sql/sem/tree/create.go | 10 +++ pkg/sql/testdata/telemetry/trigrams | 14 ---- 20 files changed, 226 insertions(+), 67 deletions(-) diff --git a/pkg/sql/catalog/catpb/catalog.proto b/pkg/sql/catalog/catpb/catalog.proto index bb5d9afaf522..5954d1f7323d 100644 --- a/pkg/sql/catalog/catpb/catalog.proto +++ b/pkg/sql/catalog/catpb/catalog.proto @@ -232,3 +232,23 @@ message AutoStatsSettings { // FractionStaleRows is table setting sql_stats_automatic_collection_fraction_stale_rows. optional double fraction_stale_rows = 3; } + +// InvertedIndexColumnKind is the kind of the inverted index on a column. The +// reason this needs to be stored is that we need to be able to check that the +// "opclass" passed into an inverted index declaration (for example, +// gin_trgm_ops) is compatible with the datatype of a particular column +// (gin_tgrm_ops is only valid on text). A future reason is that it's possible +// to desire having more than one type of inverted index on a particular +// datatype - for example, you might want to create a "stemming" inverted index +// on text. And without this extra kind, it wouldn't be possible to distinguish +// a text inverted index that uses trigrams, vs a text inverted index that uses +// stemming. +enum InvertedIndexColumnKind { + // DEFAULT is the default kind of inverted index column. JSON, Array, and + // geo inverted indexes all are DEFAULT, though prior to 22.2 they had no + // kind at all. + DEFAULT = 0; + // TRIGRAM is the trigram kind of inverted index column. It's only valid on + // text columns. + TRIGRAM = 1; +} diff --git a/pkg/sql/catalog/descpb/structured.proto b/pkg/sql/catalog/descpb/structured.proto index 22ec6f36c683..e3e3f2235fe5 100644 --- a/pkg/sql/catalog/descpb/structured.proto +++ b/pkg/sql/catalog/descpb/structured.proto @@ -362,6 +362,11 @@ message IndexDescriptor { // columns which are explicitly part of the index (STORING clause). repeated string store_column_names = 5; + // An ordered list of opclasses that parallels each of the inverted columns + // in the index. n.b.: currently, there can only be a single inverted column + // in an index, so this list will always be of size 0 or 1. + repeated cockroach.sql.catalog.catpb.InvertedIndexColumnKind inverted_column_kinds = 27; + // An ordered list of column IDs of which the index key is comprised. This // list parallels the key_column_names list and does not include any // additional stored columns. If the index is an inverted index, the last @@ -490,7 +495,7 @@ message IndexDescriptor { optional uint32 constraint_id = 26 [(gogoproto.customname) = "ConstraintID", (gogoproto.casttype) = "ConstraintID", (gogoproto.nullable) = false]; - // Next ID: 27 + // Next ID: 28 } // ConstraintToUpdate represents a constraint to be added to the table and diff --git a/pkg/sql/catalog/table_elements.go b/pkg/sql/catalog/table_elements.go index b632a5902597..ea5973ce1f67 100644 --- a/pkg/sql/catalog/table_elements.go +++ b/pkg/sql/catalog/table_elements.go @@ -185,6 +185,10 @@ type Index interface { // Panics if the index is not inverted. InvertedColumnKeyType() *types.T + // InvertedColumnKind returns the kind of the inverted column of the inverted + // index. + InvertedColumnKind() catpb.InvertedIndexColumnKind + NumPrimaryStoredColumns() int NumSecondaryStoredColumns() int GetStoredColumnID(storedColumnOrdinal int) descpb.ColumnID diff --git a/pkg/sql/catalog/tabledesc/index.go b/pkg/sql/catalog/tabledesc/index.go index 1c9aced84586..88863192f447 100644 --- a/pkg/sql/catalog/tabledesc/index.go +++ b/pkg/sql/catalog/tabledesc/index.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/iterutil" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/errors" ) var _ catalog.Index = (*index)(nil) @@ -176,6 +177,22 @@ func (w index) InvertedColumnKeyType() *types.T { return w.desc.InvertedColumnKeyType() } +// InvertedColumnKind returns the kind of the inverted column of the inverted +// index. +// +// Panics if the index is not inverted. +func (w index) InvertedColumnKind() catpb.InvertedIndexColumnKind { + if w.desc.Type != descpb.IndexDescriptor_INVERTED { + panic(errors.AssertionFailedf("index is not inverted")) + } + if len(w.desc.InvertedColumnKinds) == 0 { + // Not every inverted index has kinds inside, since no kinds were set prior + // to version 22.2. + return catpb.InvertedIndexColumnKind_DEFAULT + } + return w.desc.InvertedColumnKinds[0] +} + // CollectKeyColumnIDs creates a new set containing the column IDs in the key // of this index. func (w index) CollectKeyColumnIDs() catalog.TableColSet { diff --git a/pkg/sql/catalog/tabledesc/structured.go b/pkg/sql/catalog/tabledesc/structured.go index 7a9825823f99..d9d3e32b835a 100644 --- a/pkg/sql/catalog/tabledesc/structured.go +++ b/pkg/sql/catalog/tabledesc/structured.go @@ -1085,16 +1085,7 @@ func checkColumnsValidForInvertedIndex(tableDesc *Mutable, indexColNames []strin // The last column indexed by an inverted index must be // inverted indexable. if i == lastCol && !colinfo.ColumnTypeIsInvertedIndexable(col.GetType()) { - return errors.WithHint( - pgerror.Newf( - pgcode.FeatureNotSupported, - "column %s of type %s is not allowed as the last column in an inverted index", - col.GetName(), - col.GetType().Name(), - ), - "see the documentation for more information about inverted indexes: "+docs.URL("inverted-indexes.html"), - ) - + return NewInvalidInvertedColumnError(col.GetName(), col.GetType().String()) } // Any preceding columns must not be inverted indexable. if i < lastCol && !colinfo.ColumnTypeIsIndexable(col.GetType()) { @@ -1114,6 +1105,19 @@ func checkColumnsValidForInvertedIndex(tableDesc *Mutable, indexColNames []strin return nil } +// NewInvalidInvertedColumnError returns an error for a column that's not +// inverted indexable. +func NewInvalidInvertedColumnError(colName, colType string) error { + return errors.WithHint( + pgerror.Newf( + pgcode.FeatureNotSupported, + "column %s of type %s is not allowed as the last column in an inverted index", + colName, colType, + ), + "see the documentation for more information about inverted indexes: "+docs.URL("inverted-indexes.html"), + ) +} + // AddColumn adds a column to the table. func (desc *Mutable) AddColumn(col *descpb.ColumnDescriptor) { desc.Columns = append(desc.Columns, *col) diff --git a/pkg/sql/catalog/tabledesc/validate_test.go b/pkg/sql/catalog/tabledesc/validate_test.go index ffaf4b6700e4..4bad8484d762 100644 --- a/pkg/sql/catalog/tabledesc/validate_test.go +++ b/pkg/sql/catalog/tabledesc/validate_test.go @@ -145,7 +145,8 @@ var validationMap = []struct { "StoreColumnNames": { status: todoIAmKnowinglyAddingTechDebt, reason: "initial import: TODO(features): add validation"}, - "KeyColumnIDs": {status: iSolemnlySwearThisFieldIsValidated}, + "InvertedColumnKinds": {status: thisFieldReferencesNoObjects}, + "KeyColumnIDs": {status: iSolemnlySwearThisFieldIsValidated}, "KeySuffixColumnIDs": { status: todoIAmKnowinglyAddingTechDebt, reason: "initial import: TODO(features): add validation"}, diff --git a/pkg/sql/create_index.go b/pkg/sql/create_index.go index b1781104ae01..67b13fe26f02 100644 --- a/pkg/sql/create_index.go +++ b/pkg/sql/create_index.go @@ -33,6 +33,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sqlerrors" "github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry" "github.com/cockroachdb/cockroach/pkg/sql/types" + "github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented" "github.com/cockroachdb/cockroach/pkg/util/log/eventpb" "github.com/cockroachdb/errors" ) @@ -197,7 +198,9 @@ func makeIndexDescriptor( CreatedAtNanos: params.EvalContext().GetTxnTimestamp(time.Microsecond).UnixNano(), } + columnsToCheckForOpclass := columns if n.Inverted { + columnsToCheckForOpclass = columns[:len(columns)-1] if n.Sharded != nil { return nil, pgerror.New(pgcode.InvalidSQLStatementName, "inverted indexes don't support hash sharding") } @@ -211,19 +214,20 @@ func makeIndexDescriptor( } indexDesc.Type = descpb.IndexDescriptor_INVERTED - column, err := tableDesc.FindColumnWithName(columns[len(columns)-1].Column) + invCol := columns[len(columns)-1] + column, err := tableDesc.FindColumnWithName(invCol.Column) if err != nil { return nil, err } - switch column.GetType().Family() { - case types.GeometryFamily: - config, err := geoindex.GeometryIndexConfigForSRID(column.GetType().GeoSRIDOrZero()) - if err != nil { - return nil, err - } - indexDesc.GeoConfig = *config - case types.GeographyFamily: - indexDesc.GeoConfig = *geoindex.DefaultGeographyIndexConfig() + if err := populateInvertedIndexDescriptor(column, &indexDesc, invCol); err != nil { + return nil, err + } + } + + for i := range columnsToCheckForOpclass { + col := &columns[i] + if col.OpClass != "" { + return nil, newUndefinedOpclassError(col.OpClass) } } @@ -303,6 +307,66 @@ func makeIndexDescriptor( return &indexDesc, nil } +// populateInvertedIndexDescriptor adds information to the input index descriptor +// for the inverted index given by the input column and invCol, which should +// match (column is the catalog column, and invCol is the grammar node of +// the column in the index creation statement). +func populateInvertedIndexDescriptor( + column catalog.Column, indexDesc *descpb.IndexDescriptor, invCol tree.IndexElem, +) error { + indexDesc.InvertedColumnKinds = []catpb.InvertedIndexColumnKind{catpb.InvertedIndexColumnKind_DEFAULT} + switch column.GetType().Family() { + case types.ArrayFamily: + switch invCol.OpClass { + case "array_ops", "": + default: + return newUndefinedOpclassError(invCol.OpClass) + } + case types.JsonFamily: + switch invCol.OpClass { + case "jsonb_ops", "": + case "jsonb_path_ops": + return unimplemented.NewWithIssue(81115, "operator class \"jsonb_path_ops\" is not supported") + default: + return newUndefinedOpclassError(invCol.OpClass) + } + case types.GeometryFamily: + if invCol.OpClass != "" { + return newUndefinedOpclassError(invCol.OpClass) + } + config, err := geoindex.GeometryIndexConfigForSRID(column.GetType().GeoSRIDOrZero()) + if err != nil { + return err + } + indexDesc.GeoConfig = *config + case types.GeographyFamily: + if invCol.OpClass != "" { + return newUndefinedOpclassError(invCol.OpClass) + } + indexDesc.GeoConfig = *geoindex.DefaultGeographyIndexConfig() + case types.StringFamily: + // Check the opclass of the last column in the list, which is the column + // we're going to inverted index. + switch invCol.OpClass { + case "gin_trgm_ops", "gist_trgm_ops": + case "": + return errors.WithHint( + pgerror.New(pgcode.UndefinedObject, "data type text has no default operator class for access method \"gin\""), + "You must specify an operator class for the index (did you mean gin_trgm_ops?)") + default: + return newUndefinedOpclassError(invCol.OpClass) + } + indexDesc.InvertedColumnKinds[0] = catpb.InvertedIndexColumnKind_TRIGRAM + default: + return tabledesc.NewInvalidInvertedColumnError(column.GetName(), column.GetType().Name()) + } + return nil +} + +func newUndefinedOpclassError(opclass tree.Name) error { + return pgerror.Newf(pgcode.UndefinedObject, "operator class %q does not exist", opclass) +} + // validateColumnsAreAccessible validates that the columns for an index are // accessible. This check must be performed before creating inaccessible columns // for expression indexes with replaceExpressionElemsWithVirtualCols. diff --git a/pkg/sql/create_table.go b/pkg/sql/create_table.go index 666ce4947472..2be83d70a626 100644 --- a/pkg/sql/create_table.go +++ b/pkg/sql/create_table.go @@ -21,7 +21,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/build" "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/docs" - "github.com/cockroachdb/cockroach/pkg/geo/geoindex" "github.com/cockroachdb/cockroach/pkg/jobs" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/kv" @@ -1804,15 +1803,8 @@ func NewTableDesc( if err != nil { return nil, err } - switch column.GetType().Family() { - case types.GeometryFamily: - config, err := geoindex.GeometryIndexConfigForSRID(column.GetType().GeoSRIDOrZero()) - if err != nil { - return nil, err - } - idx.GeoConfig = *config - case types.GeographyFamily: - idx.GeoConfig = *geoindex.DefaultGeographyIndexConfig() + if err := populateInvertedIndexDescriptor(column, &idx, columns[len(columns)-1]); err != nil { + return nil, err } } diff --git a/pkg/sql/logictest/testdata/logic_test/create_index b/pkg/sql/logictest/testdata/logic_test/create_index index d79beaacbdb3..d034f743438b 100644 --- a/pkg/sql/logictest/testdata/logic_test/create_index +++ b/pkg/sql/logictest/testdata/logic_test/create_index @@ -390,3 +390,16 @@ CREATE TABLE public.t_hash ( UNIQUE INDEX idx_t_hash_b (b ASC) USING HASH WITH (bucket_count=5), FAMILY fam_0_pk_a_b (pk, a, b) ) + +statement ok +CREATE TABLE opclasses (a INT PRIMARY KEY, b TEXT, c JSON) + +# Make sure that we don't permit creating indexes with opclasses that aren't +# inverted. +statement error operator class "blah_ops" does not exist +CREATE INDEX ON opclasses(b blah_ops) + +# Make sure that we don't permit creating indexes with opclasses that aren't +# inverted, even if the type is invertible. +statement error operator class "blah_ops" does not exist +CREATE INDEX ON opclasses(c blah_ops) diff --git a/pkg/sql/logictest/testdata/logic_test/inverted_index b/pkg/sql/logictest/testdata/logic_test/inverted_index index 3e8fc2b14b31..79df6712a0bb 100644 --- a/pkg/sql/logictest/testdata/logic_test/inverted_index +++ b/pkg/sql/logictest/testdata/logic_test/inverted_index @@ -46,6 +46,22 @@ c CREATE TABLE public.c ( INVERTED INDEX "c_bAr_idx" ("bAr" ASC) ) +# Test that only the permitted opclasses are usable to make an inverted index. +statement error operator class \"blah_ops\" does not exist +CREATE INVERTED INDEX ON c(foo blah_ops) + +statement error operator class \"blah_ops\" does not exist +CREATE INDEX ON c USING GIN(foo blah_ops) + +statement error unimplemented: operator class "jsonb_path_ops" is not supported +CREATE INDEX ON c USING GIN(foo jsonb_path_ops) + +statement ok +CREATE INVERTED INDEX ON c(foo jsonb_ops) + +statement ok +CREATE INDEX ON c USING GIN(foo jsonb_ops) + # Regression test for #42944: make sure that mixed-case columns can be # inverted indexed. statement ok @@ -1163,6 +1179,16 @@ CREATE TABLE c ( FAMILY "primary" (id, foo, bar) ) +# Test that only the permitted opclasses are usable to make an inverted index. +statement error operator class \"blah_ops\" does not exist +CREATE INVERTED INDEX ON c(foo blah_ops) + +statement ok +CREATE INVERTED INDEX blorp ON c(foo array_ops) + +statement ok +DROP INDEX blorp + statement ok INSERT INTO c VALUES(0, NULL, NULL) diff --git a/pkg/sql/logictest/testdata/logic_test/inverted_index_geospatial b/pkg/sql/logictest/testdata/logic_test/inverted_index_geospatial index 941458593c7c..5effdfa03c7c 100644 --- a/pkg/sql/logictest/testdata/logic_test/inverted_index_geospatial +++ b/pkg/sql/logictest/testdata/logic_test/inverted_index_geospatial @@ -8,6 +8,14 @@ CREATE TABLE geo_table( INVERTED INDEX geom_index(geom) ) +# Test that only the permitted opclasses are usable to make an inverted index. +statement error operator class \"blah_ops\" does not exist +CREATE INVERTED INDEX ON geo_table(geom blah_ops) + +# Test that only the permitted opclasses are usable to make an inverted index. +statement error operator class \"blah_ops\" does not exist +CREATE INDEX ON geo_table USING GIST(geom blah_ops) + # Shapes with SRID 26918. We've taken small X, Y values and added 400,000 to the X coordinate # and 4,000,000 to the Y coordinate to place them inside the bounds of SRID 26918. statement ok diff --git a/pkg/sql/logictest/testdata/logic_test/trigram_indexes b/pkg/sql/logictest/testdata/logic_test/trigram_indexes index b77aaaf6a0b8..976185d1b595 100644 --- a/pkg/sql/logictest/testdata/logic_test/trigram_indexes +++ b/pkg/sql/logictest/testdata/logic_test/trigram_indexes @@ -1,9 +1,25 @@ statement ok CREATE TABLE a (a INT PRIMARY KEY, t TEXT) -statement ok +statement error data type text has no default operator class for access method \"gin\" CREATE INVERTED INDEX ON a(t) +statement error data type text has no default operator class for access method \"gin\" +CREATE INDEX ON a USING GIN(t) + +statement error operator class \"blah_ops\" does not exist +CREATE INVERTED INDEX ON a(t blah_ops) + +statement ok +CREATE INVERTED INDEX ON a(t gin_trgm_ops) + +statement ok +CREATE INDEX ON a USING GIN(t gin_trgm_ops) + +# Both gin_trgm_ops and gist_trgm_ops work. +statement ok +CREATE INDEX ON a USING GIST(t gist_trgm_ops) + statement ok INSERT INTO a VALUES (1, 'foozoopa'), (2, 'Foo'), diff --git a/pkg/sql/opt/exec/execbuilder/testdata/trigram_index b/pkg/sql/opt/exec/execbuilder/testdata/trigram_index index 9dc58ff19220..2f95eb7483c6 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/trigram_index +++ b/pkg/sql/opt/exec/execbuilder/testdata/trigram_index @@ -5,7 +5,7 @@ CREATE TABLE a ( a INT PRIMARY KEY, b TEXT, FAMILY (a,b), - INVERTED INDEX(b) + INVERTED INDEX(b gin_trgm_ops) ) query T diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go index 88166afb23c7..b86a5abeebcc 100644 --- a/pkg/sql/opt/memo/statistics_builder.go +++ b/pkg/sql/opt/memo/statistics_builder.go @@ -4583,7 +4583,7 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints( var hasNullValue, ok bool var values tree.Datums var distinctVals uint64 - invertedIndexableColumnType := colinfo.ColumnTypeIsOnlyInvertedIndexable(colType) + onlyInvertedIndexableColumnType := colinfo.ColumnTypeIsOnlyInvertedIndexable(colType) if distinctVals, ok = filterConstraint.CalculateMaxResults(sb.evalCtx, cols, cols); ok { // If the number of values is excessive, don't spend too much time building the histogram, // as it may slow down the query. @@ -4606,7 +4606,7 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints( // types of such columns (JSON, ARRAY, and spatial types) aren't likely to // occur in CHECK constraints. So, let's play it safe and don't create a // histogram directly on columns that have a data type which is - // InvertedIndexable since the possible user benefit of adding this + // OnlyInvertedIndexable since the possible user benefit of adding this // support seems low. // // Also, histogram building errors out when the number of samples is @@ -4615,7 +4615,7 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints( // not expect to see null values. If we do see a null, something may have // gone wrong, so do not build a histogram in this case either. useHistogram := sb.evalCtx.SessionData().OptimizerUseHistograms && numValues > 0 && - !hasNullValue && !invertedIndexableColumnType && int64(numValues) <= numRows + !hasNullValue && !onlyInvertedIndexableColumnType && int64(numValues) <= numRows if !useHistogram { if distinctVals > math.MaxInt32 { continue diff --git a/pkg/sql/parser/parse_test.go b/pkg/sql/parser/parse_test.go index c29ea5eaacca..98178fe63c10 100644 --- a/pkg/sql/parser/parse_test.go +++ b/pkg/sql/parser/parse_test.go @@ -525,9 +525,6 @@ func TestUnimplementedSyntax(t *testing.T) { {`CREATE INDEX a ON b USING SPGIST (c)`, 0, `index using spgist`, ``}, {`CREATE INDEX a ON b USING BRIN (c)`, 0, `index using brin`, ``}, - {`CREATE INDEX a ON b(c gin_trgm_ops)`, 41285, `index using gin_trgm_ops`, ``}, - {`CREATE INDEX a ON b(c gist_trgm_ops)`, 41285, `index using gist_trgm_ops`, ``}, - {`CREATE INDEX a ON b(c bobby)`, 47420, ``, ``}, {`CREATE INDEX a ON b(a NULLS LAST)`, 6224, ``, ``}, {`CREATE INDEX a ON b(a ASC NULLS LAST)`, 6224, ``, ``}, {`CREATE INDEX a ON b(a DESC NULLS FIRST)`, 6224, ``, ``}, diff --git a/pkg/sql/parser/sql.y b/pkg/sql/parser/sql.y index c7d61232d77c..bb8a26fd02ba 100644 --- a/pkg/sql/parser/sql.y +++ b/pkg/sql/parser/sql.y @@ -8733,12 +8733,6 @@ index_elem_options: opClass := $1 dir := $2.dir() nullsOrder := $3.nullsOrder() - if opClass != "" { - if opClass == "gin_trgm_ops" || opClass == "gist_trgm_ops" { - return unimplementedWithIssueDetail(sqllex, 41285, "index using " + opClass) - } - return unimplementedWithIssue(sqllex, 47420) - } // We currently only support the opposite of Postgres defaults. if nullsOrder != tree.DefaultNullsOrder { if dir == tree.Descending && nullsOrder == tree.NullsFirst { @@ -8748,7 +8742,7 @@ index_elem_options: return unimplementedWithIssue(sqllex, 6224) } } - $$.val = tree.IndexElem{Direction: dir, NullsOrder: nullsOrder} + $$.val = tree.IndexElem{Direction: dir, NullsOrder: nullsOrder, OpClass: tree.Name(opClass)} } opt_class: diff --git a/pkg/sql/rowenc/index_encoding.go b/pkg/sql/rowenc/index_encoding.go index 1e80a26deeed..9831bddb62ca 100644 --- a/pkg/sql/rowenc/index_encoding.go +++ b/pkg/sql/rowenc/index_encoding.go @@ -580,9 +580,9 @@ func EncodeInvertedIndexTableKeys( case types.ArrayFamily: return encodeArrayInvertedIndexTableKeys(val.(*tree.DArray), inKey, version, false /* excludeNulls */) case types.StringFamily: - // TODO(jordan): Right now, this is just trigram inverted indexes. What if - // we want to support different types of inverted indexes on strings? We'll - // need to pass in the index type to this function. + // TODO(jordan): Right now, this is just trigram inverted indexes. If we + // want to support different types of inverted indexes on strings, we'll + // need to pass in the inverted index column kind to this function. // We pad the keys when writing them to the index. // TODO(jordan): why are we doing this padding at all? Postgres does it. return encodeTrigramInvertedIndexTableKeys(string(*val.(*tree.DString)), inKey, version, true /* pad */) diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 3d27fcaad6ee..d2267cdfff70 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -5542,8 +5542,9 @@ value if you rely on the HLC for accuracy.`, }, ReturnType: tree.FixedReturnType(types.Int), Fn: func(ctx *eval.Context, args tree.Datums) (tree.Datum, error) { - // TODO(jordan): need to re-evaluate when we support more than just - // trigram inverted indexes. + // TODO(jordan): if we support inverted indexes on more than just trigram + // indexes, we will need to thread the inverted index kind through + // the backfiller into this function. // The version argument is currently ignored for string inverted indexes. if args[0] == tree.DNull { return tree.DZero, nil @@ -5565,7 +5566,8 @@ value if you rely on the HLC for accuracy.`, }, Info: "This function is used only by CockroachDB's developers for testing purposes.", Volatility: volatility.Stable, - }), + }, + ), // Returns true iff the current user has admin role. // Note: it would be a privacy leak to extend this to check arbitrary usernames. diff --git a/pkg/sql/sem/tree/create.go b/pkg/sql/sem/tree/create.go index 57b051a75805..b1e9a9312c79 100644 --- a/pkg/sql/sem/tree/create.go +++ b/pkg/sql/sem/tree/create.go @@ -130,6 +130,9 @@ type IndexElem struct { Expr Expr Direction Direction NullsOrder NullsOrder + + // OpClass is set if an index element was created using a named opclass. + OpClass Name } // Format implements the NodeFormatter interface. @@ -148,6 +151,10 @@ func (node *IndexElem) Format(ctx *FmtCtx) { ctx.WriteByte(')') } } + if node.OpClass != "" { + ctx.WriteByte(' ') + ctx.WriteString(node.OpClass.String()) + } if node.Direction != DefaultDirection { ctx.WriteByte(' ') ctx.WriteString(node.Direction.String()) @@ -170,6 +177,9 @@ func (node *IndexElem) doc(p *PrettyCfg) pretty.Doc { d = p.bracket("(", d, ")") } } + if node.OpClass != "" { + d = pretty.ConcatSpace(d, pretty.Text(node.OpClass.String())) + } if node.Direction != DefaultDirection { d = pretty.ConcatSpace(d, pretty.Keyword(node.Direction.String())) } diff --git a/pkg/sql/testdata/telemetry/trigrams b/pkg/sql/testdata/telemetry/trigrams index 9db88fdcda9b..4187290fc730 100644 --- a/pkg/sql/testdata/telemetry/trigrams +++ b/pkg/sql/testdata/telemetry/trigrams @@ -23,17 +23,3 @@ unimplemented.#41285.set_limit exec CREATE TABLE a(a text) ---- - -feature-usage -CREATE INDEX ON a USING GIST(a gin_trgm_ops) ----- -error: pq: at or near ")": syntax error: unimplemented: this syntax -unimplemented.#41285.index using gin_trgm_ops -unimplemented.syntax.#41285.index using gin_trgm_ops - -feature-usage -CREATE INDEX ON a USING GIST(a gist_trgm_ops) ----- -error: pq: at or near ")": syntax error: unimplemented: this syntax -unimplemented.#41285.index using gist_trgm_ops -unimplemented.syntax.#41285.index using gist_trgm_ops From ddf7f76d0e483b42153219accc883b5bc3af005b Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Fri, 20 May 2022 15:48:15 -0400 Subject: [PATCH 3/4] sql: version gate inverted index creation Make sure that it's not possible to create an inverted index until the cluster is at a new enough version. Release note: None --- docs/generated/settings/settings-for-tenants.txt | 2 +- docs/generated/settings/settings.html | 2 +- pkg/clusterversion/cockroach_versions.go | 8 ++++++++ pkg/clusterversion/key_string.go | 5 +++-- pkg/sql/create_index.go | 16 ++++++++++++++-- pkg/sql/create_table.go | 3 ++- 6 files changed, 29 insertions(+), 7 deletions(-) diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index f80571cb0ee0..259e753bbb78 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -282,4 +282,4 @@ trace.jaeger.agent string the address of a Jaeger agent to receive traces using trace.opentelemetry.collector string address of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as :. If no port is specified, 4317 will be used. trace.span_registry.enabled boolean true if set, ongoing traces can be seen at https:///#/debug/tracez trace.zipkin.collector string the address of a Zipkin instance to receive traces, as :. If no port is specified, 9411 will be used. -version version 22.1-8 set the active cluster version in the format '.' +version version 22.1-10 set the active cluster version in the format '.' diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index 22bcdcdc5f2a..000a7e358997 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -213,6 +213,6 @@ trace.opentelemetry.collectorstringaddress of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as :. If no port is specified, 4317 will be used. trace.span_registry.enabledbooleantrueif set, ongoing traces can be seen at https:///#/debug/tracez trace.zipkin.collectorstringthe address of a Zipkin instance to receive traces, as :. If no port is specified, 9411 will be used. -versionversion22.1-8set the active cluster version in the format '.' +versionversion22.1-10set the active cluster version in the format '.' diff --git a/pkg/clusterversion/cockroach_versions.go b/pkg/clusterversion/cockroach_versions.go index db06df6b7638..657798412435 100644 --- a/pkg/clusterversion/cockroach_versions.go +++ b/pkg/clusterversion/cockroach_versions.go @@ -366,6 +366,10 @@ const ( // keys at the Pebble layer. EnablePebbleFormatVersionRangeKeys + // TrigramInvertedIndexes enables the creation of trigram inverted indexes + // on strings. + TrigramInvertedIndexes + // ************************************************* // Step (1): Add new versions here. // Do not add new versions to a patch release. @@ -638,6 +642,10 @@ var versionsSingleton = keyedVersions{ Key: EnablePebbleFormatVersionRangeKeys, Version: roachpb.Version{Major: 22, Minor: 1, Internal: 8}, }, + { + Key: TrigramInvertedIndexes, + Version: roachpb.Version{Major: 22, Minor: 1, Internal: 10}, + }, // ************************************************* // Step (2): Add new versions here. diff --git a/pkg/clusterversion/key_string.go b/pkg/clusterversion/key_string.go index 12cb3c011ba6..21121db4e19d 100644 --- a/pkg/clusterversion/key_string.go +++ b/pkg/clusterversion/key_string.go @@ -67,11 +67,12 @@ func _() { _ = x[LocalTimestamps-56] _ = x[EnsurePebbleFormatVersionRangeKeys-57] _ = x[EnablePebbleFormatVersionRangeKeys-58] + _ = x[TrigramInvertedIndexes-59] } -const _Key_name = "V21_2Start22_1TargetBytesAvoidExcessAvoidDrainingNamesDrainingNamesMigrationTraceIDDoesntImplyStructuredRecordingAlterSystemTableStatisticsAddAvgSizeColAlterSystemStmtDiagReqsMVCCAddSSTableInsertPublicSchemaNamespaceEntryOnRestoreUnsplitRangesInAsyncGCJobsValidateGrantOptionPebbleFormatBlockPropertyCollectorProbeRequestSelectRPCsTakeTracingInfoInbandPreSeedTenantSpanConfigsSeedTenantSpanConfigsPublicSchemasWithDescriptorsEnsureSpanConfigReconciliationEnsureSpanConfigSubscriptionEnableSpanConfigStoreScanWholeRowsSCRAMAuthenticationUnsafeLossOfQuorumRecoveryRangeLogAlterSystemProtectedTimestampAddColumnEnableProtectedTimestampsForTenantDeleteCommentsWithDroppedIndexesRemoveIncompatibleDatabasePrivilegesAddRaftAppliedIndexTermMigrationPostAddRaftAppliedIndexTermMigrationDontProposeWriteTimestampForLeaseTransfersTenantSettingsTableEnablePebbleFormatVersionBlockPropertiesDisableSystemConfigGossipTriggerMVCCIndexBackfillerEnableLeaseHolderRemovalBackupResolutionInJobLooselyCoupledRaftLogTruncationChangefeedIdlenessBackupDoesNotOverwriteLatestAndCheckpointEnableDeclarativeSchemaChangerRowLevelTTLPebbleFormatSplitUserKeysMarkedIncrementalBackupSubdirDateStyleIntervalStyleCastRewriteEnableNewStoreRebalancerClusterLocksVirtualTableAutoStatsTableSettingsForecastStatsSuperRegionsEnableNewChangefeedOptionsSpanCountTablePreSeedSpanCountTableSeedSpanCountTableV22_1Start22_2LocalTimestampsEnsurePebbleFormatVersionRangeKeysEnablePebbleFormatVersionRangeKeys" +const _Key_name = "V21_2Start22_1TargetBytesAvoidExcessAvoidDrainingNamesDrainingNamesMigrationTraceIDDoesntImplyStructuredRecordingAlterSystemTableStatisticsAddAvgSizeColAlterSystemStmtDiagReqsMVCCAddSSTableInsertPublicSchemaNamespaceEntryOnRestoreUnsplitRangesInAsyncGCJobsValidateGrantOptionPebbleFormatBlockPropertyCollectorProbeRequestSelectRPCsTakeTracingInfoInbandPreSeedTenantSpanConfigsSeedTenantSpanConfigsPublicSchemasWithDescriptorsEnsureSpanConfigReconciliationEnsureSpanConfigSubscriptionEnableSpanConfigStoreScanWholeRowsSCRAMAuthenticationUnsafeLossOfQuorumRecoveryRangeLogAlterSystemProtectedTimestampAddColumnEnableProtectedTimestampsForTenantDeleteCommentsWithDroppedIndexesRemoveIncompatibleDatabasePrivilegesAddRaftAppliedIndexTermMigrationPostAddRaftAppliedIndexTermMigrationDontProposeWriteTimestampForLeaseTransfersTenantSettingsTableEnablePebbleFormatVersionBlockPropertiesDisableSystemConfigGossipTriggerMVCCIndexBackfillerEnableLeaseHolderRemovalBackupResolutionInJobLooselyCoupledRaftLogTruncationChangefeedIdlenessBackupDoesNotOverwriteLatestAndCheckpointEnableDeclarativeSchemaChangerRowLevelTTLPebbleFormatSplitUserKeysMarkedIncrementalBackupSubdirDateStyleIntervalStyleCastRewriteEnableNewStoreRebalancerClusterLocksVirtualTableAutoStatsTableSettingsForecastStatsSuperRegionsEnableNewChangefeedOptionsSpanCountTablePreSeedSpanCountTableSeedSpanCountTableV22_1Start22_2LocalTimestampsEnsurePebbleFormatVersionRangeKeysEnablePebbleFormatVersionRangeKeysTrigramInvertedIndexes" -var _Key_index = [...]uint16{0, 5, 14, 36, 54, 76, 113, 152, 175, 189, 230, 256, 275, 309, 321, 352, 376, 397, 425, 455, 483, 504, 517, 536, 570, 608, 642, 674, 710, 742, 778, 820, 839, 879, 911, 930, 954, 975, 1006, 1024, 1065, 1095, 1106, 1137, 1160, 1193, 1217, 1241, 1263, 1276, 1288, 1314, 1328, 1349, 1367, 1372, 1381, 1396, 1430, 1464} +var _Key_index = [...]uint16{0, 5, 14, 36, 54, 76, 113, 152, 175, 189, 230, 256, 275, 309, 321, 352, 376, 397, 425, 455, 483, 504, 517, 536, 570, 608, 642, 674, 710, 742, 778, 820, 839, 879, 911, 930, 954, 975, 1006, 1024, 1065, 1095, 1106, 1137, 1160, 1193, 1217, 1241, 1263, 1276, 1288, 1314, 1328, 1349, 1367, 1372, 1381, 1396, 1430, 1464, 1486} func (i Key) String() string { if i < 0 || i >= Key(len(_Key_index)-1) { diff --git a/pkg/sql/create_index.go b/pkg/sql/create_index.go index 67b13fe26f02..e7cb976443ae 100644 --- a/pkg/sql/create_index.go +++ b/pkg/sql/create_index.go @@ -14,9 +14,11 @@ import ( "context" "time" + "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/docs" "github.com/cockroachdb/cockroach/pkg/geo/geoindex" "github.com/cockroachdb/cockroach/pkg/server/telemetry" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog" "github.com/cockroachdb/cockroach/pkg/sql/catalog/catpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo" @@ -219,7 +221,8 @@ func makeIndexDescriptor( if err != nil { return nil, err } - if err := populateInvertedIndexDescriptor(column, &indexDesc, invCol); err != nil { + if err := populateInvertedIndexDescriptor( + params.ctx, params.ExecCfg().Settings, column, &indexDesc, invCol); err != nil { return nil, err } } @@ -312,7 +315,11 @@ func makeIndexDescriptor( // match (column is the catalog column, and invCol is the grammar node of // the column in the index creation statement). func populateInvertedIndexDescriptor( - column catalog.Column, indexDesc *descpb.IndexDescriptor, invCol tree.IndexElem, + ctx context.Context, + cs *cluster.Settings, + column catalog.Column, + indexDesc *descpb.IndexDescriptor, + invCol tree.IndexElem, ) error { indexDesc.InvertedColumnKinds = []catpb.InvertedIndexColumnKind{catpb.InvertedIndexColumnKind_DEFAULT} switch column.GetType().Family() { @@ -349,6 +356,11 @@ func populateInvertedIndexDescriptor( // we're going to inverted index. switch invCol.OpClass { case "gin_trgm_ops", "gist_trgm_ops": + if !cs.Version.IsActive(ctx, clusterversion.TrigramInvertedIndexes) { + return pgerror.Newf(pgcode.FeatureNotSupported, + "version %v must be finalized to create trigram inverted indexes", + clusterversion.ByKey(clusterversion.TrigramInvertedIndexes)) + } case "": return errors.WithHint( pgerror.New(pgcode.UndefinedObject, "data type text has no default operator class for access method \"gin\""), diff --git a/pkg/sql/create_table.go b/pkg/sql/create_table.go index 2be83d70a626..3101c7685a50 100644 --- a/pkg/sql/create_table.go +++ b/pkg/sql/create_table.go @@ -1803,7 +1803,8 @@ func NewTableDesc( if err != nil { return nil, err } - if err := populateInvertedIndexDescriptor(column, &idx, columns[len(columns)-1]); err != nil { + if err := populateInvertedIndexDescriptor( + ctx, evalCtx.Settings, column, &idx, columns[len(columns)-1]); err != nil { return nil, err } } From b9b8033e0d20f6cc951cd81c5f34807393051cce Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Fri, 27 May 2022 18:40:17 -0400 Subject: [PATCH 4/4] invertedidx: add test for trigram span creation Release note: None --- pkg/sql/opt/invertedidx/trigram_test.go | 129 ++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 pkg/sql/opt/invertedidx/trigram_test.go diff --git a/pkg/sql/opt/invertedidx/trigram_test.go b/pkg/sql/opt/invertedidx/trigram_test.go new file mode 100644 index 000000000000..95d9504aee2f --- /dev/null +++ b/pkg/sql/opt/invertedidx/trigram_test.go @@ -0,0 +1,129 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package invertedidx_test + +import ( + "testing" + + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx" + "github.com/cockroachdb/cockroach/pkg/sql/opt/norm" + "github.com/cockroachdb/cockroach/pkg/sql/opt/testutils" + "github.com/cockroachdb/cockroach/pkg/sql/opt/testutils/testcat" + "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/stretchr/testify/require" +) + +func TestTryFilterTrigram(t *testing.T) { + semaCtx := tree.MakeSemaContext() + st := cluster.MakeTestingClusterSettings() + evalCtx := eval.NewTestingEvalContext(st) + + tc := testcat.New() + if _, err := tc.ExecuteDDL( + "CREATE TABLE t (s STRING, INVERTED INDEX (s gin_trgm_ops))", + ); err != nil { + t.Fatal(err) + } + var f norm.Factory + f.Init(evalCtx, tc) + md := f.Metadata() + tn := tree.NewUnqualifiedTableName("t") + tab := md.AddTable(tc.Table(tn), tn) + trigramOrd := 1 + + // If we can create an inverted filter with the given filter expression and + // index, ok=true. If the spans in the resulting inverted index constraint + // do not have duplicate primary keys, unique=true. If the spans are tight, + // tight=true and remainingFilters="". Otherwise, tight is false and + // remainingFilters contains some or all of the original filters. + testCases := []struct { + filters string + ok bool + unique bool + }{ + // Test LIKE with percents on both sides + // TODO(jordan): we could make expressions with just a single trigram + // tight, because we would know for sure that we wouldn't need to recheck + // the condition once the row is returned. But, it's probably not that + // important of an optimization. + {filters: "s LIKE '%foo%'", ok: true, unique: true}, + {filters: "s LIKE '%blorp%'", ok: true, unique: false}, + {filters: "s LIKE 'foo%'", ok: true, unique: true}, + {filters: "s LIKE 'blorp%'", ok: true, unique: false}, + {filters: "s ILIKE '%foo%'", ok: true, unique: true}, + {filters: "s ILIKE '%blorp%'", ok: true, unique: false}, + // Queries that are too short to have any trigrams do not produce filters. + {filters: "s LIKE '%fo%'", ok: false}, + {filters: "s ILIKE '%fo%'", ok: false}, + {filters: "s LIKE '%fo%ab%ru%'", ok: false}, + + // AND and OR for two LIKE queries behave as expected. + {filters: "s LIKE '%lkj%' AND s LIKE '%bla%'", ok: true, unique: true}, + {filters: "s LIKE '%lkj%' OR s LIKE '%bla%'", ok: true, unique: false}, + + // Similarity queries. + {filters: "s % 'lkjsdlkj'", ok: true, unique: false}, + {filters: "s % 'lkj'", ok: true, unique: true}, + // Can't generate trigrams from such a short constant. + {filters: "s % 'lj'", ok: false}, + + // AND and OR for two similarity queries behave as expected. + {filters: "s % 'lkj' AND s % 'bla'", ok: true, unique: true}, + {filters: "s % 'lkj' OR s % 'bla'", ok: true, unique: false}, + + // Can combine similarity and LIKE queries and still get inverted + // expressions. + {filters: "s % 'lkj' AND s LIKE 'blort'", ok: true, unique: false}, + {filters: "s % 'lkj' OR s LIKE 'blort'", ok: true, unique: false}, + + // Equality queries. + {filters: "s = 'lkjsdlkj'", ok: true, unique: false}, + {filters: "s = 'lkj'", ok: true, unique: true}, + {filters: "s = 'lkj' OR s LIKE 'blah'", ok: true, unique: false}, + } + + for _, tc := range testCases { + t.Logf("test case: %v", tc) + filters := testutils.BuildFilters(t, &f, &semaCtx, evalCtx, tc.filters) + + // We're not testing that the correct SpanExpression is returned here; + // that is tested elsewhere. This is just testing that we are constraining + // the index when we expect to and we have the correct values for tight, + // unique, and remainingFilters. + spanExpr, _, remainingFilters, _, ok := invertedidx.TryFilterInvertedIndex( + evalCtx, + &f, + filters, + nil, /* optionalFilters */ + tab, + md.Table(tab).Index(trigramOrd), + nil, /* computedColumns */ + ) + if tc.ok != ok { + t.Fatalf("expected %v, got %v", tc.ok, ok) + } + if !ok { + continue + } + + if spanExpr.Tight { + t.Fatalf("We never expected our inverted expression to be tight") + } + if tc.unique != spanExpr.Unique { + t.Fatalf("For (%s), expected unique=%v, but got %v", tc.filters, tc.unique, spanExpr.Unique) + } + + require.Equal(t, filters.String(), remainingFilters.String(), + "mismatched remaining filters") + } +}