From a9404c6a14e3283e1cbd2843d54c06bdb87ef2fc Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Fri, 15 Apr 2022 17:28:05 -0400 Subject: [PATCH] opt: accelerate % with text inv indexes Teach the inverted filter planner to support the text % text operator on inverted text indexes. Release note: None --- .../testdata/logic_test/trigram_indexes | 25 ++++ .../exec/execbuilder/testdata/trigram_index | 111 ++++++++++++++++++ pkg/sql/opt/invertedidx/trigram.go | 18 ++- pkg/sql/rowenc/index_encoding.go | 14 ++- 4 files changed, 163 insertions(+), 5 deletions(-) diff --git a/pkg/sql/logictest/testdata/logic_test/trigram_indexes b/pkg/sql/logictest/testdata/logic_test/trigram_indexes index ee4499d26c3f..58f546e2819f 100644 --- a/pkg/sql/logictest/testdata/logic_test/trigram_indexes +++ b/pkg/sql/logictest/testdata/logic_test/trigram_indexes @@ -47,3 +47,28 @@ SELECT * FROM a@a_t_idx WHERE t LIKE 'blahf' query IT SELECT * FROM a@a_t_idx WHERE t LIKE 'fblah' ---- + +# Test the acceleration of the % similarity operator. +# By default, the threshold for searching is .3. +query FIT +SELECT similarity(t, 'blar'), * FROM a@a_t_idx WHERE t % 'blar' +---- +0.428571432828903 3 blah + +query FIT +SELECT similarity(t, 'byar'), * FROM a@a_t_idx WHERE t % 'byar' +---- + +query FIT +SELECT similarity(t, 'fooz'), * FROM a@a_t_idx WHERE t % 'fooz' ORDER BY a +---- +0.400000005960464 1 foozoopa +0.5 2 Foo + +statement ok +SET pg_trgm.similarity_threshold=.45 + +query FIT +SELECT similarity(t, 'fooz'), * FROM a@a_t_idx WHERE t % 'fooz' +---- +0.5 2 Foo diff --git a/pkg/sql/opt/exec/execbuilder/testdata/trigram_index b/pkg/sql/opt/exec/execbuilder/testdata/trigram_index index a109ef3d4191..0c13c2279d2f 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/trigram_index +++ b/pkg/sql/opt/exec/execbuilder/testdata/trigram_index @@ -118,3 +118,114 @@ vectorized: true missing stats table: a@a_pkey spans: FULL SCAN + +# Test that trigram indexes can't support searches with no constant args. +# columns. +query T +EXPLAIN SELECT * FROM a WHERE b LIKE b +---- +distribution: local +vectorized: true +· +• filter +│ filter: b LIKE b +│ +└── • scan + missing stats + table: a@a_pkey + spans: FULL SCAN + +# Test that trigram indexes accelerate the % operator. +query T +EXPLAIN SELECT * FROM a WHERE b % 'foo' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b % 'foo' +│ +└── • index join + │ table: a@a_pkey + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +# Test that trigram indexes accelerate the % operator with an OR if the +# constant has more than one trigram. +query T +EXPLAIN SELECT * FROM a WHERE b % 'foob' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b % 'foob' +│ +└── • index join + │ table: a@a_pkey + │ + └── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +# Test that trigram indexes can't accelerate the % operator if there are fewer +# than 3 characters in the constant. +query T +EXPLAIN SELECT * FROM a WHERE b % 'fo' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b % 'fo' +│ +└── • scan + missing stats + table: a@a_pkey + spans: FULL SCAN + +# Test that trigram indexes can accelerate the % operator in reverse order. +query T +EXPLAIN SELECT * FROM a WHERE 'blah' % b +---- +distribution: local +vectorized: true +· +• filter +│ filter: 'blah' % b +│ +└── • index join + │ table: a@a_pkey + │ + └── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +# Test that trigram indexes can't accelerate the % operator with no constant +# columns. +query T +EXPLAIN SELECT * FROM a WHERE b % b +---- +distribution: local +vectorized: true +· +• filter +│ filter: b % b +│ +└── • scan + missing stats + table: a@a_pkey + spans: FULL SCAN diff --git a/pkg/sql/opt/invertedidx/trigram.go b/pkg/sql/opt/invertedidx/trigram.go index 0a9159e8a374..320fd0e9385d 100644 --- a/pkg/sql/opt/invertedidx/trigram.go +++ b/pkg/sql/opt/invertedidx/trigram.go @@ -38,13 +38,26 @@ func (t *trigramFilterPlanner) extractInvertedFilterConditionFromLeaf( ) { var constantVal opt.ScalarExpr var left, right opt.ScalarExpr + var allTrigramsMustMatch bool switch e := expr.(type) { // Both ILIKE and LIKE are supported because the index entries are always // downcased. We re-check the condition no matter what later. case *memo.ILikeExpr: left, right = e.Left, e.Right + // If we're doing a LIKE (or ILIKE) expression, we need to construct an AND out of all + // of the spans: we need to find results that match every single one of the + // trigrams in the constant datum. + allTrigramsMustMatch = true case *memo.LikeExpr: left, right = e.Left, e.Right + allTrigramsMustMatch = true + case *memo.ModExpr: + // If we're doing a % expression (similarity threshold), we need to construct + // an OR out of the spans: we need to find results that match any of the + // trigrams in the constant datum, and we'll filter the results further + // afterwards. + left, right = e.Left, e.Right + allTrigramsMustMatch = false default: // Only the above types are supported. return inverted.NonInvertedColExpression{}, expr, nil @@ -53,6 +66,9 @@ func (t *trigramFilterPlanner) extractInvertedFilterConditionFromLeaf( constantVal = right } else if isIndexColumn(t.tabID, t.index, right, t.computedColumns) && memo.CanExtractConstDatum(left) { constantVal = left + } else { + // Can only accelerate with a single constant value. + return inverted.NonInvertedColExpression{}, expr, nil } d := memo.ExtractConstDatum(constantVal) if d.ResolvedType() != types.String { @@ -61,7 +77,7 @@ func (t *trigramFilterPlanner) extractInvertedFilterConditionFromLeaf( )) } var err error - invertedExpr, err = rowenc.EncodeLikeTrigramSpans(d.(*tree.DString)) + invertedExpr, err = rowenc.EncodeTrigramSpans(d.(*tree.DString), allTrigramsMustMatch) if err != nil { // An inverted expression could not be extracted. return inverted.NonInvertedColExpression{}, expr, nil diff --git a/pkg/sql/rowenc/index_encoding.go b/pkg/sql/rowenc/index_encoding.go index c076ec2b5d1b..4b385c2a8c8c 100644 --- a/pkg/sql/rowenc/index_encoding.go +++ b/pkg/sql/rowenc/index_encoding.go @@ -840,9 +840,11 @@ func encodeOverlapsArrayInvertedIndexSpans( return invertedExpr, nil } -// EncodeLikeTrigramSpans returns the spans that must be scanned to look up all -// trigrams present in the input string. -func EncodeLikeTrigramSpans(val *tree.DString) (inverted.Expression, error) { +// EncodeTrigramSpans returns the spans that must be scanned to look up trigrams +// present in the input string. If allMustMatch is true, the resultant inverted +// expression must match every trigram in the input. Otherwise, it will match +// any trigram in the input. +func EncodeTrigramSpans(val *tree.DString, allMustMatch bool) (inverted.Expression, error) { chunks := strings.Split(string(*val), "%") // Each chunk will have at minimum 2 trigrams, so start at that default size. keys := make([][]byte, 0, len(chunks)*2) @@ -876,7 +878,11 @@ func EncodeLikeTrigramSpans(val *tree.DString) (inverted.Expression, error) { if ret == nil { ret = spanExpr } else { - ret = inverted.And(ret, spanExpr) + if allMustMatch { + ret = inverted.And(ret, spanExpr) + } else { + ret = inverted.Or(ret, spanExpr) + } } } // The result is never tight. We always need to re-check the condition once