From 22f65532948316e8ea9221632c66e3de64691cea Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Sun, 24 Jul 2022 21:51:02 -0400 Subject: [PATCH] sql: inv idx accelerate tsvector@@tsquery queries This commit adds inverted index acceleration for expressions that evaluate a tsquery against a tsvector using the `@@` operator. Release note (sql change): it's now possible to run efficient tsvector @@ tsquery searches when there is an inverted index on the tsvector column being searched. --- pkg/sql/inverted/expression.go | 4 + .../logictest/testdata/logic_test/tsvector | 96 ++++++++++ .../exec/execbuilder/testdata/tsvector_index | 171 ++++++++++++++++++ .../execbuilder/tests/local/generated_test.go | 7 + pkg/sql/opt/invertedidx/BUILD.bazel | 2 + .../opt/invertedidx/inverted_index_expr.go | 13 +- pkg/sql/opt/invertedidx/tsearch.go | 87 +++++++++ pkg/sql/opt/invertedidx/tsearch_test.go | 132 ++++++++++++++ pkg/sql/opt/xform/select_funcs.go | 4 + pkg/util/tsearch/BUILD.bazel | 3 + pkg/util/tsearch/encoding.go | 14 +- pkg/util/tsearch/encoding_test.go | 142 +++++++++++++++ pkg/util/tsearch/random.go | 35 +++- pkg/util/tsearch/tsquery.go | 101 +++++++++++ 14 files changed, 800 insertions(+), 11 deletions(-) create mode 100644 pkg/sql/opt/exec/execbuilder/testdata/tsvector_index create mode 100644 pkg/sql/opt/invertedidx/tsearch.go create mode 100644 pkg/sql/opt/invertedidx/tsearch_test.go diff --git a/pkg/sql/inverted/expression.go b/pkg/sql/inverted/expression.go index 6d4ab4ac3e5d..2831113bf48b 100644 --- a/pkg/sql/inverted/expression.go +++ b/pkg/sql/inverted/expression.go @@ -305,6 +305,10 @@ type SpanExpression struct { // JSON or Array SpanExpressions, and it holds when unique SpanExpressions // are combined with And. It does not hold when these SpanExpressions are // combined with Or. + // + // Note that the uniqueness property represented here holds for the *output* + // of the invertedFilter operator that executes the And/Or, not the raw, input + // spans of data. Unique bool // SpansToRead are the spans to read from the inverted index diff --git a/pkg/sql/logictest/testdata/logic_test/tsvector b/pkg/sql/logictest/testdata/logic_test/tsvector index 71398395526b..65ffb7d1428c 100644 --- a/pkg/sql/logictest/testdata/logic_test/tsvector +++ b/pkg/sql/logictest/testdata/logic_test/tsvector @@ -139,3 +139,99 @@ query T VALUES ( json_build_array($$'cat' & 'rat'$$:::TSQUERY)::JSONB) ---- ["'cat' & 'rat'"] + +# Test tsvector inverted indexes. +statement ok +DROP TABLE a; +CREATE TABLE a ( + a TSVECTOR, + b TSQUERY, + INVERTED INDEX(a) +); +INSERT INTO a VALUES('foo:3 bar:4,5'), ('baz:1'), ('foo:3'), ('bar:2') + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'foo' +---- +'bar':4,5 'foo':3 +'foo':3 + +statement error index \"a_a_idx\" is inverted and cannot be used for this query +SELECT a FROM a@a_a_idx WHERE a @@ '!foo' + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'foo' OR a @@ 'bar' +---- +'bar':4,5 'foo':3 +'foo':3 +'bar':2 + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'foo | bar' +---- +'bar':4,5 'foo':3 +'foo':3 +'bar':2 + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'foo | bar' OR a @@ 'baz' +---- +'bar':4,5 'foo':3 +'baz':1 +'foo':3 +'bar':2 + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'foo & bar' +---- +'bar':4,5 'foo':3 + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'foo <-> bar' +---- +'bar':4,5 'foo':3 + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'bar <-> foo' +---- + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'foo <-> !bar' +---- +'foo':3 + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ '!baz <-> bar' +---- +'bar':4,5 'foo':3 +'bar':2 + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'foo & !bar' +---- +'foo':3 + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'ba:*' +---- +'bar':4,5 'foo':3 +'baz':1 +'bar':2 + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'ba:* | foo' +---- +'bar':4,5 'foo':3 +'baz':1 +'foo':3 +'bar':2 + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'ba:* & foo' +---- +'bar':4,5 'foo':3 + +# Test that tsvector indexes can't accelerate the @@ operator with no constant +# columns. +statement error index \"a_a_idx\" is inverted and cannot be used for this query +EXPLAIN SELECT * FROM a@a_a_idx WHERE a @@ b diff --git a/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index b/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index new file mode 100644 index 000000000000..ee8b0bd9c576 --- /dev/null +++ b/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index @@ -0,0 +1,171 @@ +# LogicTest: local + +statement ok +CREATE TABLE a ( + a INT PRIMARY KEY, + b TSVECTOR, + c TSQUERY, + FAMILY (a,b,c), + INVERTED INDEX(b) +) + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'Foo' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo' OR b @@ 'bar' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar' OR b @@ 'baz' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 3 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 3 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & bar' +---- +distribution: local +vectorized: true +· +• lookup join +│ table: a@a_pkey +│ equality: (a) = (a) +│ equality cols are key +│ pred: b @@ '''foo'' & ''bar''' +│ +└── • zigzag join + left table: a@a_b_idx + left columns: (a, b_inverted_key) + left fixed values: 1 column + right table: a@a_b_idx + right columns: (a, b_inverted_key) + right fixed values: 1 column + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo <-> bar' +---- +distribution: local +vectorized: true +· +• lookup join +│ table: a@a_pkey +│ equality: (a) = (a) +│ equality cols are key +│ pred: b @@ '''foo'' <-> ''bar''' +│ +└── • zigzag join + left table: a@a_b_idx + left columns: (a, b_inverted_key) + left fixed values: 1 column + right table: a@a_b_idx + right columns: (a, b_inverted_key) + right fixed values: 1 column + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & !bar' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b @@ '''foo'' & !''bar''' +│ +└── • index join + │ table: a@a_pkey + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 1 span + + +query T +EXPLAIN SELECT a FROM a@a_b_idx WHERE b @@ 'ba:*' +---- +distribution: local +vectorized: true +· +• inverted filter +│ inverted column: b_inverted_key +│ num spans: 1 +│ +└── • scan + missing stats + table: a@a_b_idx + spans: 1 span + + +# Test that tsvector indexes can't accelerate the @@ operator with no constant +# columns. +statement error index \"a_b_idx\" is inverted and cannot be used for this query +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ c diff --git a/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go b/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go index ffac05a8f9f0..a2832beb63b7 100644 --- a/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go +++ b/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go @@ -550,6 +550,13 @@ func TestExecBuild_trigram_index( runExecBuildLogicTest(t, "trigram_index") } +func TestExecBuild_tsvector_index( + t *testing.T, +) { + defer leaktest.AfterTest(t)() + runExecBuildLogicTest(t, "tsvector_index") +} + func TestExecBuild_tuple( t *testing.T, ) { diff --git a/pkg/sql/opt/invertedidx/BUILD.bazel b/pkg/sql/opt/invertedidx/BUILD.bazel index 8730f991ec61..7fa01ba26944 100644 --- a/pkg/sql/opt/invertedidx/BUILD.bazel +++ b/pkg/sql/opt/invertedidx/BUILD.bazel @@ -8,6 +8,7 @@ go_library( "inverted_index_expr.go", "json_array.go", "trigram.go", + "tsearch.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx", visibility = ["//visibility:public"], @@ -49,6 +50,7 @@ go_test( "geo_test.go", "json_array_test.go", "trigram_test.go", + "tsearch_test.go", ], args = ["-test.timeout=55s"], deps = [ diff --git a/pkg/sql/opt/invertedidx/inverted_index_expr.go b/pkg/sql/opt/invertedidx/inverted_index_expr.go index eec56289d01b..6db743491e87 100644 --- a/pkg/sql/opt/invertedidx/inverted_index_expr.go +++ b/pkg/sql/opt/invertedidx/inverted_index_expr.go @@ -116,18 +116,27 @@ func TryFilterInvertedIndex( } else { col := index.InvertedColumn().InvertedSourceColumnOrdinal() typ = factory.Metadata().Table(tabID).Column(col).DatumType() - if typ.Family() == types.StringFamily { + switch typ.Family() { + case types.StringFamily: filterPlanner = &trigramFilterPlanner{ tabID: tabID, index: index, computedColumns: computedColumns, } - } else { + case types.TSVectorFamily: + filterPlanner = &tsqueryFilterPlanner{ + tabID: tabID, + index: index, + computedColumns: computedColumns, + } + case types.JsonFamily, types.ArrayFamily: filterPlanner = &jsonOrArrayFilterPlanner{ tabID: tabID, index: index, computedColumns: computedColumns, } + default: + return nil, nil, nil, nil, false } } diff --git a/pkg/sql/opt/invertedidx/tsearch.go b/pkg/sql/opt/invertedidx/tsearch.go new file mode 100644 index 000000000000..5e4dc2a7d728 --- /dev/null +++ b/pkg/sql/opt/invertedidx/tsearch.go @@ -0,0 +1,87 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package invertedidx + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/sql/inverted" + "github.com/cockroachdb/cockroach/pkg/sql/opt" + "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" + "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedexpr" + "github.com/cockroachdb/cockroach/pkg/sql/opt/memo" + "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/types" + "github.com/cockroachdb/errors" +) + +type tsqueryFilterPlanner struct { + tabID opt.TableID + index cat.Index + computedColumns map[opt.ColumnID]opt.ScalarExpr +} + +var _ invertedFilterPlanner = &tsqueryFilterPlanner{} + +// extractInvertedFilterConditionFromLeaf implements the invertedFilterPlanner +// interface. +func (t *tsqueryFilterPlanner) extractInvertedFilterConditionFromLeaf( + _ context.Context, _ *eval.Context, expr opt.ScalarExpr, +) ( + invertedExpr inverted.Expression, + remainingFilters opt.ScalarExpr, + _ *invertedexpr.PreFiltererStateForInvertedFilterer, +) { + var constantVal opt.ScalarExpr + var left, right opt.ScalarExpr + switch e := expr.(type) { + case *memo.TSMatchesExpr: + left, right = e.Left, e.Right + default: + // Only the above types are supported. + return inverted.NonInvertedColExpression{}, expr, nil + } + if isIndexColumn(t.tabID, t.index, left, t.computedColumns) && memo.CanExtractConstDatum(right) { + constantVal = right + } else if isIndexColumn(t.tabID, t.index, right, t.computedColumns) && memo.CanExtractConstDatum(left) { + constantVal = left + } else { + // Can only accelerate with a single constant value. + return inverted.NonInvertedColExpression{}, expr, nil + } + d := memo.ExtractConstDatum(constantVal) + if d.ResolvedType() != types.TSQuery { + panic(errors.AssertionFailedf( + "trying to apply tsvector inverted index to unsupported type %s", d.ResolvedType(), + )) + } + q := d.(*tree.DTSQuery).TSQuery + var err error + invertedExpr, err = q.GetInvertedExpr() + if err != nil { + // An inverted expression could not be extracted. + return inverted.NonInvertedColExpression{}, expr, nil + } + + // If the extracted inverted expression is not tight then remaining filters + // must be applied after the inverted index scan. + // TODO(jordan): we could do better here by pruning terms that we successfully + // turn into inverted expressions during the tsquery tree walk. We'd need to + // implement a function that removes a term from a tsquery tree. + if !invertedExpr.IsTight() { + remainingFilters = expr + } + + // We do not currently support pre-filtering for text search indexes, so + // the returned pre-filter state is nil. + return invertedExpr, remainingFilters, nil +} diff --git a/pkg/sql/opt/invertedidx/tsearch_test.go b/pkg/sql/opt/invertedidx/tsearch_test.go new file mode 100644 index 000000000000..51aa8628fa3b --- /dev/null +++ b/pkg/sql/opt/invertedidx/tsearch_test.go @@ -0,0 +1,132 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package invertedidx_test + +import ( + "context" + "testing" + + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx" + "github.com/cockroachdb/cockroach/pkg/sql/opt/norm" + "github.com/cockroachdb/cockroach/pkg/sql/opt/testutils" + "github.com/cockroachdb/cockroach/pkg/sql/opt/testutils/testcat" + "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/stretchr/testify/require" +) + +func TestTryFilterTSVector(t *testing.T) { + semaCtx := tree.MakeSemaContext() + st := cluster.MakeTestingClusterSettings() + evalCtx := eval.NewTestingEvalContext(st) + + tc := testcat.New() + if _, err := tc.ExecuteDDL( + "CREATE TABLE t (t tsvector, INVERTED INDEX (t))", + ); err != nil { + t.Fatal(err) + } + var f norm.Factory + f.Init(context.Background(), evalCtx, tc) + md := f.Metadata() + tn := tree.NewUnqualifiedTableName("t") + tab := md.AddTable(tc.Table(tn), tn) + tsvectorOrd := 1 + + // If we can create an inverted filter with the given filter expression and + // index, ok=true. If the spans in the resulting inverted index constraint + // do not have duplicate primary keys, unique=true. If the spans are tight, + // tight=true and remainingFilters="". Otherwise, tight is false and + // remainingFilters contains some or all of the original filters. + testCases := []struct { + filters string + ok bool + tight bool + unique bool + }{ + {filters: "t @@ 'a'", ok: true, tight: true, unique: true}, + {filters: "t @@ '!a'", ok: false, tight: false, unique: false}, + // Prefix match. + {filters: "t @@ 'a:*'", ok: true, tight: true, unique: false}, + // Weight match. + {filters: "t @@ 'a:C'", ok: true, tight: false, unique: true}, + // Weight and prefix match. + {filters: "t @@ 'a:C*'", ok: true, tight: false, unique: false}, + + {filters: "t @@ 'a | b'", ok: true, tight: true, unique: false}, + {filters: "t @@ 'a & b'", ok: true, tight: true, unique: true}, + {filters: "t @@ 'a <-> b'", ok: true, tight: false, unique: true}, + + // Can't filter with ! in an or clause. + {filters: "t @@ '!a | b'", ok: false, tight: false, unique: false}, + {filters: "t @@ 'a | !b'", ok: false, tight: false, unique: false}, + {filters: "t @@ '!a | !b'", ok: false, tight: false, unique: false}, + + // ! in an and clause is okay - we just re-filter on the ! term. + {filters: "t @@ 'a & !b'", ok: true, tight: false, unique: true}, + {filters: "t @@ '!a & b'", ok: true, tight: false, unique: true}, + // But not if both are !. + {filters: "t @@ '!a & !b'", ok: false, tight: false, unique: false}, + + // Same as above, except <-> matches are never tight - they always require + // re-checking because the index doesn't store the lexeme positions. + {filters: "t @@ 'a <-> !b'", ok: true, tight: false, unique: true}, + {filters: "t @@ '!a <-> b'", ok: true, tight: false, unique: true}, + {filters: "t @@ '!a <-> !b'", ok: false, tight: false, unique: false}, + + // Some sanity checks for more than 2 terms, to make sure that the output + // de-uniqueifies as we travel up the tree with more than 1 lexeme seen. + {filters: "t @@ '(a & !b) | c'", ok: true, tight: false, unique: false}, + {filters: "t @@ '(a & b) | c'", ok: true, tight: true, unique: false}, + {filters: "t @@ '(a & b) <-> !(c | d)'", ok: true, tight: false, unique: true}, + } + + for _, tc := range testCases { + t.Logf("test case: %v", tc) + filters := testutils.BuildFilters(t, &f, &semaCtx, evalCtx, tc.filters) + + // We're not testing that the correct SpanExpression is returned here; + // that is tested elsewhere. This is just testing that we are constraining + // the index when we expect to and we have the correct values for tight, + // unique, and remainingFilters. + spanExpr, _, remainingFilters, _, ok := invertedidx.TryFilterInvertedIndex( + context.Background(), + evalCtx, + &f, + filters, + nil, /* optionalFilters */ + tab, + md.Table(tab).Index(tsvectorOrd), + nil, /* computedColumns */ + ) + if tc.ok != ok { + t.Fatalf("expected %v, got %v", tc.ok, ok) + } + if !ok { + continue + } + + if tc.tight != spanExpr.Tight { + t.Fatalf("For (%s), expected tight=%v, but got %v", tc.filters, tc.tight, spanExpr.Tight) + } + if tc.unique != spanExpr.Unique { + t.Fatalf("For (%s), expected unique=%v, but got %v", tc.filters, tc.unique, spanExpr.Unique) + } + + if tc.tight { + require.True(t, remainingFilters.IsTrue()) + } else { + require.Equal(t, filters.String(), remainingFilters.String(), + "mismatched remaining filters") + } + } +} diff --git a/pkg/sql/opt/xform/select_funcs.go b/pkg/sql/opt/xform/select_funcs.go index 0f625ca2ce25..e3de897cd584 100644 --- a/pkg/sql/opt/xform/select_funcs.go +++ b/pkg/sql/opt/xform/select_funcs.go @@ -933,6 +933,10 @@ func (c *CustomFuncs) GenerateInvertedIndexScans( // produce duplicate primary keys or requires at least one UNION or // INTERSECTION. In this case, we must scan both the primary key columns // and the inverted key column. + // The reason we also check !spanExpr.Unique here is that sometimes we + // eliminate the UNION operator in the tree, replacing it with a non-nil + // FactoredUnionSpans in the SpanExpression, and that case needs to be + // noticed and filtered. needInvertedFilter := !spanExpr.Unique || spanExpr.Operator != inverted.None newScanPrivate.Cols = pkCols.Copy() var invertedCol opt.ColumnID diff --git a/pkg/util/tsearch/BUILD.bazel b/pkg/util/tsearch/BUILD.bazel index beaf84a3e5b4..f2c99b8bb705 100644 --- a/pkg/util/tsearch/BUILD.bazel +++ b/pkg/util/tsearch/BUILD.bazel @@ -14,6 +14,8 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/util/tsearch", visibility = ["//visibility:public"], deps = [ + "//pkg/keysbase", + "//pkg/sql/inverted", "//pkg/sql/pgwire/pgcode", "//pkg/sql/pgwire/pgerror", "//pkg/util/encoding", @@ -32,6 +34,7 @@ go_test( args = ["-test.timeout=295s"], embed = [":tsearch"], deps = [ + "//pkg/sql/inverted", "//pkg/testutils/skip", "//pkg/util/randutil", "@com_github_jackc_pgx_v4//:pgx", diff --git a/pkg/util/tsearch/encoding.go b/pkg/util/tsearch/encoding.go index 3772386811cc..fd1b9173621b 100644 --- a/pkg/util/tsearch/encoding.go +++ b/pkg/util/tsearch/encoding.go @@ -460,12 +460,18 @@ func (c *tsNodeCodec) decodeTSNodePGBinary(b []byte) ([]byte, *tsNode, error) { // index key for the terms in this tsvector. func EncodeInvertedIndexKeys(inKey []byte, vector TSVector) ([][]byte, error) { outKeys := make([][]byte, 0, len(vector)) + // Note that by construction, TSVector contains only unique terms, so we don't + // need to de-duplicate terms when constructing the inverted index keys. for i := range vector { - l := vector[i].lexeme - outKey := make([]byte, len(inKey), len(inKey)+len(l)) - copy(outKey, inKey) - newKey := encoding.EncodeStringAscending(outKey, l) + newKey := EncodeInvertedIndexKey(inKey, vector[i].lexeme) outKeys = append(outKeys, newKey) } return outKeys, nil } + +// EncodeInvertedIndexKey returns the inverted index key for the input lexeme. +func EncodeInvertedIndexKey(inKey []byte, lexeme string) []byte { + outKey := make([]byte, len(inKey), len(inKey)+len(lexeme)) + copy(outKey, inKey) + return encoding.EncodeStringAscending(outKey, lexeme) +} diff --git a/pkg/util/tsearch/encoding_test.go b/pkg/util/tsearch/encoding_test.go index 7d0bfe120144..083a6f94748f 100644 --- a/pkg/util/tsearch/encoding_test.go +++ b/pkg/util/tsearch/encoding_test.go @@ -13,6 +13,7 @@ package tsearch import ( "testing" + "github.com/cockroachdb/cockroach/pkg/sql/inverted" "github.com/cockroachdb/cockroach/pkg/util/randutil" "github.com/stretchr/testify/assert" ) @@ -66,3 +67,144 @@ func TestRoundtripRandomTSQuery(t *testing.T) { assert.Equal(t, encoded, reEncoded) } } + +func TestEncodeTSQueryInvertedIndexSpans(t *testing.T) { + testCases := []struct { + vector string + query string + expected bool + tight bool + unique bool + }{ + // This test uses EncodeInvertedIndexKeys and + // GetInvertedExpr to determine whether the tsquery matches the tsvector. If + // the vector @@ query, expected is true. Otherwise expected is false. If + // the spans produced for contains are tight, tight is true. Otherwise tight + // is false. + // + // If GetInvertedExpr produces spans that are guaranteed not to + // contain duplicate primary keys, unique is true. Otherwise it is false. + {`a:2`, `a`, true, true, true}, + {`b:2`, `a`, false, true, true}, + + {`'foo'`, `'foo'`, true, true, true}, + + {`a:2`, `a & b`, false, true, true}, + {`a:1 b:2`, `a & b`, true, true, true}, + + {`a:2`, `a | b`, true, true, false}, + {`a:1 b:2`, `a | b`, true, true, false}, + {`c:1`, `a | b`, false, true, false}, + + {`a:1`, `a <-> b`, false, false, true}, + {`a:1 b:2`, `a <-> b`, true, false, true}, + {`a:1 b:3`, `a <-> b`, false, false, true}, + + {`a:1 b:2`, `a <-> (b|c)`, true, false, false}, + {`a:1 c:2`, `a <-> (b|c)`, true, false, false}, + {`a:1 d:2`, `a <-> (b|c)`, false, false, false}, + {`a:1 b:2`, `a <-> (!b|c)`, false, false, true}, + {`a:1 c:2`, `a <-> (!b|c)`, true, false, true}, + {`a:1 d:2`, `a <-> (!b|c)`, true, false, true}, + {`a:1 b:2`, `a <-> (b|!c)`, true, false, true}, + {`a:1 c:2`, `a <-> (b|!c)`, false, false, true}, + {`a:1 d:2`, `a <-> (b|!c)`, true, false, true}, + {`a:1 b:2`, `a <-> (!b|!c)`, true, false, true}, + {`a:1 c:2`, `a <-> (!b|!c)`, true, false, true}, + {`a:1 d:2`, `a <-> (!b|!c)`, true, false, true}, + {`a:1 b:2 c:3 d:4`, `a <-> ((b <-> c) | d)`, true, false, false}, + {`a:1 b:2 c:3 d:4`, `a <-> (b | (c <-> d))`, true, false, false}, + } + + // runTest checks that evaluating `left @@ right` using keys from + // EncodeInvertedIndexKeys and spans from GetInvertedExpr + // produces the expected result. + // returns tight=true if the spans from GetInvertedExpr + // were tight, and tight=false otherwise. + runTest := func(left TSVector, right TSQuery, expected, expectUnique bool) (tight bool) { + keys, err := EncodeInvertedIndexKeys(nil, left) + assert.NoError(t, err) + + invertedExpr, err := right.GetInvertedExpr() + assert.NoError(t, err) + + spanExpr, ok := invertedExpr.(*inverted.SpanExpression) + assert.True(t, ok) + + if spanExpr.Unique != expectUnique { + t.Errorf("For %s, expected unique=%v, but got %v", right, expectUnique, spanExpr.Unique) + } + + actual, err := spanExpr.ContainsKeys(keys) + assert.NoError(t, err) + + // There may be some false positives, so filter those out. + if actual && !spanExpr.Tight { + actual, err = EvalTSQuery(right, left) + assert.NoError(t, err) + } + + if actual != expected { + if expected { + t.Errorf("expected %s to match %s but it did not", left.String(), right.String()) + } else { + t.Errorf("expected %s not to match %s but it did", left.String(), right.String()) + } + } + + return spanExpr.Tight + } + + // Run pre-defined test cases from above. + for _, c := range testCases { + indexedValue, err := ParseTSVector(c.vector) + assert.NoError(t, err) + query, err := ParseTSQuery(c.query) + assert.NoError(t, err) + + // First check that evaluating `indexedValue @@ query` matches the expected + // result. + res, err := EvalTSQuery(query, indexedValue) + assert.NoError(t, err) + if res != c.expected { + t.Fatalf( + "expected value of %s @@ %s did not match actual value. Expected: %v. Got: %v", + c.vector, c.query, c.expected, res, + ) + } + + // Now check that we get the same result with the inverted index spans. + tight := runTest(indexedValue, query, c.expected, c.unique) + + // And check that the tightness matches the expected value. + if tight != c.tight { + if c.tight { + t.Errorf("expected spans for %s to be tight but they were not", c.query) + } else { + t.Errorf("expected spans for %s not to be tight but they were", c.query) + } + } + } + + // Run a set of randomly generated test cases. + rng, _ := randutil.NewTestRand() + for i := 0; i < 100; i++ { + // Generate a random query and vector and evaluate the result of `left @@ right`. + query := RandomTSQuery(rng) + vector := RandomTSVector(rng) + + res, err := EvalTSQuery(query, vector) + assert.NoError(t, err) + + invertedExpr, err := query.GetInvertedExpr() + if err != nil { + // We can't generate an inverted expression for this query, so there's + // nothing to test here. + continue + } + expectedUnique := invertedExpr.(*inverted.SpanExpression).Unique + + // Now check that we get the same result with the inverted index spans. + runTest(vector, query, res, expectedUnique) + } +} diff --git a/pkg/util/tsearch/random.go b/pkg/util/tsearch/random.go index 02107f8f1dd5..9fc8124a06e8 100644 --- a/pkg/util/tsearch/random.go +++ b/pkg/util/tsearch/random.go @@ -52,13 +52,38 @@ func RandomTSVector(rng *rand.Rand) TSVector { // RandomTSQuery returns a random TSQuery for testing. func RandomTSQuery(rng *rand.Rand) TSQuery { - // TODO(jordan): make this a more robust random query generator + // TODO(jordan): add parenthesis grouping to the random query generator + nTerms := 1 + rng.Intn(5) for { - l := make([]byte, 1+rng.Intn(10)) - for i := range l { - l[i] = alphabet[rng.Intn(len(alphabet))] + var sb strings.Builder + for i := 0; i < nTerms; i++ { + l := make([]byte, 1+rng.Intn(10)) + for i := range l { + l[i] = alphabet[rng.Intn(len(alphabet))] + } + if rng.Intn(4) == 0 { + // Make it a negation query! + sb.WriteString("!") + } + sb.Write(l) + sb.WriteString(" ") + if i < nTerms-1 { + infixOp := rng.Intn(3) + var opstr string + switch infixOp { + case 0: + opstr = "&" + case 1: + opstr = "|" + case 2: + opstr = "<->" + } + sb.WriteString(opstr) + sb.WriteString(" ") + } } - query, err := ParseTSQuery(string(l)) + + query, err := ParseTSQuery(sb.String()) if err != nil { continue } diff --git a/pkg/util/tsearch/tsquery.go b/pkg/util/tsearch/tsquery.go index 296247af6583..8b8610b59384 100644 --- a/pkg/util/tsearch/tsquery.go +++ b/pkg/util/tsearch/tsquery.go @@ -14,6 +14,8 @@ import ( "fmt" "strings" + "github.com/cockroachdb/cockroach/pkg/keysbase" + "github.com/cockroachdb/cockroach/pkg/sql/inverted" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/errors" @@ -169,6 +171,105 @@ func (q TSQuery) String() string { return q.root.String() } +// GetInvertedExpr returns the inverted expression that can be used to search +// an index. +func (q TSQuery) GetInvertedExpr() (expr inverted.Expression, err error) { + return q.root.getInvertedExpr() +} + +func (n *tsNode) getInvertedExpr() (inverted.Expression, error) { + switch n.op { + case invalid: + // We're looking at a lexeme match. + // There are 3 options: + // 1. Normal match. + // In this case, we make a tight and unique span. + // 2. Prefix match. + // In this case, we make a non-unique, tight span that starts with the + // prefix. + // 3. Weighted match. + // In this case, we make the match non-tight, because we don't store the + // weights of the lexemes in the index, and are forced to re-check + // once we get the result from the inverted index. + // Note that options 2 and 3 can both be present. + var weight tsWeight + if len(n.term.positions) > 0 { + weight = n.term.positions[0].weight + } + key := EncodeInvertedIndexKey(nil /* inKey */, n.term.lexeme) + var span inverted.Span + + prefixMatch := weight&weightStar != 0 + if prefixMatch { + span = inverted.Span{ + Start: key, + End: EncodeInvertedIndexKey(nil /* inKey */, string(keysbase.PrefixEnd([]byte(n.term.lexeme)))), + } + } else { + span = inverted.MakeSingleValSpan(key) + } + invertedExpr := inverted.ExprForSpan(span, true /* tight */) + if !prefixMatch { + // If we don't have a prefix match we also can set unique=true. + invertedExpr.Unique = true + } + + if weight != 0 && weight != weightStar { + // Some weights are set. + invertedExpr.SetNotTight() + } + return invertedExpr, nil + case followedby: + fallthrough + case and: + l, lErr := n.l.getInvertedExpr() + r, rErr := n.r.getInvertedExpr() + if lErr != nil && rErr != nil { + // We need a positive match on at least one side. + return nil, lErr + } else if lErr != nil { + // An error on one side means we have to re-check that side's condition + // later. + r.SetNotTight() + //nolint:returnerrcheck + return r, nil + } else if rErr != nil { + // Ditto above. + l.SetNotTight() + //nolint:returnerrcheck + return l, nil + } + expr := inverted.And(l, r) + if n.op == followedby { + // If we have a followedby match, we have to re-check the results of the + // match after we get them from the inverted index - just because both + // terms are present doesn't mean they're properly next to each other, + // and the index doesn't store position information at all. + expr.SetNotTight() + } + return expr, nil + case or: + l, lErr := n.l.getInvertedExpr() + r, rErr := n.r.getInvertedExpr() + if lErr != nil { + // We need a positive match on both sides, so we return an error here. + // For example, searching for a | !b would require a full scan, since some + // documents could match that contain neither a nor b. + return nil, lErr + } else if rErr != nil { + return nil, rErr + } + return inverted.Or(l, r), nil + case not: + // A not would require more advanced machinery than we have, so for now + // we'll just assume we can't perform an inverted expression search on a + // not. Note that a nested not would make it possible, but we are ignoring + // this case for now as it seems marginal. + return nil, errors.New("unable to create inverted expr for not") + } + return nil, errors.AssertionFailedf("invalid operator %d", n.op) +} + func lexTSQuery(input string) (TSVector, error) { parser := tsVectorLexer{ input: input,