From d07f41b46a95ed8c885c3ae61225f38e065fc77c Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Fri, 22 Jul 2022 23:35:06 -0400 Subject: [PATCH 1/2] sql: permit inverted indexes on tsvector This commit adds the ability to create an inverted index on a tsvector column. Release note: None --- docs/generated/sql/functions.md | 2 ++ pkg/sql/catalog/colinfo/col_type_info.go | 1 + pkg/sql/create_index.go | 6 ++++++ pkg/sql/rowenc/BUILD.bazel | 1 + pkg/sql/rowenc/encoded_datum.go | 2 +- pkg/sql/rowenc/index_encoding.go | 3 +++ pkg/sql/sem/builtins/builtins.go | 14 ++++++++++++++ pkg/sql/sem/builtins/fixed_oids.go | 1 + pkg/util/tsearch/encoding.go | 14 ++++++++++++++ 9 files changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/generated/sql/functions.md b/docs/generated/sql/functions.md index a626bc951097..cd172954169a 100644 --- a/docs/generated/sql/functions.md +++ b/docs/generated/sql/functions.md @@ -3197,6 +3197,8 @@ active for the current transaction.

Stable crdb_internal.num_inverted_index_entries(val: jsonb, version: int) → int

This function is used only by CockroachDB’s developers for testing purposes.

Stable +crdb_internal.num_inverted_index_entries(val: tsvector, version: int) → int

This function is used only by CockroachDB’s developers for testing purposes.

+
Stable crdb_internal.payloads_for_span(span_id: int) → tuple{string AS payload_type, jsonb AS payload_jsonb}

Returns the payload(s) of the requested span and all its children.

Volatile crdb_internal.payloads_for_trace(trace_id: int) → tuple{int AS span_id, string AS payload_type, jsonb AS payload_jsonb}

Returns the payload(s) of the requested trace.

diff --git a/pkg/sql/catalog/colinfo/col_type_info.go b/pkg/sql/catalog/colinfo/col_type_info.go index 92bc82ea9a16..98d21abdbc5e 100644 --- a/pkg/sql/catalog/colinfo/col_type_info.go +++ b/pkg/sql/catalog/colinfo/col_type_info.go @@ -165,6 +165,7 @@ func ColumnTypeIsOnlyInvertedIndexable(t *types.T) bool { case types.JsonFamily: case types.GeographyFamily: case types.GeometryFamily: + case types.TSVectorFamily: default: return false } diff --git a/pkg/sql/create_index.go b/pkg/sql/create_index.go index f7df6e09f826..42df6c594ccd 100644 --- a/pkg/sql/create_index.go +++ b/pkg/sql/create_index.go @@ -404,6 +404,12 @@ func populateInvertedIndexDescriptor( return newUndefinedOpclassError(invCol.OpClass) } indexDesc.InvertedColumnKinds[0] = catpb.InvertedIndexColumnKind_TRIGRAM + case types.TSVectorFamily: + switch invCol.OpClass { + case "tsvector_ops", "": + default: + return newUndefinedOpclassError(invCol.OpClass) + } default: return tabledesc.NewInvalidInvertedColumnError(column.GetName(), column.GetType().Name()) } diff --git a/pkg/sql/rowenc/BUILD.bazel b/pkg/sql/rowenc/BUILD.bazel index 1dca4c1a6bc9..0ba5b9748a94 100644 --- a/pkg/sql/rowenc/BUILD.bazel +++ b/pkg/sql/rowenc/BUILD.bazel @@ -39,6 +39,7 @@ go_library( "//pkg/util/mon", "//pkg/util/protoutil", "//pkg/util/trigram", + "//pkg/util/tsearch", "//pkg/util/unique", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_redact//:redact", diff --git a/pkg/sql/rowenc/encoded_datum.go b/pkg/sql/rowenc/encoded_datum.go index a68883c8dbe9..44386dfe1be3 100644 --- a/pkg/sql/rowenc/encoded_datum.go +++ b/pkg/sql/rowenc/encoded_datum.go @@ -324,7 +324,7 @@ func (ed *EncDatum) Fingerprint( var err error memUsageBefore := ed.Size() switch typ.Family() { - case types.JsonFamily: + case types.JsonFamily, types.TSVectorFamily: if err = ed.EnsureDecoded(typ, a); err != nil { return nil, err } diff --git a/pkg/sql/rowenc/index_encoding.go b/pkg/sql/rowenc/index_encoding.go index eaf59f1fe03b..2478092f4f08 100644 --- a/pkg/sql/rowenc/index_encoding.go +++ b/pkg/sql/rowenc/index_encoding.go @@ -38,6 +38,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/mon" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/trigram" + "github.com/cockroachdb/cockroach/pkg/util/tsearch" "github.com/cockroachdb/cockroach/pkg/util/unique" "github.com/cockroachdb/errors" ) @@ -605,6 +606,8 @@ func EncodeInvertedIndexTableKeys( // val could be a DOidWrapper, so we need to use the unwrapped datum // here. return encodeTrigramInvertedIndexTableKeys(string(*datum.(*tree.DString)), inKey, version, true /* pad */) + case types.TSVectorFamily: + return tsearch.EncodeInvertedIndexKeys(inKey, val.(*tree.DTSVector).TSVector) } return nil, errors.AssertionFailedf("trying to apply inverted index to unsupported type %s", datum.ResolvedType()) } diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 3eec3a410e2a..7cc613e81c46 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -5945,6 +5945,20 @@ value if you rely on the HLC for accuracy.`, Volatility: volatility.Stable, CalledOnNullInput: true, }, + tree.Overload{ + Types: tree.ParamTypes{ + {Name: "val", Typ: types.TSVector}, + {Name: "version", Typ: types.Int}, + }, + ReturnType: tree.FixedReturnType(types.Int), + Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { + val := args[0].(*tree.DTSVector) + return tree.NewDInt(tree.DInt(len(val.TSVector))), nil + }, + Info: "This function is used only by CockroachDB's developers for testing purposes.", + Volatility: volatility.Stable, + CalledOnNullInput: true, + }, ), // Returns true iff the current user has admin role. diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index ef171a208244..9c6035efa79f 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2043,6 +2043,7 @@ var builtinOidsArray = []string{ 2067: `crdb_internal.gen_rand_ident(name_pattern: string, count: int) -> string`, 2068: `crdb_internal.gen_rand_ident(name_pattern: string, count: int, parameters: jsonb) -> string`, 2069: `crdb_internal.create_tenant(parameters: jsonb) -> int`, + 2070: `crdb_internal.num_inverted_index_entries(val: tsvector, version: int) -> int`, } var builtinOidsBySignature map[string]oid.Oid diff --git a/pkg/util/tsearch/encoding.go b/pkg/util/tsearch/encoding.go index 3f898ad0e68e..3772386811cc 100644 --- a/pkg/util/tsearch/encoding.go +++ b/pkg/util/tsearch/encoding.go @@ -455,3 +455,17 @@ func (c *tsNodeCodec) decodeTSNodePGBinary(b []byte) ([]byte, *tsNode, error) { } return b, ret, nil } + +// EncodeInvertedIndexKeys returns a slice of byte slices, one per inverted +// index key for the terms in this tsvector. +func EncodeInvertedIndexKeys(inKey []byte, vector TSVector) ([][]byte, error) { + outKeys := make([][]byte, 0, len(vector)) + for i := range vector { + l := vector[i].lexeme + outKey := make([]byte, len(inKey), len(inKey)+len(l)) + copy(outKey, inKey) + newKey := encoding.EncodeStringAscending(outKey, l) + outKeys = append(outKeys, newKey) + } + return outKeys, nil +} From 5dd8ae2675952655d665239ded06a1d4b37698b1 Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Sun, 24 Jul 2022 21:51:02 -0400 Subject: [PATCH 2/2] sql: inv idx accelerate tsvector@@tsquery queries This commit adds inverted index acceleration for expressions that evaluate a tsquery against a tsvector using the `@@` operator. Release note (sql change): it's now possible to run efficient tsvector @@ tsquery searches when there is an inverted index on the tsvector column being searched. --- pkg/sql/inverted/expression.go | 16 +- .../logictest/testdata/logic_test/tsvector | 96 ++++++++++ .../exec/execbuilder/testdata/tsvector_index | 171 ++++++++++++++++++ .../execbuilder/tests/local/generated_test.go | 7 + pkg/sql/opt/invertedidx/BUILD.bazel | 2 + .../opt/invertedidx/inverted_index_expr.go | 13 +- pkg/sql/opt/invertedidx/tsearch.go | 87 +++++++++ pkg/sql/opt/invertedidx/tsearch_test.go | 132 ++++++++++++++ pkg/sql/opt/xform/select_funcs.go | 4 + pkg/util/tsearch/BUILD.bazel | 3 + pkg/util/tsearch/encoding.go | 14 +- pkg/util/tsearch/encoding_test.go | 142 +++++++++++++++ pkg/util/tsearch/random.go | 35 +++- pkg/util/tsearch/tsquery.go | 101 +++++++++++ 14 files changed, 807 insertions(+), 16 deletions(-) create mode 100644 pkg/sql/opt/exec/execbuilder/testdata/tsvector_index create mode 100644 pkg/sql/opt/invertedidx/tsearch.go create mode 100644 pkg/sql/opt/invertedidx/tsearch_test.go diff --git a/pkg/sql/inverted/expression.go b/pkg/sql/inverted/expression.go index 6d4ab4ac3e5d..8b83506b7c27 100644 --- a/pkg/sql/inverted/expression.go +++ b/pkg/sql/inverted/expression.go @@ -300,11 +300,17 @@ type SpanExpression struct { // Tight mirrors the definition of IsTight(). Tight bool - // Unique is true if the spans are guaranteed not to produce duplicate - // primary keys. Otherwise, Unique is false. Unique may be true for certain - // JSON or Array SpanExpressions, and it holds when unique SpanExpressions - // are combined with And. It does not hold when these SpanExpressions are - // combined with Or. + // Unique is true if the spans in FactoredUnionSpans are guaranteed not to + // produce duplicate primary keys. Otherwise, Unique is false. Unique may + // be true for certain JSON or Array SpanExpressions, and it holds when + // unique SpanExpressions are combined with And. It does not hold when + // these SpanExpressions are combined with Or. + // + // Once a SpanExpression is built, this field is relevant if the root + // SpanExpression has no children (i.e., Operator is None). In this case, + // Unique is used to determine whether an invertedFilter is needed on top + // of the inverted index scan to deduplicate keys (an invertedFilter is + // always necessary if Operator is not None). Unique bool // SpansToRead are the spans to read from the inverted index diff --git a/pkg/sql/logictest/testdata/logic_test/tsvector b/pkg/sql/logictest/testdata/logic_test/tsvector index 71398395526b..65ffb7d1428c 100644 --- a/pkg/sql/logictest/testdata/logic_test/tsvector +++ b/pkg/sql/logictest/testdata/logic_test/tsvector @@ -139,3 +139,99 @@ query T VALUES ( json_build_array($$'cat' & 'rat'$$:::TSQUERY)::JSONB) ---- ["'cat' & 'rat'"] + +# Test tsvector inverted indexes. +statement ok +DROP TABLE a; +CREATE TABLE a ( + a TSVECTOR, + b TSQUERY, + INVERTED INDEX(a) +); +INSERT INTO a VALUES('foo:3 bar:4,5'), ('baz:1'), ('foo:3'), ('bar:2') + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'foo' +---- +'bar':4,5 'foo':3 +'foo':3 + +statement error index \"a_a_idx\" is inverted and cannot be used for this query +SELECT a FROM a@a_a_idx WHERE a @@ '!foo' + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'foo' OR a @@ 'bar' +---- +'bar':4,5 'foo':3 +'foo':3 +'bar':2 + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'foo | bar' +---- +'bar':4,5 'foo':3 +'foo':3 +'bar':2 + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'foo | bar' OR a @@ 'baz' +---- +'bar':4,5 'foo':3 +'baz':1 +'foo':3 +'bar':2 + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'foo & bar' +---- +'bar':4,5 'foo':3 + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'foo <-> bar' +---- +'bar':4,5 'foo':3 + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'bar <-> foo' +---- + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'foo <-> !bar' +---- +'foo':3 + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ '!baz <-> bar' +---- +'bar':4,5 'foo':3 +'bar':2 + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'foo & !bar' +---- +'foo':3 + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'ba:*' +---- +'bar':4,5 'foo':3 +'baz':1 +'bar':2 + +query T rowsort +SELECT a FROM a@a_a_idx WHERE a @@ 'ba:* | foo' +---- +'bar':4,5 'foo':3 +'baz':1 +'foo':3 +'bar':2 + +query T +SELECT a FROM a@a_a_idx WHERE a @@ 'ba:* & foo' +---- +'bar':4,5 'foo':3 + +# Test that tsvector indexes can't accelerate the @@ operator with no constant +# columns. +statement error index \"a_a_idx\" is inverted and cannot be used for this query +EXPLAIN SELECT * FROM a@a_a_idx WHERE a @@ b diff --git a/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index b/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index new file mode 100644 index 000000000000..ee8b0bd9c576 --- /dev/null +++ b/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index @@ -0,0 +1,171 @@ +# LogicTest: local + +statement ok +CREATE TABLE a ( + a INT PRIMARY KEY, + b TSVECTOR, + c TSQUERY, + FAMILY (a,b,c), + INVERTED INDEX(b) +) + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'Foo' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo' OR b @@ 'bar' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar' OR b @@ 'baz' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 3 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 3 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & bar' +---- +distribution: local +vectorized: true +· +• lookup join +│ table: a@a_pkey +│ equality: (a) = (a) +│ equality cols are key +│ pred: b @@ '''foo'' & ''bar''' +│ +└── • zigzag join + left table: a@a_b_idx + left columns: (a, b_inverted_key) + left fixed values: 1 column + right table: a@a_b_idx + right columns: (a, b_inverted_key) + right fixed values: 1 column + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo <-> bar' +---- +distribution: local +vectorized: true +· +• lookup join +│ table: a@a_pkey +│ equality: (a) = (a) +│ equality cols are key +│ pred: b @@ '''foo'' <-> ''bar''' +│ +└── • zigzag join + left table: a@a_b_idx + left columns: (a, b_inverted_key) + left fixed values: 1 column + right table: a@a_b_idx + right columns: (a, b_inverted_key) + right fixed values: 1 column + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & !bar' +---- +distribution: local +vectorized: true +· +• filter +│ filter: b @@ '''foo'' & !''bar''' +│ +└── • index join + │ table: a@a_pkey + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 1 span + + +query T +EXPLAIN SELECT a FROM a@a_b_idx WHERE b @@ 'ba:*' +---- +distribution: local +vectorized: true +· +• inverted filter +│ inverted column: b_inverted_key +│ num spans: 1 +│ +└── • scan + missing stats + table: a@a_b_idx + spans: 1 span + + +# Test that tsvector indexes can't accelerate the @@ operator with no constant +# columns. +statement error index \"a_b_idx\" is inverted and cannot be used for this query +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ c diff --git a/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go b/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go index ffac05a8f9f0..a2832beb63b7 100644 --- a/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go +++ b/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go @@ -550,6 +550,13 @@ func TestExecBuild_trigram_index( runExecBuildLogicTest(t, "trigram_index") } +func TestExecBuild_tsvector_index( + t *testing.T, +) { + defer leaktest.AfterTest(t)() + runExecBuildLogicTest(t, "tsvector_index") +} + func TestExecBuild_tuple( t *testing.T, ) { diff --git a/pkg/sql/opt/invertedidx/BUILD.bazel b/pkg/sql/opt/invertedidx/BUILD.bazel index 8730f991ec61..7fa01ba26944 100644 --- a/pkg/sql/opt/invertedidx/BUILD.bazel +++ b/pkg/sql/opt/invertedidx/BUILD.bazel @@ -8,6 +8,7 @@ go_library( "inverted_index_expr.go", "json_array.go", "trigram.go", + "tsearch.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx", visibility = ["//visibility:public"], @@ -49,6 +50,7 @@ go_test( "geo_test.go", "json_array_test.go", "trigram_test.go", + "tsearch_test.go", ], args = ["-test.timeout=55s"], deps = [ diff --git a/pkg/sql/opt/invertedidx/inverted_index_expr.go b/pkg/sql/opt/invertedidx/inverted_index_expr.go index eec56289d01b..6db743491e87 100644 --- a/pkg/sql/opt/invertedidx/inverted_index_expr.go +++ b/pkg/sql/opt/invertedidx/inverted_index_expr.go @@ -116,18 +116,27 @@ func TryFilterInvertedIndex( } else { col := index.InvertedColumn().InvertedSourceColumnOrdinal() typ = factory.Metadata().Table(tabID).Column(col).DatumType() - if typ.Family() == types.StringFamily { + switch typ.Family() { + case types.StringFamily: filterPlanner = &trigramFilterPlanner{ tabID: tabID, index: index, computedColumns: computedColumns, } - } else { + case types.TSVectorFamily: + filterPlanner = &tsqueryFilterPlanner{ + tabID: tabID, + index: index, + computedColumns: computedColumns, + } + case types.JsonFamily, types.ArrayFamily: filterPlanner = &jsonOrArrayFilterPlanner{ tabID: tabID, index: index, computedColumns: computedColumns, } + default: + return nil, nil, nil, nil, false } } diff --git a/pkg/sql/opt/invertedidx/tsearch.go b/pkg/sql/opt/invertedidx/tsearch.go new file mode 100644 index 000000000000..5e4dc2a7d728 --- /dev/null +++ b/pkg/sql/opt/invertedidx/tsearch.go @@ -0,0 +1,87 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package invertedidx + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/sql/inverted" + "github.com/cockroachdb/cockroach/pkg/sql/opt" + "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" + "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedexpr" + "github.com/cockroachdb/cockroach/pkg/sql/opt/memo" + "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/types" + "github.com/cockroachdb/errors" +) + +type tsqueryFilterPlanner struct { + tabID opt.TableID + index cat.Index + computedColumns map[opt.ColumnID]opt.ScalarExpr +} + +var _ invertedFilterPlanner = &tsqueryFilterPlanner{} + +// extractInvertedFilterConditionFromLeaf implements the invertedFilterPlanner +// interface. +func (t *tsqueryFilterPlanner) extractInvertedFilterConditionFromLeaf( + _ context.Context, _ *eval.Context, expr opt.ScalarExpr, +) ( + invertedExpr inverted.Expression, + remainingFilters opt.ScalarExpr, + _ *invertedexpr.PreFiltererStateForInvertedFilterer, +) { + var constantVal opt.ScalarExpr + var left, right opt.ScalarExpr + switch e := expr.(type) { + case *memo.TSMatchesExpr: + left, right = e.Left, e.Right + default: + // Only the above types are supported. + return inverted.NonInvertedColExpression{}, expr, nil + } + if isIndexColumn(t.tabID, t.index, left, t.computedColumns) && memo.CanExtractConstDatum(right) { + constantVal = right + } else if isIndexColumn(t.tabID, t.index, right, t.computedColumns) && memo.CanExtractConstDatum(left) { + constantVal = left + } else { + // Can only accelerate with a single constant value. + return inverted.NonInvertedColExpression{}, expr, nil + } + d := memo.ExtractConstDatum(constantVal) + if d.ResolvedType() != types.TSQuery { + panic(errors.AssertionFailedf( + "trying to apply tsvector inverted index to unsupported type %s", d.ResolvedType(), + )) + } + q := d.(*tree.DTSQuery).TSQuery + var err error + invertedExpr, err = q.GetInvertedExpr() + if err != nil { + // An inverted expression could not be extracted. + return inverted.NonInvertedColExpression{}, expr, nil + } + + // If the extracted inverted expression is not tight then remaining filters + // must be applied after the inverted index scan. + // TODO(jordan): we could do better here by pruning terms that we successfully + // turn into inverted expressions during the tsquery tree walk. We'd need to + // implement a function that removes a term from a tsquery tree. + if !invertedExpr.IsTight() { + remainingFilters = expr + } + + // We do not currently support pre-filtering for text search indexes, so + // the returned pre-filter state is nil. + return invertedExpr, remainingFilters, nil +} diff --git a/pkg/sql/opt/invertedidx/tsearch_test.go b/pkg/sql/opt/invertedidx/tsearch_test.go new file mode 100644 index 000000000000..51aa8628fa3b --- /dev/null +++ b/pkg/sql/opt/invertedidx/tsearch_test.go @@ -0,0 +1,132 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package invertedidx_test + +import ( + "context" + "testing" + + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx" + "github.com/cockroachdb/cockroach/pkg/sql/opt/norm" + "github.com/cockroachdb/cockroach/pkg/sql/opt/testutils" + "github.com/cockroachdb/cockroach/pkg/sql/opt/testutils/testcat" + "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/stretchr/testify/require" +) + +func TestTryFilterTSVector(t *testing.T) { + semaCtx := tree.MakeSemaContext() + st := cluster.MakeTestingClusterSettings() + evalCtx := eval.NewTestingEvalContext(st) + + tc := testcat.New() + if _, err := tc.ExecuteDDL( + "CREATE TABLE t (t tsvector, INVERTED INDEX (t))", + ); err != nil { + t.Fatal(err) + } + var f norm.Factory + f.Init(context.Background(), evalCtx, tc) + md := f.Metadata() + tn := tree.NewUnqualifiedTableName("t") + tab := md.AddTable(tc.Table(tn), tn) + tsvectorOrd := 1 + + // If we can create an inverted filter with the given filter expression and + // index, ok=true. If the spans in the resulting inverted index constraint + // do not have duplicate primary keys, unique=true. If the spans are tight, + // tight=true and remainingFilters="". Otherwise, tight is false and + // remainingFilters contains some or all of the original filters. + testCases := []struct { + filters string + ok bool + tight bool + unique bool + }{ + {filters: "t @@ 'a'", ok: true, tight: true, unique: true}, + {filters: "t @@ '!a'", ok: false, tight: false, unique: false}, + // Prefix match. + {filters: "t @@ 'a:*'", ok: true, tight: true, unique: false}, + // Weight match. + {filters: "t @@ 'a:C'", ok: true, tight: false, unique: true}, + // Weight and prefix match. + {filters: "t @@ 'a:C*'", ok: true, tight: false, unique: false}, + + {filters: "t @@ 'a | b'", ok: true, tight: true, unique: false}, + {filters: "t @@ 'a & b'", ok: true, tight: true, unique: true}, + {filters: "t @@ 'a <-> b'", ok: true, tight: false, unique: true}, + + // Can't filter with ! in an or clause. + {filters: "t @@ '!a | b'", ok: false, tight: false, unique: false}, + {filters: "t @@ 'a | !b'", ok: false, tight: false, unique: false}, + {filters: "t @@ '!a | !b'", ok: false, tight: false, unique: false}, + + // ! in an and clause is okay - we just re-filter on the ! term. + {filters: "t @@ 'a & !b'", ok: true, tight: false, unique: true}, + {filters: "t @@ '!a & b'", ok: true, tight: false, unique: true}, + // But not if both are !. + {filters: "t @@ '!a & !b'", ok: false, tight: false, unique: false}, + + // Same as above, except <-> matches are never tight - they always require + // re-checking because the index doesn't store the lexeme positions. + {filters: "t @@ 'a <-> !b'", ok: true, tight: false, unique: true}, + {filters: "t @@ '!a <-> b'", ok: true, tight: false, unique: true}, + {filters: "t @@ '!a <-> !b'", ok: false, tight: false, unique: false}, + + // Some sanity checks for more than 2 terms, to make sure that the output + // de-uniqueifies as we travel up the tree with more than 1 lexeme seen. + {filters: "t @@ '(a & !b) | c'", ok: true, tight: false, unique: false}, + {filters: "t @@ '(a & b) | c'", ok: true, tight: true, unique: false}, + {filters: "t @@ '(a & b) <-> !(c | d)'", ok: true, tight: false, unique: true}, + } + + for _, tc := range testCases { + t.Logf("test case: %v", tc) + filters := testutils.BuildFilters(t, &f, &semaCtx, evalCtx, tc.filters) + + // We're not testing that the correct SpanExpression is returned here; + // that is tested elsewhere. This is just testing that we are constraining + // the index when we expect to and we have the correct values for tight, + // unique, and remainingFilters. + spanExpr, _, remainingFilters, _, ok := invertedidx.TryFilterInvertedIndex( + context.Background(), + evalCtx, + &f, + filters, + nil, /* optionalFilters */ + tab, + md.Table(tab).Index(tsvectorOrd), + nil, /* computedColumns */ + ) + if tc.ok != ok { + t.Fatalf("expected %v, got %v", tc.ok, ok) + } + if !ok { + continue + } + + if tc.tight != spanExpr.Tight { + t.Fatalf("For (%s), expected tight=%v, but got %v", tc.filters, tc.tight, spanExpr.Tight) + } + if tc.unique != spanExpr.Unique { + t.Fatalf("For (%s), expected unique=%v, but got %v", tc.filters, tc.unique, spanExpr.Unique) + } + + if tc.tight { + require.True(t, remainingFilters.IsTrue()) + } else { + require.Equal(t, filters.String(), remainingFilters.String(), + "mismatched remaining filters") + } + } +} diff --git a/pkg/sql/opt/xform/select_funcs.go b/pkg/sql/opt/xform/select_funcs.go index 0f625ca2ce25..e3de897cd584 100644 --- a/pkg/sql/opt/xform/select_funcs.go +++ b/pkg/sql/opt/xform/select_funcs.go @@ -933,6 +933,10 @@ func (c *CustomFuncs) GenerateInvertedIndexScans( // produce duplicate primary keys or requires at least one UNION or // INTERSECTION. In this case, we must scan both the primary key columns // and the inverted key column. + // The reason we also check !spanExpr.Unique here is that sometimes we + // eliminate the UNION operator in the tree, replacing it with a non-nil + // FactoredUnionSpans in the SpanExpression, and that case needs to be + // noticed and filtered. needInvertedFilter := !spanExpr.Unique || spanExpr.Operator != inverted.None newScanPrivate.Cols = pkCols.Copy() var invertedCol opt.ColumnID diff --git a/pkg/util/tsearch/BUILD.bazel b/pkg/util/tsearch/BUILD.bazel index beaf84a3e5b4..f2c99b8bb705 100644 --- a/pkg/util/tsearch/BUILD.bazel +++ b/pkg/util/tsearch/BUILD.bazel @@ -14,6 +14,8 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/util/tsearch", visibility = ["//visibility:public"], deps = [ + "//pkg/keysbase", + "//pkg/sql/inverted", "//pkg/sql/pgwire/pgcode", "//pkg/sql/pgwire/pgerror", "//pkg/util/encoding", @@ -32,6 +34,7 @@ go_test( args = ["-test.timeout=295s"], embed = [":tsearch"], deps = [ + "//pkg/sql/inverted", "//pkg/testutils/skip", "//pkg/util/randutil", "@com_github_jackc_pgx_v4//:pgx", diff --git a/pkg/util/tsearch/encoding.go b/pkg/util/tsearch/encoding.go index 3772386811cc..fd1b9173621b 100644 --- a/pkg/util/tsearch/encoding.go +++ b/pkg/util/tsearch/encoding.go @@ -460,12 +460,18 @@ func (c *tsNodeCodec) decodeTSNodePGBinary(b []byte) ([]byte, *tsNode, error) { // index key for the terms in this tsvector. func EncodeInvertedIndexKeys(inKey []byte, vector TSVector) ([][]byte, error) { outKeys := make([][]byte, 0, len(vector)) + // Note that by construction, TSVector contains only unique terms, so we don't + // need to de-duplicate terms when constructing the inverted index keys. for i := range vector { - l := vector[i].lexeme - outKey := make([]byte, len(inKey), len(inKey)+len(l)) - copy(outKey, inKey) - newKey := encoding.EncodeStringAscending(outKey, l) + newKey := EncodeInvertedIndexKey(inKey, vector[i].lexeme) outKeys = append(outKeys, newKey) } return outKeys, nil } + +// EncodeInvertedIndexKey returns the inverted index key for the input lexeme. +func EncodeInvertedIndexKey(inKey []byte, lexeme string) []byte { + outKey := make([]byte, len(inKey), len(inKey)+len(lexeme)) + copy(outKey, inKey) + return encoding.EncodeStringAscending(outKey, lexeme) +} diff --git a/pkg/util/tsearch/encoding_test.go b/pkg/util/tsearch/encoding_test.go index 7d0bfe120144..083a6f94748f 100644 --- a/pkg/util/tsearch/encoding_test.go +++ b/pkg/util/tsearch/encoding_test.go @@ -13,6 +13,7 @@ package tsearch import ( "testing" + "github.com/cockroachdb/cockroach/pkg/sql/inverted" "github.com/cockroachdb/cockroach/pkg/util/randutil" "github.com/stretchr/testify/assert" ) @@ -66,3 +67,144 @@ func TestRoundtripRandomTSQuery(t *testing.T) { assert.Equal(t, encoded, reEncoded) } } + +func TestEncodeTSQueryInvertedIndexSpans(t *testing.T) { + testCases := []struct { + vector string + query string + expected bool + tight bool + unique bool + }{ + // This test uses EncodeInvertedIndexKeys and + // GetInvertedExpr to determine whether the tsquery matches the tsvector. If + // the vector @@ query, expected is true. Otherwise expected is false. If + // the spans produced for contains are tight, tight is true. Otherwise tight + // is false. + // + // If GetInvertedExpr produces spans that are guaranteed not to + // contain duplicate primary keys, unique is true. Otherwise it is false. + {`a:2`, `a`, true, true, true}, + {`b:2`, `a`, false, true, true}, + + {`'foo'`, `'foo'`, true, true, true}, + + {`a:2`, `a & b`, false, true, true}, + {`a:1 b:2`, `a & b`, true, true, true}, + + {`a:2`, `a | b`, true, true, false}, + {`a:1 b:2`, `a | b`, true, true, false}, + {`c:1`, `a | b`, false, true, false}, + + {`a:1`, `a <-> b`, false, false, true}, + {`a:1 b:2`, `a <-> b`, true, false, true}, + {`a:1 b:3`, `a <-> b`, false, false, true}, + + {`a:1 b:2`, `a <-> (b|c)`, true, false, false}, + {`a:1 c:2`, `a <-> (b|c)`, true, false, false}, + {`a:1 d:2`, `a <-> (b|c)`, false, false, false}, + {`a:1 b:2`, `a <-> (!b|c)`, false, false, true}, + {`a:1 c:2`, `a <-> (!b|c)`, true, false, true}, + {`a:1 d:2`, `a <-> (!b|c)`, true, false, true}, + {`a:1 b:2`, `a <-> (b|!c)`, true, false, true}, + {`a:1 c:2`, `a <-> (b|!c)`, false, false, true}, + {`a:1 d:2`, `a <-> (b|!c)`, true, false, true}, + {`a:1 b:2`, `a <-> (!b|!c)`, true, false, true}, + {`a:1 c:2`, `a <-> (!b|!c)`, true, false, true}, + {`a:1 d:2`, `a <-> (!b|!c)`, true, false, true}, + {`a:1 b:2 c:3 d:4`, `a <-> ((b <-> c) | d)`, true, false, false}, + {`a:1 b:2 c:3 d:4`, `a <-> (b | (c <-> d))`, true, false, false}, + } + + // runTest checks that evaluating `left @@ right` using keys from + // EncodeInvertedIndexKeys and spans from GetInvertedExpr + // produces the expected result. + // returns tight=true if the spans from GetInvertedExpr + // were tight, and tight=false otherwise. + runTest := func(left TSVector, right TSQuery, expected, expectUnique bool) (tight bool) { + keys, err := EncodeInvertedIndexKeys(nil, left) + assert.NoError(t, err) + + invertedExpr, err := right.GetInvertedExpr() + assert.NoError(t, err) + + spanExpr, ok := invertedExpr.(*inverted.SpanExpression) + assert.True(t, ok) + + if spanExpr.Unique != expectUnique { + t.Errorf("For %s, expected unique=%v, but got %v", right, expectUnique, spanExpr.Unique) + } + + actual, err := spanExpr.ContainsKeys(keys) + assert.NoError(t, err) + + // There may be some false positives, so filter those out. + if actual && !spanExpr.Tight { + actual, err = EvalTSQuery(right, left) + assert.NoError(t, err) + } + + if actual != expected { + if expected { + t.Errorf("expected %s to match %s but it did not", left.String(), right.String()) + } else { + t.Errorf("expected %s not to match %s but it did", left.String(), right.String()) + } + } + + return spanExpr.Tight + } + + // Run pre-defined test cases from above. + for _, c := range testCases { + indexedValue, err := ParseTSVector(c.vector) + assert.NoError(t, err) + query, err := ParseTSQuery(c.query) + assert.NoError(t, err) + + // First check that evaluating `indexedValue @@ query` matches the expected + // result. + res, err := EvalTSQuery(query, indexedValue) + assert.NoError(t, err) + if res != c.expected { + t.Fatalf( + "expected value of %s @@ %s did not match actual value. Expected: %v. Got: %v", + c.vector, c.query, c.expected, res, + ) + } + + // Now check that we get the same result with the inverted index spans. + tight := runTest(indexedValue, query, c.expected, c.unique) + + // And check that the tightness matches the expected value. + if tight != c.tight { + if c.tight { + t.Errorf("expected spans for %s to be tight but they were not", c.query) + } else { + t.Errorf("expected spans for %s not to be tight but they were", c.query) + } + } + } + + // Run a set of randomly generated test cases. + rng, _ := randutil.NewTestRand() + for i := 0; i < 100; i++ { + // Generate a random query and vector and evaluate the result of `left @@ right`. + query := RandomTSQuery(rng) + vector := RandomTSVector(rng) + + res, err := EvalTSQuery(query, vector) + assert.NoError(t, err) + + invertedExpr, err := query.GetInvertedExpr() + if err != nil { + // We can't generate an inverted expression for this query, so there's + // nothing to test here. + continue + } + expectedUnique := invertedExpr.(*inverted.SpanExpression).Unique + + // Now check that we get the same result with the inverted index spans. + runTest(vector, query, res, expectedUnique) + } +} diff --git a/pkg/util/tsearch/random.go b/pkg/util/tsearch/random.go index 02107f8f1dd5..9fc8124a06e8 100644 --- a/pkg/util/tsearch/random.go +++ b/pkg/util/tsearch/random.go @@ -52,13 +52,38 @@ func RandomTSVector(rng *rand.Rand) TSVector { // RandomTSQuery returns a random TSQuery for testing. func RandomTSQuery(rng *rand.Rand) TSQuery { - // TODO(jordan): make this a more robust random query generator + // TODO(jordan): add parenthesis grouping to the random query generator + nTerms := 1 + rng.Intn(5) for { - l := make([]byte, 1+rng.Intn(10)) - for i := range l { - l[i] = alphabet[rng.Intn(len(alphabet))] + var sb strings.Builder + for i := 0; i < nTerms; i++ { + l := make([]byte, 1+rng.Intn(10)) + for i := range l { + l[i] = alphabet[rng.Intn(len(alphabet))] + } + if rng.Intn(4) == 0 { + // Make it a negation query! + sb.WriteString("!") + } + sb.Write(l) + sb.WriteString(" ") + if i < nTerms-1 { + infixOp := rng.Intn(3) + var opstr string + switch infixOp { + case 0: + opstr = "&" + case 1: + opstr = "|" + case 2: + opstr = "<->" + } + sb.WriteString(opstr) + sb.WriteString(" ") + } } - query, err := ParseTSQuery(string(l)) + + query, err := ParseTSQuery(sb.String()) if err != nil { continue } diff --git a/pkg/util/tsearch/tsquery.go b/pkg/util/tsearch/tsquery.go index 296247af6583..8b8610b59384 100644 --- a/pkg/util/tsearch/tsquery.go +++ b/pkg/util/tsearch/tsquery.go @@ -14,6 +14,8 @@ import ( "fmt" "strings" + "github.com/cockroachdb/cockroach/pkg/keysbase" + "github.com/cockroachdb/cockroach/pkg/sql/inverted" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/errors" @@ -169,6 +171,105 @@ func (q TSQuery) String() string { return q.root.String() } +// GetInvertedExpr returns the inverted expression that can be used to search +// an index. +func (q TSQuery) GetInvertedExpr() (expr inverted.Expression, err error) { + return q.root.getInvertedExpr() +} + +func (n *tsNode) getInvertedExpr() (inverted.Expression, error) { + switch n.op { + case invalid: + // We're looking at a lexeme match. + // There are 3 options: + // 1. Normal match. + // In this case, we make a tight and unique span. + // 2. Prefix match. + // In this case, we make a non-unique, tight span that starts with the + // prefix. + // 3. Weighted match. + // In this case, we make the match non-tight, because we don't store the + // weights of the lexemes in the index, and are forced to re-check + // once we get the result from the inverted index. + // Note that options 2 and 3 can both be present. + var weight tsWeight + if len(n.term.positions) > 0 { + weight = n.term.positions[0].weight + } + key := EncodeInvertedIndexKey(nil /* inKey */, n.term.lexeme) + var span inverted.Span + + prefixMatch := weight&weightStar != 0 + if prefixMatch { + span = inverted.Span{ + Start: key, + End: EncodeInvertedIndexKey(nil /* inKey */, string(keysbase.PrefixEnd([]byte(n.term.lexeme)))), + } + } else { + span = inverted.MakeSingleValSpan(key) + } + invertedExpr := inverted.ExprForSpan(span, true /* tight */) + if !prefixMatch { + // If we don't have a prefix match we also can set unique=true. + invertedExpr.Unique = true + } + + if weight != 0 && weight != weightStar { + // Some weights are set. + invertedExpr.SetNotTight() + } + return invertedExpr, nil + case followedby: + fallthrough + case and: + l, lErr := n.l.getInvertedExpr() + r, rErr := n.r.getInvertedExpr() + if lErr != nil && rErr != nil { + // We need a positive match on at least one side. + return nil, lErr + } else if lErr != nil { + // An error on one side means we have to re-check that side's condition + // later. + r.SetNotTight() + //nolint:returnerrcheck + return r, nil + } else if rErr != nil { + // Ditto above. + l.SetNotTight() + //nolint:returnerrcheck + return l, nil + } + expr := inverted.And(l, r) + if n.op == followedby { + // If we have a followedby match, we have to re-check the results of the + // match after we get them from the inverted index - just because both + // terms are present doesn't mean they're properly next to each other, + // and the index doesn't store position information at all. + expr.SetNotTight() + } + return expr, nil + case or: + l, lErr := n.l.getInvertedExpr() + r, rErr := n.r.getInvertedExpr() + if lErr != nil { + // We need a positive match on both sides, so we return an error here. + // For example, searching for a | !b would require a full scan, since some + // documents could match that contain neither a nor b. + return nil, lErr + } else if rErr != nil { + return nil, rErr + } + return inverted.Or(l, r), nil + case not: + // A not would require more advanced machinery than we have, so for now + // we'll just assume we can't perform an inverted expression search on a + // not. Note that a nested not would make it possible, but we are ignoring + // this case for now as it seems marginal. + return nil, errors.New("unable to create inverted expr for not") + } + return nil, errors.AssertionFailedf("invalid operator %d", n.op) +} + func lexTSQuery(input string) (TSVector, error) { parser := tsVectorLexer{ input: input,