From 50871a3a302675ebfe25486b58cc1118c811462b Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Sun, 24 Jul 2022 21:51:02 -0400 Subject: [PATCH] sql: inv idx accelerate tsvector@@tsquery queries This commit adds inverted index acceleration for expressions that evaluate a tsquery against a tsvector using the `@@` operator. Release note (sql change): it's now possible to run efficient tsvector @@ tsquery searches when there is an inverted index on the tsvector column being searched. --- docs/generated/sql/functions.md | 2 + .../exec/execbuilder/testdata/tsvector_index | 159 ++++++++++++++++++ .../execbuilder/tests/local/generated_test.go | 7 + pkg/sql/opt/invertedidx/BUILD.bazel | 1 + .../opt/invertedidx/inverted_index_expr.go | 13 +- pkg/sql/opt/invertedidx/tsearch.go | 84 +++++++++ pkg/sql/sem/builtins/builtins.go | 4 +- pkg/sql/sem/builtins/fixed_oids.go | 2 +- pkg/util/tsearch/BUILD.bazel | 1 + pkg/util/tsearch/encoding.go | 12 +- pkg/util/tsearch/tsquery.go | 49 ++++++ 11 files changed, 325 insertions(+), 9 deletions(-) create mode 100644 pkg/sql/opt/exec/execbuilder/testdata/tsvector_index create mode 100644 pkg/sql/opt/invertedidx/tsearch.go diff --git a/docs/generated/sql/functions.md b/docs/generated/sql/functions.md index 0f39d4b2e180..a1a518796636 100644 --- a/docs/generated/sql/functions.md +++ b/docs/generated/sql/functions.md @@ -3119,6 +3119,8 @@ active for the current transaction.

Stable crdb_internal.num_inverted_index_entries(val: jsonb, version: int) → int

This function is used only by CockroachDB’s developers for testing purposes.

Stable +crdb_internal.num_inverted_index_entries(val: tsvector, version: int) → int

This function is used only by CockroachDB’s developers for testing purposes.

+
Stable crdb_internal.payloads_for_span(span_id: int) → tuple{string AS payload_type, jsonb AS payload_jsonb}

Returns the payload(s) of the requested span and all its children.

Volatile crdb_internal.payloads_for_trace(trace_id: int) → tuple{int AS span_id, string AS payload_type, jsonb AS payload_jsonb}

Returns the payload(s) of the requested trace.

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index b/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index new file mode 100644 index 000000000000..d29f6b4cdbc6 --- /dev/null +++ b/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index @@ -0,0 +1,159 @@ +# LogicTest: local + +statement ok +CREATE TABLE a ( + a INT PRIMARY KEY, + b TSVECTOR, + c TSQUERY, + FAMILY (a,b,c), + INVERTED INDEX(b) +) + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 1 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'Foo' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 1 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo' OR b @@ 'bar' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar' OR b @@ 'baz' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 3 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 3 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & bar' +---- +distribution: local 
+vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo <-> bar' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 2 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 2 spans + +query T +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & !bar' +---- +distribution: local +vectorized: true +· +• index join +│ table: a@a_pkey +│ +└── • inverted filter + │ inverted column: b_inverted_key + │ num spans: 1 + │ + └── • scan + missing stats + table: a@a_b_idx + spans: 1 span + +# Test that tsvector indexes can't accelerate the @@ operator with no constant +# columns. +statement error index \"a_b_idx\" is inverted and cannot be used for this query +EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ c diff --git a/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go b/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go index cf359e78f647..93d3009b4832 100644 --- a/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go +++ b/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go @@ -536,6 +536,13 @@ func TestExecBuild_trigram_index( runExecBuildLogicTest(t, "trigram_index") } +func TestExecBuild_tsvector_index( + t *testing.T, +) { + defer leaktest.AfterTest(t)() + runExecBuildLogicTest(t, "tsvector_index") +} + func TestExecBuild_tuple( t *testing.T, ) { diff --git a/pkg/sql/opt/invertedidx/BUILD.bazel b/pkg/sql/opt/invertedidx/BUILD.bazel index 8730f991ec61..5ca9cee91e08 100644 --- a/pkg/sql/opt/invertedidx/BUILD.bazel +++ b/pkg/sql/opt/invertedidx/BUILD.bazel @@ -8,6 +8,7 @@ go_library( "inverted_index_expr.go", "json_array.go", "trigram.go", + "tsearch.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx", 
visibility = ["//visibility:public"], diff --git a/pkg/sql/opt/invertedidx/inverted_index_expr.go b/pkg/sql/opt/invertedidx/inverted_index_expr.go index eec56289d01b..6db743491e87 100644 --- a/pkg/sql/opt/invertedidx/inverted_index_expr.go +++ b/pkg/sql/opt/invertedidx/inverted_index_expr.go @@ -116,18 +116,27 @@ func TryFilterInvertedIndex( } else { col := index.InvertedColumn().InvertedSourceColumnOrdinal() typ = factory.Metadata().Table(tabID).Column(col).DatumType() - if typ.Family() == types.StringFamily { + switch typ.Family() { + case types.StringFamily: filterPlanner = &trigramFilterPlanner{ tabID: tabID, index: index, computedColumns: computedColumns, } - } else { + case types.TSVectorFamily: + filterPlanner = &tsqueryFilterPlanner{ + tabID: tabID, + index: index, + computedColumns: computedColumns, + } + case types.JsonFamily, types.ArrayFamily: filterPlanner = &jsonOrArrayFilterPlanner{ tabID: tabID, index: index, computedColumns: computedColumns, } + default: + return nil, nil, nil, nil, false } } diff --git a/pkg/sql/opt/invertedidx/tsearch.go b/pkg/sql/opt/invertedidx/tsearch.go new file mode 100644 index 000000000000..fbdb86b2b242 --- /dev/null +++ b/pkg/sql/opt/invertedidx/tsearch.go @@ -0,0 +1,84 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. 
+ +package invertedidx + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/sql/inverted" + "github.com/cockroachdb/cockroach/pkg/sql/opt" + "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" + "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedexpr" + "github.com/cockroachdb/cockroach/pkg/sql/opt/memo" + "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/types" + "github.com/cockroachdb/errors" +) + +type tsqueryFilterPlanner struct { + tabID opt.TableID + index cat.Index + computedColumns map[opt.ColumnID]opt.ScalarExpr +} + +var _ invertedFilterPlanner = &tsqueryFilterPlanner{} + +// extractInvertedFilterConditionFromLeaf implements the invertedFilterPlanner +// interface. +func (t *tsqueryFilterPlanner) extractInvertedFilterConditionFromLeaf( + _ context.Context, _ *eval.Context, expr opt.ScalarExpr, +) ( + invertedExpr inverted.Expression, + remainingFilters opt.ScalarExpr, + _ *invertedexpr.PreFiltererStateForInvertedFilterer, +) { + var constantVal opt.ScalarExpr + var left, right opt.ScalarExpr + switch e := expr.(type) { + case *memo.TSMatchesExpr: + left, right = e.Left, e.Right + default: + // Only the above types are supported. + return inverted.NonInvertedColExpression{}, expr, nil + } + if isIndexColumn(t.tabID, t.index, left, t.computedColumns) && memo.CanExtractConstDatum(right) { + constantVal = right + } else if isIndexColumn(t.tabID, t.index, right, t.computedColumns) && memo.CanExtractConstDatum(left) { + constantVal = left + } else { + // Can only accelerate with a single constant value. 
+ return inverted.NonInvertedColExpression{}, expr, nil + } + d := memo.ExtractConstDatum(constantVal) + if d.ResolvedType() != types.TSQuery { + panic(errors.AssertionFailedf( + "trying to apply tsvector inverted index to unsupported type %s", d.ResolvedType(), + )) + } + q := d.(*tree.DTSQuery).TSQuery + var err error + invertedExpr, err = q.GetInvertedExpr() + if err != nil { + // An inverted expression could not be extracted. + return inverted.NonInvertedColExpression{}, expr, nil + } + + // If the extracted inverted expression is not tight then remaining filters + // must be applied after the inverted index scan. + if !invertedExpr.IsTight() { + remainingFilters = expr + } + + // We do not currently support pre-filtering for text search indexes, so + // the returned pre-filter state is nil. + return invertedExpr, remainingFilters, nil +} diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 1985ac0d3a4a..25d0871b771c 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -5924,8 +5924,8 @@ value if you rely on the HLC for accuracy.`, }, tree.Overload{ Types: tree.ParamTypes{ - {"val", types.TSVector}, - {"version", types.Int}, + {Name: "val", Typ: types.TSVector}, + {Name: "version", Typ: types.Int}, }, ReturnType: tree.FixedReturnType(types.Int), Fn: func(ctx *eval.Context, args tree.Datums) (tree.Datum, error) { diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index 38f15bb26a83..e34b8c674011 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -443,7 +443,7 @@ var builtinOidsBySignature = map[string]oid.Oid{ `crdb_internal.num_inverted_index_entries(val: jsonb, version: int) -> int`: 1336, `crdb_internal.num_inverted_index_entries(val: string, version: int) -> int`: 1337, `crdb_internal.num_inverted_index_entries(val: anyelement[], version: int) -> int`: 1338, - `crdb_internal.num_inverted_index_entries(val: tsvector, 
version: int) -> int`: 2055, + `crdb_internal.num_inverted_index_entries(val: tsvector, version: int) -> int`: 2061, `crdb_internal.payloads_for_span(span_id: int) -> tuple{string AS payload_type, jsonb AS payload_jsonb}`: 349, `crdb_internal.payloads_for_trace(trace_id: int) -> tuple{int AS span_id, string AS payload_type, jsonb AS payload_jsonb}`: 350, `crdb_internal.pb_to_json(pbname: string, data: bytes) -> jsonb`: 1270, diff --git a/pkg/util/tsearch/BUILD.bazel b/pkg/util/tsearch/BUILD.bazel index beaf84a3e5b4..10c5e1152fae 100644 --- a/pkg/util/tsearch/BUILD.bazel +++ b/pkg/util/tsearch/BUILD.bazel @@ -14,6 +14,7 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/util/tsearch", visibility = ["//visibility:public"], deps = [ + "//pkg/sql/inverted", "//pkg/sql/pgwire/pgcode", "//pkg/sql/pgwire/pgerror", "//pkg/util/encoding", diff --git a/pkg/util/tsearch/encoding.go b/pkg/util/tsearch/encoding.go index 3772386811cc..224e16e55efd 100644 --- a/pkg/util/tsearch/encoding.go +++ b/pkg/util/tsearch/encoding.go @@ -461,11 +461,15 @@ func (c *tsNodeCodec) decodeTSNodePGBinary(b []byte) ([]byte, *tsNode, error) { func EncodeInvertedIndexKeys(inKey []byte, vector TSVector) ([][]byte, error) { outKeys := make([][]byte, 0, len(vector)) for i := range vector { - l := vector[i].lexeme - outKey := make([]byte, len(inKey), len(inKey)+len(l)) - copy(outKey, inKey) - newKey := encoding.EncodeStringAscending(outKey, l) + newKey := EncodeInvertedIndexKey(inKey, vector[i].lexeme) outKeys = append(outKeys, newKey) } return outKeys, nil } + +// EncodeInvertedIndexKey returns the inverted index key for the input lexeme. 
+func EncodeInvertedIndexKey(inKey []byte, lexeme string) []byte { + outKey := make([]byte, len(inKey), len(inKey)+len(lexeme)) + copy(outKey, inKey) + return encoding.EncodeStringAscending(outKey, lexeme) +} diff --git a/pkg/util/tsearch/tsquery.go b/pkg/util/tsearch/tsquery.go index 296247af6583..92b1423195ca 100644 --- a/pkg/util/tsearch/tsquery.go +++ b/pkg/util/tsearch/tsquery.go @@ -14,6 +14,7 @@ import ( "fmt" "strings" + "github.com/cockroachdb/cockroach/pkg/sql/inverted" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/errors" @@ -169,6 +170,54 @@ func (q TSQuery) String() string { return q.root.String() } +// GetInvertedExpr returns the inverted expression that can be used to search +// an index. +func (q TSQuery) GetInvertedExpr() (expr inverted.Expression, err error) { + return q.root.getInvertedExpr() +} + +func (n *tsNode) getInvertedExpr() (inverted.Expression, error) { + switch n.op { + case invalid: + key := EncodeInvertedIndexKey(nil /* inKey */, n.term.lexeme) + span := inverted.MakeSingleValSpan(key) + return inverted.ExprForSpan(span, true), nil + case followedby: + fallthrough + case and: + l, lErr := n.l.getInvertedExpr() + r, rErr := n.r.getInvertedExpr() + if lErr != nil && rErr != nil { + // We need a positive match on at least one side. + return nil, lErr + } else if lErr != nil { + //nolint:returnerrcheck + return r, nil + } else if rErr != nil { + //nolint:returnerrcheck + return l, nil + } + return inverted.And(l, r), nil + case or: + l, lErr := n.l.getInvertedExpr() + r, rErr := n.r.getInvertedExpr() + if lErr != nil || rErr != nil { + // We need a positive match on both sides, so we return an error here. + // For example, searching for a | !b would require a full scan, since some + // documents could match that contain neither a nor b. 
+ return nil, errors.CombineErrors(lErr, rErr) // Either side may be the only one that failed; returning lErr alone would yield (nil, nil) for e.g. a | !b. + } + return inverted.Or(l, r), nil + case not: + // A not would require more advanced machinery than we have, so for now + // we'll just assume we can't perform an inverted expression search on a + // not. Note that a nested not would make it possible, but we are ignoring + // this case for now as it seems marginal. + return nil, errors.New("unable to create inverted expr for not") + } + return nil, errors.AssertionFailedf("invalid operator %d", n.op) +} + func lexTSQuery(input string) (TSVector, error) { parser := tsVectorLexer{ input: input,