Skip to content

Commit

Permalink
sql: inv idx accelerate tsvector@@tsquery queries
Browse files Browse the repository at this point in the history
This commit adds inverted index acceleration for expressions that
evaluate a tsquery against a tsvector using the `@@` operator.

Release note (sql change): it's now possible to run efficient tsvector
@@ tsquery searches when there is an inverted index on the tsvector
column being searched.
  • Loading branch information
jordanlewis committed Dec 16, 2022
1 parent 90de576 commit 50871a3
Show file tree
Hide file tree
Showing 11 changed files with 325 additions and 9 deletions.
2 changes: 2 additions & 0 deletions docs/generated/sql/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -3119,6 +3119,8 @@ active for the current transaction.</p>
</span></td><td>Stable</td></tr>
<tr><td><a name="crdb_internal.num_inverted_index_entries"></a><code>crdb_internal.num_inverted_index_entries(val: jsonb, version: <a href="int.html">int</a>) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>This function is used only by CockroachDB’s developers for testing purposes.</p>
</span></td><td>Stable</td></tr>
<tr><td><a name="crdb_internal.num_inverted_index_entries"></a><code>crdb_internal.num_inverted_index_entries(val: tsvector, version: <a href="int.html">int</a>) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>This function is used only by CockroachDB’s developers for testing purposes.</p>
</span></td><td>Stable</td></tr>
<tr><td><a name="crdb_internal.payloads_for_span"></a><code>crdb_internal.payloads_for_span(span_id: <a href="int.html">int</a>) &rarr; tuple{string AS payload_type, jsonb AS payload_jsonb}</code></td><td><span class="funcdesc"><p>Returns the payload(s) of the requested span and all its children.</p>
</span></td><td>Volatile</td></tr>
<tr><td><a name="crdb_internal.payloads_for_trace"></a><code>crdb_internal.payloads_for_trace(trace_id: <a href="int.html">int</a>) &rarr; tuple{int AS span_id, string AS payload_type, jsonb AS payload_jsonb}</code></td><td><span class="funcdesc"><p>Returns the payload(s) of the requested trace.</p>
Expand Down
159 changes: 159 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/tsvector_index
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# LogicTest: local

statement ok
CREATE TABLE a (
a INT PRIMARY KEY,
b TSVECTOR,
c TSQUERY,
FAMILY (a,b,c),
INVERTED INDEX(b)
)

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 1
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'Foo'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 1
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo' OR b @@ 'bar'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 2
└── • scan
missing stats
table: a@a_b_idx
spans: 2 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 2
└── • scan
missing stats
table: a@a_b_idx
spans: 2 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar' OR b @@ 'baz'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 3
└── • scan
missing stats
table: a@a_b_idx
spans: 3 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & bar'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 2
└── • scan
missing stats
table: a@a_b_idx
spans: 2 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo <-> bar'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 2
└── • scan
missing stats
table: a@a_b_idx
spans: 2 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & !bar'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 1
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span

# Test that tsvector indexes can't accelerate the @@ operator when neither
# operand is a constant.
statement error index \"a_b_idx\" is inverted and cannot be used for this query
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ c
7 changes: 7 additions & 0 deletions pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkg/sql/opt/invertedidx/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ go_library(
"inverted_index_expr.go",
"json_array.go",
"trigram.go",
"tsearch.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx",
visibility = ["//visibility:public"],
Expand Down
13 changes: 11 additions & 2 deletions pkg/sql/opt/invertedidx/inverted_index_expr.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,18 +116,27 @@ func TryFilterInvertedIndex(
} else {
col := index.InvertedColumn().InvertedSourceColumnOrdinal()
typ = factory.Metadata().Table(tabID).Column(col).DatumType()
if typ.Family() == types.StringFamily {
switch typ.Family() {
case types.StringFamily:
filterPlanner = &trigramFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
} else {
case types.TSVectorFamily:
filterPlanner = &tsqueryFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
case types.JsonFamily, types.ArrayFamily:
filterPlanner = &jsonOrArrayFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
default:
return nil, nil, nil, nil, false
}
}

Expand Down
84 changes: 84 additions & 0 deletions pkg/sql/opt/invertedidx/tsearch.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package invertedidx

import (
"context"

"github.com/cockroachdb/cockroach/pkg/sql/inverted"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
"github.com/cockroachdb/cockroach/pkg/sql/opt/invertedexpr"
"github.com/cockroachdb/cockroach/pkg/sql/opt/memo"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/errors"
)

// tsqueryFilterPlanner is the invertedFilterPlanner implementation for
// `tsvector @@ tsquery` filters. Its fields are the arguments handed to
// isIndexColumn when deciding whether an operand of the @@ expression is the
// indexed column.
type tsqueryFilterPlanner struct {
// tabID is the optimizer ID of the table the inverted index belongs to.
tabID opt.TableID
// index is the inverted index being considered for the filter.
index cat.Index
// computedColumns maps column IDs to scalar expressions; it is passed
// through to isIndexColumn unchanged.
computedColumns map[opt.ColumnID]opt.ScalarExpr
}

// Compile-time assertion that *tsqueryFilterPlanner implements
// invertedFilterPlanner.
var _ invertedFilterPlanner = &tsqueryFilterPlanner{}

// extractInvertedFilterConditionFromLeaf implements the invertedFilterPlanner
// interface.
//
// It recognizes a single `@@` expression (memo.TSMatchesExpr) in which one
// operand is the indexed column and the other is a constant tsquery, in either
// order. For such an expression it returns the inverted expression derived
// from the tsquery. Any other expression, or a tsquery for which no inverted
// expression can be built, yields inverted.NonInvertedColExpression{} with the
// whole input expression as the remaining filter.
//
// The context and eval.Context arguments are unused. The returned pre-filterer
// state is always nil because pre-filtering is not supported for text search
// indexes.
func (t *tsqueryFilterPlanner) extractInvertedFilterConditionFromLeaf(
	_ context.Context, _ *eval.Context, expr opt.ScalarExpr,
) (
	invertedExpr inverted.Expression,
	remainingFilters opt.ScalarExpr,
	_ *invertedexpr.PreFiltererStateForInvertedFilterer,
) {
	// Only the @@ operator is supported.
	match, ok := expr.(*memo.TSMatchesExpr)
	if !ok {
		return inverted.NonInvertedColExpression{}, expr, nil
	}

	// Work out which operand is the constant query; the other operand must be
	// the indexed column.
	var query opt.ScalarExpr
	switch {
	case isIndexColumn(t.tabID, t.index, match.Left, t.computedColumns) &&
		memo.CanExtractConstDatum(match.Right):
		query = match.Right
	case isIndexColumn(t.tabID, t.index, match.Right, t.computedColumns) &&
		memo.CanExtractConstDatum(match.Left):
		query = match.Left
	default:
		// Can only accelerate with a single constant value.
		return inverted.NonInvertedColExpression{}, expr, nil
	}

	datum := memo.ExtractConstDatum(query)
	if datum.ResolvedType() != types.TSQuery {
		panic(errors.AssertionFailedf(
			"trying to apply tsvector inverted index to unsupported type %s", datum.ResolvedType(),
		))
	}

	spanExpr, err := datum.(*tree.DTSQuery).TSQuery.GetInvertedExpr()
	if err != nil {
		// An inverted expression could not be extracted.
		return inverted.NonInvertedColExpression{}, expr, nil
	}

	// A tight inverted expression fully captures the filter, so nothing is
	// left to re-check after the inverted index scan.
	if spanExpr.IsTight() {
		return spanExpr, nil, nil
	}

	// Otherwise the original expression must be re-applied after the scan.
	// We do not currently support pre-filtering for text search indexes, so
	// the returned pre-filter state is nil.
	return spanExpr, expr, nil
}
4 changes: 2 additions & 2 deletions pkg/sql/sem/builtins/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -5924,8 +5924,8 @@ value if you rely on the HLC for accuracy.`,
},
tree.Overload{
Types: tree.ParamTypes{
{"val", types.TSVector},
{"version", types.Int},
{Name: "val", Typ: types.TSVector},
{Name: "version", Typ: types.Int},
},
ReturnType: tree.FixedReturnType(types.Int),
Fn: func(ctx *eval.Context, args tree.Datums) (tree.Datum, error) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/sem/builtins/fixed_oids.go
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ var builtinOidsBySignature = map[string]oid.Oid{
`crdb_internal.num_inverted_index_entries(val: jsonb, version: int) -> int`: 1336,
`crdb_internal.num_inverted_index_entries(val: string, version: int) -> int`: 1337,
`crdb_internal.num_inverted_index_entries(val: anyelement[], version: int) -> int`: 1338,
`crdb_internal.num_inverted_index_entries(val: tsvector, version: int) -> int`: 2055,
`crdb_internal.num_inverted_index_entries(val: tsvector, version: int) -> int`: 2061,
`crdb_internal.payloads_for_span(span_id: int) -> tuple{string AS payload_type, jsonb AS payload_jsonb}`: 349,
`crdb_internal.payloads_for_trace(trace_id: int) -> tuple{int AS span_id, string AS payload_type, jsonb AS payload_jsonb}`: 350,
`crdb_internal.pb_to_json(pbname: string, data: bytes) -> jsonb`: 1270,
Expand Down
1 change: 1 addition & 0 deletions pkg/util/tsearch/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ go_library(
importpath = "github.com/cockroachdb/cockroach/pkg/util/tsearch",
visibility = ["//visibility:public"],
deps = [
"//pkg/sql/inverted",
"//pkg/sql/pgwire/pgcode",
"//pkg/sql/pgwire/pgerror",
"//pkg/util/encoding",
Expand Down
12 changes: 8 additions & 4 deletions pkg/util/tsearch/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -461,11 +461,15 @@ func (c *tsNodeCodec) decodeTSNodePGBinary(b []byte) ([]byte, *tsNode, error) {
func EncodeInvertedIndexKeys(inKey []byte, vector TSVector) ([][]byte, error) {
// One key is appended per entry of vector, so the result slice is pre-sized
// to len(vector).
outKeys := make([][]byte, 0, len(vector))
for i := range vector {
// NOTE(review): this hunk is rendered without +/- markers. Judging by the
// hunk summary (8 additions, 4 deletions), the next four lines look like the
// removed inline encoding — confirm against the repository.
l := vector[i].lexeme
outKey := make([]byte, len(inKey), len(inKey)+len(l))
copy(outKey, inKey)
newKey := encoding.EncodeStringAscending(outKey, l)
// NOTE(review): presumably the replacement for the four lines above; it
// passes the same prefix and lexeme to EncodeInvertedIndexKey. The two
// `newKey :=` declarations cannot coexist in one scope.
newKey := EncodeInvertedIndexKey(inKey, vector[i].lexeme)
outKeys = append(outKeys, newKey)
}
// The error result is always nil on this path.
return outKeys, nil
}

// EncodeInvertedIndexKey returns the inverted index key for the input lexeme:
// a fresh copy of inKey with the lexeme appended via
// encoding.EncodeStringAscending. inKey itself is never written to.
func EncodeInvertedIndexKey(inKey []byte, lexeme string) []byte {
	// Start from an empty slice with room for the prefix plus the raw lexeme,
	// then copy the prefix in so the caller's buffer is not aliased.
	prefix := append(make([]byte, 0, len(inKey)+len(lexeme)), inKey...)
	return encoding.EncodeStringAscending(prefix, lexeme)
}
Loading

0 comments on commit 50871a3

Please sign in to comment.