From d07f41b46a95ed8c885c3ae61225f38e065fc77c Mon Sep 17 00:00:00 2001
From: Jordan Lewis
Date: Fri, 22 Jul 2022 23:35:06 -0400
Subject: [PATCH 1/2] sql: permit inverted indexes on tsvector
This commit adds the ability to create an inverted index on a tsvector
column.
Release note: None
---
docs/generated/sql/functions.md | 2 ++
pkg/sql/catalog/colinfo/col_type_info.go | 1 +
pkg/sql/create_index.go | 6 ++++++
pkg/sql/rowenc/BUILD.bazel | 1 +
pkg/sql/rowenc/encoded_datum.go | 2 +-
pkg/sql/rowenc/index_encoding.go | 3 +++
pkg/sql/sem/builtins/builtins.go | 14 ++++++++++++++
pkg/sql/sem/builtins/fixed_oids.go | 1 +
pkg/util/tsearch/encoding.go | 14 ++++++++++++++
9 files changed, 43 insertions(+), 1 deletion(-)
diff --git a/docs/generated/sql/functions.md b/docs/generated/sql/functions.md
index a626bc951097..cd172954169a 100644
--- a/docs/generated/sql/functions.md
+++ b/docs/generated/sql/functions.md
@@ -3197,6 +3197,8 @@ active for the current transaction.
crdb_internal.payloads_for_trace(trace_id: int) → tuple{int AS span_id, string AS payload_type, jsonb AS payload_jsonb} | Returns the payload(s) of the requested trace.
diff --git a/pkg/sql/catalog/colinfo/col_type_info.go b/pkg/sql/catalog/colinfo/col_type_info.go
index 92bc82ea9a16..98d21abdbc5e 100644
--- a/pkg/sql/catalog/colinfo/col_type_info.go
+++ b/pkg/sql/catalog/colinfo/col_type_info.go
@@ -165,6 +165,7 @@ func ColumnTypeIsOnlyInvertedIndexable(t *types.T) bool {
case types.JsonFamily:
case types.GeographyFamily:
case types.GeometryFamily:
+ case types.TSVectorFamily:
default:
return false
}
diff --git a/pkg/sql/create_index.go b/pkg/sql/create_index.go
index f7df6e09f826..42df6c594ccd 100644
--- a/pkg/sql/create_index.go
+++ b/pkg/sql/create_index.go
@@ -404,6 +404,12 @@ func populateInvertedIndexDescriptor(
return newUndefinedOpclassError(invCol.OpClass)
}
indexDesc.InvertedColumnKinds[0] = catpb.InvertedIndexColumnKind_TRIGRAM
+ case types.TSVectorFamily:
+ switch invCol.OpClass {
+ case "tsvector_ops", "":
+ default:
+ return newUndefinedOpclassError(invCol.OpClass)
+ }
default:
return tabledesc.NewInvalidInvertedColumnError(column.GetName(), column.GetType().Name())
}
diff --git a/pkg/sql/rowenc/BUILD.bazel b/pkg/sql/rowenc/BUILD.bazel
index 1dca4c1a6bc9..0ba5b9748a94 100644
--- a/pkg/sql/rowenc/BUILD.bazel
+++ b/pkg/sql/rowenc/BUILD.bazel
@@ -39,6 +39,7 @@ go_library(
"//pkg/util/mon",
"//pkg/util/protoutil",
"//pkg/util/trigram",
+ "//pkg/util/tsearch",
"//pkg/util/unique",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_redact//:redact",
diff --git a/pkg/sql/rowenc/encoded_datum.go b/pkg/sql/rowenc/encoded_datum.go
index a68883c8dbe9..44386dfe1be3 100644
--- a/pkg/sql/rowenc/encoded_datum.go
+++ b/pkg/sql/rowenc/encoded_datum.go
@@ -324,7 +324,7 @@ func (ed *EncDatum) Fingerprint(
var err error
memUsageBefore := ed.Size()
switch typ.Family() {
- case types.JsonFamily:
+ case types.JsonFamily, types.TSVectorFamily:
if err = ed.EnsureDecoded(typ, a); err != nil {
return nil, err
}
diff --git a/pkg/sql/rowenc/index_encoding.go b/pkg/sql/rowenc/index_encoding.go
index eaf59f1fe03b..2478092f4f08 100644
--- a/pkg/sql/rowenc/index_encoding.go
+++ b/pkg/sql/rowenc/index_encoding.go
@@ -38,6 +38,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/trigram"
+ "github.com/cockroachdb/cockroach/pkg/util/tsearch"
"github.com/cockroachdb/cockroach/pkg/util/unique"
"github.com/cockroachdb/errors"
)
@@ -605,6 +606,8 @@ func EncodeInvertedIndexTableKeys(
// val could be a DOidWrapper, so we need to use the unwrapped datum
// here.
return encodeTrigramInvertedIndexTableKeys(string(*datum.(*tree.DString)), inKey, version, true /* pad */)
+ case types.TSVectorFamily:
+ return tsearch.EncodeInvertedIndexKeys(inKey, val.(*tree.DTSVector).TSVector)
}
return nil, errors.AssertionFailedf("trying to apply inverted index to unsupported type %s", datum.ResolvedType())
}
diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go
index 3eec3a410e2a..7cc613e81c46 100644
--- a/pkg/sql/sem/builtins/builtins.go
+++ b/pkg/sql/sem/builtins/builtins.go
@@ -5945,6 +5945,20 @@ value if you rely on the HLC for accuracy.`,
Volatility: volatility.Stable,
CalledOnNullInput: true,
},
+ tree.Overload{
+ Types: tree.ParamTypes{
+ {Name: "val", Typ: types.TSVector},
+ {Name: "version", Typ: types.Int},
+ },
+ ReturnType: tree.FixedReturnType(types.Int),
+ Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
+ val := args[0].(*tree.DTSVector)
+ return tree.NewDInt(tree.DInt(len(val.TSVector))), nil
+ },
+ Info: "This function is used only by CockroachDB's developers for testing purposes.",
+ Volatility: volatility.Stable,
+ CalledOnNullInput: true,
+ },
),
// Returns true iff the current user has admin role.
diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go
index ef171a208244..9c6035efa79f 100644
--- a/pkg/sql/sem/builtins/fixed_oids.go
+++ b/pkg/sql/sem/builtins/fixed_oids.go
@@ -2043,6 +2043,7 @@ var builtinOidsArray = []string{
2067: `crdb_internal.gen_rand_ident(name_pattern: string, count: int) -> string`,
2068: `crdb_internal.gen_rand_ident(name_pattern: string, count: int, parameters: jsonb) -> string`,
2069: `crdb_internal.create_tenant(parameters: jsonb) -> int`,
+ 2070: `crdb_internal.num_inverted_index_entries(val: tsvector, version: int) -> int`,
}
var builtinOidsBySignature map[string]oid.Oid
diff --git a/pkg/util/tsearch/encoding.go b/pkg/util/tsearch/encoding.go
index 3f898ad0e68e..3772386811cc 100644
--- a/pkg/util/tsearch/encoding.go
+++ b/pkg/util/tsearch/encoding.go
@@ -455,3 +455,17 @@ func (c *tsNodeCodec) decodeTSNodePGBinary(b []byte) ([]byte, *tsNode, error) {
}
return b, ret, nil
}
+
+// EncodeInvertedIndexKeys returns a slice of byte slices, one per inverted
+// index key for the terms in this tsvector.
+func EncodeInvertedIndexKeys(inKey []byte, vector TSVector) ([][]byte, error) {
+ outKeys := make([][]byte, 0, len(vector))
+ for i := range vector {
+ l := vector[i].lexeme
+ outKey := make([]byte, len(inKey), len(inKey)+len(l))
+ copy(outKey, inKey)
+ newKey := encoding.EncodeStringAscending(outKey, l)
+ outKeys = append(outKeys, newKey)
+ }
+ return outKeys, nil
+}
From 5dd8ae2675952655d665239ded06a1d4b37698b1 Mon Sep 17 00:00:00 2001
From: Jordan Lewis
Date: Sun, 24 Jul 2022 21:51:02 -0400
Subject: [PATCH 2/2] sql: inv idx accelerate tsvector@@tsquery queries
This commit adds inverted index acceleration for expressions that
evaluate a tsquery against a tsvector using the `@@` operator.
Release note (sql change): it's now possible to run efficient tsvector
@@ tsquery searches when there is an inverted index on the tsvector
column being searched.
---
pkg/sql/inverted/expression.go | 16 +-
.../logictest/testdata/logic_test/tsvector | 96 ++++++++++
.../exec/execbuilder/testdata/tsvector_index | 171 ++++++++++++++++++
.../execbuilder/tests/local/generated_test.go | 7 +
pkg/sql/opt/invertedidx/BUILD.bazel | 2 +
.../opt/invertedidx/inverted_index_expr.go | 13 +-
pkg/sql/opt/invertedidx/tsearch.go | 87 +++++++++
pkg/sql/opt/invertedidx/tsearch_test.go | 132 ++++++++++++++
pkg/sql/opt/xform/select_funcs.go | 4 +
pkg/util/tsearch/BUILD.bazel | 3 +
pkg/util/tsearch/encoding.go | 14 +-
pkg/util/tsearch/encoding_test.go | 142 +++++++++++++++
pkg/util/tsearch/random.go | 35 +++-
pkg/util/tsearch/tsquery.go | 101 +++++++++++
14 files changed, 807 insertions(+), 16 deletions(-)
create mode 100644 pkg/sql/opt/exec/execbuilder/testdata/tsvector_index
create mode 100644 pkg/sql/opt/invertedidx/tsearch.go
create mode 100644 pkg/sql/opt/invertedidx/tsearch_test.go
diff --git a/pkg/sql/inverted/expression.go b/pkg/sql/inverted/expression.go
index 6d4ab4ac3e5d..8b83506b7c27 100644
--- a/pkg/sql/inverted/expression.go
+++ b/pkg/sql/inverted/expression.go
@@ -300,11 +300,17 @@ type SpanExpression struct {
// Tight mirrors the definition of IsTight().
Tight bool
- // Unique is true if the spans are guaranteed not to produce duplicate
- // primary keys. Otherwise, Unique is false. Unique may be true for certain
- // JSON or Array SpanExpressions, and it holds when unique SpanExpressions
- // are combined with And. It does not hold when these SpanExpressions are
- // combined with Or.
+ // Unique is true if the spans in FactoredUnionSpans are guaranteed not to
+ // produce duplicate primary keys. Otherwise, Unique is false. Unique may
+ // be true for certain JSON or Array SpanExpressions, and it holds when
+ // unique SpanExpressions are combined with And. It does not hold when
+ // these SpanExpressions are combined with Or.
+ //
+ // Once a SpanExpression is built, this field is relevant if the root
+ // SpanExpression has no children (i.e., Operator is None). In this case,
+ // Unique is used to determine whether an invertedFilter is needed on top
+ // of the inverted index scan to deduplicate keys (an invertedFilter is
+ // always necessary if Operator is not None).
Unique bool
// SpansToRead are the spans to read from the inverted index
diff --git a/pkg/sql/logictest/testdata/logic_test/tsvector b/pkg/sql/logictest/testdata/logic_test/tsvector
index 71398395526b..65ffb7d1428c 100644
--- a/pkg/sql/logictest/testdata/logic_test/tsvector
+++ b/pkg/sql/logictest/testdata/logic_test/tsvector
@@ -139,3 +139,99 @@ query T
VALUES ( json_build_array($$'cat' & 'rat'$$:::TSQUERY)::JSONB)
----
["'cat' & 'rat'"]
+
+# Test tsvector inverted indexes.
+statement ok
+DROP TABLE a;
+CREATE TABLE a (
+ a TSVECTOR,
+ b TSQUERY,
+ INVERTED INDEX(a)
+);
+INSERT INTO a VALUES('foo:3 bar:4,5'), ('baz:1'), ('foo:3'), ('bar:2')
+
+query T rowsort
+SELECT a FROM a@a_a_idx WHERE a @@ 'foo'
+----
+'bar':4,5 'foo':3
+'foo':3
+
+statement error index \"a_a_idx\" is inverted and cannot be used for this query
+SELECT a FROM a@a_a_idx WHERE a @@ '!foo'
+
+query T rowsort
+SELECT a FROM a@a_a_idx WHERE a @@ 'foo' OR a @@ 'bar'
+----
+'bar':4,5 'foo':3
+'foo':3
+'bar':2
+
+query T rowsort
+SELECT a FROM a@a_a_idx WHERE a @@ 'foo | bar'
+----
+'bar':4,5 'foo':3
+'foo':3
+'bar':2
+
+query T rowsort
+SELECT a FROM a@a_a_idx WHERE a @@ 'foo | bar' OR a @@ 'baz'
+----
+'bar':4,5 'foo':3
+'baz':1
+'foo':3
+'bar':2
+
+query T
+SELECT a FROM a@a_a_idx WHERE a @@ 'foo & bar'
+----
+'bar':4,5 'foo':3
+
+query T
+SELECT a FROM a@a_a_idx WHERE a @@ 'foo <-> bar'
+----
+'bar':4,5 'foo':3
+
+query T
+SELECT a FROM a@a_a_idx WHERE a @@ 'bar <-> foo'
+----
+
+query T
+SELECT a FROM a@a_a_idx WHERE a @@ 'foo <-> !bar'
+----
+'foo':3
+
+query T rowsort
+SELECT a FROM a@a_a_idx WHERE a @@ '!baz <-> bar'
+----
+'bar':4,5 'foo':3
+'bar':2
+
+query T
+SELECT a FROM a@a_a_idx WHERE a @@ 'foo & !bar'
+----
+'foo':3
+
+query T rowsort
+SELECT a FROM a@a_a_idx WHERE a @@ 'ba:*'
+----
+'bar':4,5 'foo':3
+'baz':1
+'bar':2
+
+query T rowsort
+SELECT a FROM a@a_a_idx WHERE a @@ 'ba:* | foo'
+----
+'bar':4,5 'foo':3
+'baz':1
+'foo':3
+'bar':2
+
+query T
+SELECT a FROM a@a_a_idx WHERE a @@ 'ba:* & foo'
+----
+'bar':4,5 'foo':3
+
+# Test that tsvector indexes can't accelerate the @@ operator with no constant
+# columns.
+statement error index \"a_a_idx\" is inverted and cannot be used for this query
+EXPLAIN SELECT * FROM a@a_a_idx WHERE a @@ b
diff --git a/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index b/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index
new file mode 100644
index 000000000000..ee8b0bd9c576
--- /dev/null
+++ b/pkg/sql/opt/exec/execbuilder/testdata/tsvector_index
@@ -0,0 +1,171 @@
+# LogicTest: local
+
+statement ok
+CREATE TABLE a (
+ a INT PRIMARY KEY,
+ b TSVECTOR,
+ c TSQUERY,
+ FAMILY (a,b,c),
+ INVERTED INDEX(b)
+)
+
+query T
+EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo'
+----
+distribution: local
+vectorized: true
+·
+• index join
+│ table: a@a_pkey
+│
+└── • scan
+ missing stats
+ table: a@a_b_idx
+ spans: 1 span
+
+query T
+EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'Foo'
+----
+distribution: local
+vectorized: true
+·
+• index join
+│ table: a@a_pkey
+│
+└── • scan
+ missing stats
+ table: a@a_b_idx
+ spans: 1 span
+
+query T
+EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo' OR b @@ 'bar'
+----
+distribution: local
+vectorized: true
+·
+• index join
+│ table: a@a_pkey
+│
+└── • inverted filter
+ │ inverted column: b_inverted_key
+ │ num spans: 2
+ │
+ └── • scan
+ missing stats
+ table: a@a_b_idx
+ spans: 2 spans
+
+query T
+EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar'
+----
+distribution: local
+vectorized: true
+·
+• index join
+│ table: a@a_pkey
+│
+└── • inverted filter
+ │ inverted column: b_inverted_key
+ │ num spans: 2
+ │
+ └── • scan
+ missing stats
+ table: a@a_b_idx
+ spans: 2 spans
+
+query T
+EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar' OR b @@ 'baz'
+----
+distribution: local
+vectorized: true
+·
+• index join
+│ table: a@a_pkey
+│
+└── • inverted filter
+ │ inverted column: b_inverted_key
+ │ num spans: 3
+ │
+ └── • scan
+ missing stats
+ table: a@a_b_idx
+ spans: 3 spans
+
+query T
+EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & bar'
+----
+distribution: local
+vectorized: true
+·
+• lookup join
+│ table: a@a_pkey
+│ equality: (a) = (a)
+│ equality cols are key
+│ pred: b @@ '''foo'' & ''bar'''
+│
+└── • zigzag join
+ left table: a@a_b_idx
+ left columns: (a, b_inverted_key)
+ left fixed values: 1 column
+ right table: a@a_b_idx
+ right columns: (a, b_inverted_key)
+ right fixed values: 1 column
+
+query T
+EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo <-> bar'
+----
+distribution: local
+vectorized: true
+·
+• lookup join
+│ table: a@a_pkey
+│ equality: (a) = (a)
+│ equality cols are key
+│ pred: b @@ '''foo'' <-> ''bar'''
+│
+└── • zigzag join
+ left table: a@a_b_idx
+ left columns: (a, b_inverted_key)
+ left fixed values: 1 column
+ right table: a@a_b_idx
+ right columns: (a, b_inverted_key)
+ right fixed values: 1 column
+
+query T
+EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & !bar'
+----
+distribution: local
+vectorized: true
+·
+• filter
+│ filter: b @@ '''foo'' & !''bar'''
+│
+└── • index join
+ │ table: a@a_pkey
+ │
+ └── • scan
+ missing stats
+ table: a@a_b_idx
+ spans: 1 span
+
+
+query T
+EXPLAIN SELECT a FROM a@a_b_idx WHERE b @@ 'ba:*'
+----
+distribution: local
+vectorized: true
+·
+• inverted filter
+│ inverted column: b_inverted_key
+│ num spans: 1
+│
+└── • scan
+ missing stats
+ table: a@a_b_idx
+ spans: 1 span
+
+
+# Test that tsvector indexes can't accelerate the @@ operator with no constant
+# columns.
+statement error index \"a_b_idx\" is inverted and cannot be used for this query
+EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ c
diff --git a/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go b/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go
index ffac05a8f9f0..a2832beb63b7 100644
--- a/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go
+++ b/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go
@@ -550,6 +550,13 @@ func TestExecBuild_trigram_index(
runExecBuildLogicTest(t, "trigram_index")
}
+func TestExecBuild_tsvector_index(
+ t *testing.T,
+) {
+ defer leaktest.AfterTest(t)()
+ runExecBuildLogicTest(t, "tsvector_index")
+}
+
func TestExecBuild_tuple(
t *testing.T,
) {
diff --git a/pkg/sql/opt/invertedidx/BUILD.bazel b/pkg/sql/opt/invertedidx/BUILD.bazel
index 8730f991ec61..7fa01ba26944 100644
--- a/pkg/sql/opt/invertedidx/BUILD.bazel
+++ b/pkg/sql/opt/invertedidx/BUILD.bazel
@@ -8,6 +8,7 @@ go_library(
"inverted_index_expr.go",
"json_array.go",
"trigram.go",
+ "tsearch.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx",
visibility = ["//visibility:public"],
@@ -49,6 +50,7 @@ go_test(
"geo_test.go",
"json_array_test.go",
"trigram_test.go",
+ "tsearch_test.go",
],
args = ["-test.timeout=55s"],
deps = [
diff --git a/pkg/sql/opt/invertedidx/inverted_index_expr.go b/pkg/sql/opt/invertedidx/inverted_index_expr.go
index eec56289d01b..6db743491e87 100644
--- a/pkg/sql/opt/invertedidx/inverted_index_expr.go
+++ b/pkg/sql/opt/invertedidx/inverted_index_expr.go
@@ -116,18 +116,27 @@ func TryFilterInvertedIndex(
} else {
col := index.InvertedColumn().InvertedSourceColumnOrdinal()
typ = factory.Metadata().Table(tabID).Column(col).DatumType()
- if typ.Family() == types.StringFamily {
+ switch typ.Family() {
+ case types.StringFamily:
filterPlanner = &trigramFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
- } else {
+ case types.TSVectorFamily:
+ filterPlanner = &tsqueryFilterPlanner{
+ tabID: tabID,
+ index: index,
+ computedColumns: computedColumns,
+ }
+ case types.JsonFamily, types.ArrayFamily:
filterPlanner = &jsonOrArrayFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
+ default:
+ return nil, nil, nil, nil, false
}
}
diff --git a/pkg/sql/opt/invertedidx/tsearch.go b/pkg/sql/opt/invertedidx/tsearch.go
new file mode 100644
index 000000000000..5e4dc2a7d728
--- /dev/null
+++ b/pkg/sql/opt/invertedidx/tsearch.go
@@ -0,0 +1,87 @@
+// Copyright 2022 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package invertedidx
+
+import (
+ "context"
+
+ "github.com/cockroachdb/cockroach/pkg/sql/inverted"
+ "github.com/cockroachdb/cockroach/pkg/sql/opt"
+ "github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
+ "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedexpr"
+ "github.com/cockroachdb/cockroach/pkg/sql/opt/memo"
+ "github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
+ "github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
+ "github.com/cockroachdb/cockroach/pkg/sql/types"
+ "github.com/cockroachdb/errors"
+)
+
+type tsqueryFilterPlanner struct {
+ tabID opt.TableID
+ index cat.Index
+ computedColumns map[opt.ColumnID]opt.ScalarExpr
+}
+
+var _ invertedFilterPlanner = &tsqueryFilterPlanner{}
+
+// extractInvertedFilterConditionFromLeaf implements the invertedFilterPlanner
+// interface.
+func (t *tsqueryFilterPlanner) extractInvertedFilterConditionFromLeaf(
+ _ context.Context, _ *eval.Context, expr opt.ScalarExpr,
+) (
+ invertedExpr inverted.Expression,
+ remainingFilters opt.ScalarExpr,
+ _ *invertedexpr.PreFiltererStateForInvertedFilterer,
+) {
+ var constantVal opt.ScalarExpr
+ var left, right opt.ScalarExpr
+ switch e := expr.(type) {
+ case *memo.TSMatchesExpr:
+ left, right = e.Left, e.Right
+ default:
+ // Only the above types are supported.
+ return inverted.NonInvertedColExpression{}, expr, nil
+ }
+ if isIndexColumn(t.tabID, t.index, left, t.computedColumns) && memo.CanExtractConstDatum(right) {
+ constantVal = right
+ } else if isIndexColumn(t.tabID, t.index, right, t.computedColumns) && memo.CanExtractConstDatum(left) {
+ constantVal = left
+ } else {
+ // Can only accelerate with a single constant value.
+ return inverted.NonInvertedColExpression{}, expr, nil
+ }
+ d := memo.ExtractConstDatum(constantVal)
+ if d.ResolvedType() != types.TSQuery {
+ panic(errors.AssertionFailedf(
+ "trying to apply tsvector inverted index to unsupported type %s", d.ResolvedType(),
+ ))
+ }
+ q := d.(*tree.DTSQuery).TSQuery
+ var err error
+ invertedExpr, err = q.GetInvertedExpr()
+ if err != nil {
+ // An inverted expression could not be extracted.
+ return inverted.NonInvertedColExpression{}, expr, nil
+ }
+
+ // If the extracted inverted expression is not tight then remaining filters
+ // must be applied after the inverted index scan.
+ // TODO(jordan): we could do better here by pruning terms that we successfully
+ // turn into inverted expressions during the tsquery tree walk. We'd need to
+ // implement a function that removes a term from a tsquery tree.
+ if !invertedExpr.IsTight() {
+ remainingFilters = expr
+ }
+
+ // We do not currently support pre-filtering for text search indexes, so
+ // the returned pre-filter state is nil.
+ return invertedExpr, remainingFilters, nil
+}
diff --git a/pkg/sql/opt/invertedidx/tsearch_test.go b/pkg/sql/opt/invertedidx/tsearch_test.go
new file mode 100644
index 000000000000..51aa8628fa3b
--- /dev/null
+++ b/pkg/sql/opt/invertedidx/tsearch_test.go
@@ -0,0 +1,132 @@
+// Copyright 2023 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package invertedidx_test
+
+import (
+ "context"
+ "testing"
+
+ "github.com/cockroachdb/cockroach/pkg/settings/cluster"
+ "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx"
+ "github.com/cockroachdb/cockroach/pkg/sql/opt/norm"
+ "github.com/cockroachdb/cockroach/pkg/sql/opt/testutils"
+ "github.com/cockroachdb/cockroach/pkg/sql/opt/testutils/testcat"
+ "github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
+ "github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
+ "github.com/stretchr/testify/require"
+)
+
+func TestTryFilterTSVector(t *testing.T) {
+ semaCtx := tree.MakeSemaContext()
+ st := cluster.MakeTestingClusterSettings()
+ evalCtx := eval.NewTestingEvalContext(st)
+
+ tc := testcat.New()
+ if _, err := tc.ExecuteDDL(
+ "CREATE TABLE t (t tsvector, INVERTED INDEX (t))",
+ ); err != nil {
+ t.Fatal(err)
+ }
+ var f norm.Factory
+ f.Init(context.Background(), evalCtx, tc)
+ md := f.Metadata()
+ tn := tree.NewUnqualifiedTableName("t")
+ tab := md.AddTable(tc.Table(tn), tn)
+ tsvectorOrd := 1
+
+ // If we can create an inverted filter with the given filter expression and
+ // index, ok=true. If the spans in the resulting inverted index constraint
+ // do not have duplicate primary keys, unique=true. If the spans are tight,
+ // tight=true and remainingFilters="". Otherwise, tight is false and
+ // remainingFilters contains some or all of the original filters.
+ testCases := []struct {
+ filters string
+ ok bool
+ tight bool
+ unique bool
+ }{
+ {filters: "t @@ 'a'", ok: true, tight: true, unique: true},
+ {filters: "t @@ '!a'", ok: false, tight: false, unique: false},
+ // Prefix match.
+ {filters: "t @@ 'a:*'", ok: true, tight: true, unique: false},
+ // Weight match.
+ {filters: "t @@ 'a:C'", ok: true, tight: false, unique: true},
+ // Weight and prefix match.
+ {filters: "t @@ 'a:C*'", ok: true, tight: false, unique: false},
+
+ {filters: "t @@ 'a | b'", ok: true, tight: true, unique: false},
+ {filters: "t @@ 'a & b'", ok: true, tight: true, unique: true},
+ {filters: "t @@ 'a <-> b'", ok: true, tight: false, unique: true},
+
+ // Can't filter with ! in an or clause.
+ {filters: "t @@ '!a | b'", ok: false, tight: false, unique: false},
+ {filters: "t @@ 'a | !b'", ok: false, tight: false, unique: false},
+ {filters: "t @@ '!a | !b'", ok: false, tight: false, unique: false},
+
+ // ! in an and clause is okay - we just re-filter on the ! term.
+ {filters: "t @@ 'a & !b'", ok: true, tight: false, unique: true},
+ {filters: "t @@ '!a & b'", ok: true, tight: false, unique: true},
+ // But not if both are !.
+ {filters: "t @@ '!a & !b'", ok: false, tight: false, unique: false},
+
+ // Same as above, except <-> matches are never tight - they always require
+ // re-checking because the index doesn't store the lexeme positions.
+ {filters: "t @@ 'a <-> !b'", ok: true, tight: false, unique: true},
+ {filters: "t @@ '!a <-> b'", ok: true, tight: false, unique: true},
+ {filters: "t @@ '!a <-> !b'", ok: false, tight: false, unique: false},
+
+ // Some sanity checks for more than 2 terms, to make sure that the output
+ // de-uniqueifies as we travel up the tree with more than 1 lexeme seen.
+ {filters: "t @@ '(a & !b) | c'", ok: true, tight: false, unique: false},
+ {filters: "t @@ '(a & b) | c'", ok: true, tight: true, unique: false},
+ {filters: "t @@ '(a & b) <-> !(c | d)'", ok: true, tight: false, unique: true},
+ }
+
+ for _, tc := range testCases {
+ t.Logf("test case: %v", tc)
+ filters := testutils.BuildFilters(t, &f, &semaCtx, evalCtx, tc.filters)
+
+ // We're not testing that the correct SpanExpression is returned here;
+ // that is tested elsewhere. This is just testing that we are constraining
+ // the index when we expect to and we have the correct values for tight,
+ // unique, and remainingFilters.
+ spanExpr, _, remainingFilters, _, ok := invertedidx.TryFilterInvertedIndex(
+ context.Background(),
+ evalCtx,
+ &f,
+ filters,
+ nil, /* optionalFilters */
+ tab,
+ md.Table(tab).Index(tsvectorOrd),
+ nil, /* computedColumns */
+ )
+ if tc.ok != ok {
+ t.Fatalf("expected %v, got %v", tc.ok, ok)
+ }
+ if !ok {
+ continue
+ }
+
+ if tc.tight != spanExpr.Tight {
+ t.Fatalf("For (%s), expected tight=%v, but got %v", tc.filters, tc.tight, spanExpr.Tight)
+ }
+ if tc.unique != spanExpr.Unique {
+ t.Fatalf("For (%s), expected unique=%v, but got %v", tc.filters, tc.unique, spanExpr.Unique)
+ }
+
+ if tc.tight {
+ require.True(t, remainingFilters.IsTrue())
+ } else {
+ require.Equal(t, filters.String(), remainingFilters.String(),
+ "mismatched remaining filters")
+ }
+ }
+}
diff --git a/pkg/sql/opt/xform/select_funcs.go b/pkg/sql/opt/xform/select_funcs.go
index 0f625ca2ce25..e3de897cd584 100644
--- a/pkg/sql/opt/xform/select_funcs.go
+++ b/pkg/sql/opt/xform/select_funcs.go
@@ -933,6 +933,10 @@ func (c *CustomFuncs) GenerateInvertedIndexScans(
// produce duplicate primary keys or requires at least one UNION or
// INTERSECTION. In this case, we must scan both the primary key columns
// and the inverted key column.
+ // The reason we also check !spanExpr.Unique here is that sometimes we
+ // eliminate the UNION operator in the tree, replacing it with a non-nil
+ // FactoredUnionSpans in the SpanExpression, and that case needs to be
+ // noticed and filtered.
needInvertedFilter := !spanExpr.Unique || spanExpr.Operator != inverted.None
newScanPrivate.Cols = pkCols.Copy()
var invertedCol opt.ColumnID
diff --git a/pkg/util/tsearch/BUILD.bazel b/pkg/util/tsearch/BUILD.bazel
index beaf84a3e5b4..f2c99b8bb705 100644
--- a/pkg/util/tsearch/BUILD.bazel
+++ b/pkg/util/tsearch/BUILD.bazel
@@ -14,6 +14,8 @@ go_library(
importpath = "github.com/cockroachdb/cockroach/pkg/util/tsearch",
visibility = ["//visibility:public"],
deps = [
+ "//pkg/keysbase",
+ "//pkg/sql/inverted",
"//pkg/sql/pgwire/pgcode",
"//pkg/sql/pgwire/pgerror",
"//pkg/util/encoding",
@@ -32,6 +34,7 @@ go_test(
args = ["-test.timeout=295s"],
embed = [":tsearch"],
deps = [
+ "//pkg/sql/inverted",
"//pkg/testutils/skip",
"//pkg/util/randutil",
"@com_github_jackc_pgx_v4//:pgx",
diff --git a/pkg/util/tsearch/encoding.go b/pkg/util/tsearch/encoding.go
index 3772386811cc..fd1b9173621b 100644
--- a/pkg/util/tsearch/encoding.go
+++ b/pkg/util/tsearch/encoding.go
@@ -460,12 +460,18 @@ func (c *tsNodeCodec) decodeTSNodePGBinary(b []byte) ([]byte, *tsNode, error) {
// index key for the terms in this tsvector.
func EncodeInvertedIndexKeys(inKey []byte, vector TSVector) ([][]byte, error) {
outKeys := make([][]byte, 0, len(vector))
+ // Note that by construction, TSVector contains only unique terms, so we don't
+ // need to de-duplicate terms when constructing the inverted index keys.
for i := range vector {
- l := vector[i].lexeme
- outKey := make([]byte, len(inKey), len(inKey)+len(l))
- copy(outKey, inKey)
- newKey := encoding.EncodeStringAscending(outKey, l)
+ newKey := EncodeInvertedIndexKey(inKey, vector[i].lexeme)
outKeys = append(outKeys, newKey)
}
return outKeys, nil
}
+
+// EncodeInvertedIndexKey returns the inverted index key for the input lexeme.
+func EncodeInvertedIndexKey(inKey []byte, lexeme string) []byte {
+ outKey := make([]byte, len(inKey), len(inKey)+len(lexeme))
+ copy(outKey, inKey)
+ return encoding.EncodeStringAscending(outKey, lexeme)
+}
diff --git a/pkg/util/tsearch/encoding_test.go b/pkg/util/tsearch/encoding_test.go
index 7d0bfe120144..083a6f94748f 100644
--- a/pkg/util/tsearch/encoding_test.go
+++ b/pkg/util/tsearch/encoding_test.go
@@ -13,6 +13,7 @@ package tsearch
import (
"testing"
+ "github.com/cockroachdb/cockroach/pkg/sql/inverted"
"github.com/cockroachdb/cockroach/pkg/util/randutil"
"github.com/stretchr/testify/assert"
)
@@ -66,3 +67,144 @@ func TestRoundtripRandomTSQuery(t *testing.T) {
assert.Equal(t, encoded, reEncoded)
}
}
+
+func TestEncodeTSQueryInvertedIndexSpans(t *testing.T) {
+ testCases := []struct {
+ vector string
+ query string
+ expected bool
+ tight bool
+ unique bool
+ }{
+ // This test uses EncodeInvertedIndexKeys and
+ // GetInvertedExpr to determine whether the tsquery matches the tsvector. If
+ // the vector @@ query, expected is true. Otherwise expected is false. If
+ // the spans produced for contains are tight, tight is true. Otherwise tight
+ // is false.
+ //
+ // If GetInvertedExpr produces spans that are guaranteed not to
+ // contain duplicate primary keys, unique is true. Otherwise it is false.
+ {`a:2`, `a`, true, true, true},
+ {`b:2`, `a`, false, true, true},
+
+ {`'foo'`, `'foo'`, true, true, true},
+
+ {`a:2`, `a & b`, false, true, true},
+ {`a:1 b:2`, `a & b`, true, true, true},
+
+ {`a:2`, `a | b`, true, true, false},
+ {`a:1 b:2`, `a | b`, true, true, false},
+ {`c:1`, `a | b`, false, true, false},
+
+ {`a:1`, `a <-> b`, false, false, true},
+ {`a:1 b:2`, `a <-> b`, true, false, true},
+ {`a:1 b:3`, `a <-> b`, false, false, true},
+
+ {`a:1 b:2`, `a <-> (b|c)`, true, false, false},
+ {`a:1 c:2`, `a <-> (b|c)`, true, false, false},
+ {`a:1 d:2`, `a <-> (b|c)`, false, false, false},
+ {`a:1 b:2`, `a <-> (!b|c)`, false, false, true},
+ {`a:1 c:2`, `a <-> (!b|c)`, true, false, true},
+ {`a:1 d:2`, `a <-> (!b|c)`, true, false, true},
+ {`a:1 b:2`, `a <-> (b|!c)`, true, false, true},
+ {`a:1 c:2`, `a <-> (b|!c)`, false, false, true},
+ {`a:1 d:2`, `a <-> (b|!c)`, true, false, true},
+ {`a:1 b:2`, `a <-> (!b|!c)`, true, false, true},
+ {`a:1 c:2`, `a <-> (!b|!c)`, true, false, true},
+ {`a:1 d:2`, `a <-> (!b|!c)`, true, false, true},
+ {`a:1 b:2 c:3 d:4`, `a <-> ((b <-> c) | d)`, true, false, false},
+ {`a:1 b:2 c:3 d:4`, `a <-> (b | (c <-> d))`, true, false, false},
+ }
+
+ // runTest checks that evaluating `left @@ right` using keys from
+ // EncodeInvertedIndexKeys and spans from GetInvertedExpr
+ // produces the expected result.
+ // returns tight=true if the spans from GetInvertedExpr
+ // were tight, and tight=false otherwise.
+ runTest := func(left TSVector, right TSQuery, expected, expectUnique bool) (tight bool) {
+ keys, err := EncodeInvertedIndexKeys(nil, left)
+ assert.NoError(t, err)
+
+ invertedExpr, err := right.GetInvertedExpr()
+ assert.NoError(t, err)
+
+ spanExpr, ok := invertedExpr.(*inverted.SpanExpression)
+ assert.True(t, ok)
+
+ if spanExpr.Unique != expectUnique {
+ t.Errorf("For %s, expected unique=%v, but got %v", right, expectUnique, spanExpr.Unique)
+ }
+
+ actual, err := spanExpr.ContainsKeys(keys)
+ assert.NoError(t, err)
+
+ // There may be some false positives, so filter those out.
+ if actual && !spanExpr.Tight {
+ actual, err = EvalTSQuery(right, left)
+ assert.NoError(t, err)
+ }
+
+ if actual != expected {
+ if expected {
+ t.Errorf("expected %s to match %s but it did not", left.String(), right.String())
+ } else {
+ t.Errorf("expected %s not to match %s but it did", left.String(), right.String())
+ }
+ }
+
+ return spanExpr.Tight
+ }
+
+ // Run pre-defined test cases from above.
+ for _, c := range testCases {
+ indexedValue, err := ParseTSVector(c.vector)
+ assert.NoError(t, err)
+ query, err := ParseTSQuery(c.query)
+ assert.NoError(t, err)
+
+ // First check that evaluating `indexedValue @@ query` matches the expected
+ // result.
+ res, err := EvalTSQuery(query, indexedValue)
+ assert.NoError(t, err)
+ if res != c.expected {
+ t.Fatalf(
+ "expected value of %s @@ %s did not match actual value. Expected: %v. Got: %v",
+ c.vector, c.query, c.expected, res,
+ )
+ }
+
+ // Now check that we get the same result with the inverted index spans.
+ tight := runTest(indexedValue, query, c.expected, c.unique)
+
+ // And check that the tightness matches the expected value.
+ if tight != c.tight {
+ if c.tight {
+ t.Errorf("expected spans for %s to be tight but they were not", c.query)
+ } else {
+ t.Errorf("expected spans for %s not to be tight but they were", c.query)
+ }
+ }
+ }
+
+ // Run a set of randomly generated test cases.
+ rng, _ := randutil.NewTestRand()
+ for i := 0; i < 100; i++ {
+ // Generate a random query and vector and evaluate the result of `left @@ right`.
+ query := RandomTSQuery(rng)
+ vector := RandomTSVector(rng)
+
+ res, err := EvalTSQuery(query, vector)
+ assert.NoError(t, err)
+
+ invertedExpr, err := query.GetInvertedExpr()
+ if err != nil {
+ // We can't generate an inverted expression for this query, so there's
+ // nothing to test here.
+ continue
+ }
+ expectedUnique := invertedExpr.(*inverted.SpanExpression).Unique
+
+ // Now check that we get the same result with the inverted index spans.
+ runTest(vector, query, res, expectedUnique)
+ }
+}
diff --git a/pkg/util/tsearch/random.go b/pkg/util/tsearch/random.go
index 02107f8f1dd5..9fc8124a06e8 100644
--- a/pkg/util/tsearch/random.go
+++ b/pkg/util/tsearch/random.go
@@ -52,13 +52,38 @@ func RandomTSVector(rng *rand.Rand) TSVector {
// RandomTSQuery returns a random TSQuery for testing.
func RandomTSQuery(rng *rand.Rand) TSQuery {
- // TODO(jordan): make this a more robust random query generator
+ // TODO(jordan): add parenthesis grouping to the random query generator
+ nTerms := 1 + rng.Intn(5)
for {
- l := make([]byte, 1+rng.Intn(10))
- for i := range l {
- l[i] = alphabet[rng.Intn(len(alphabet))]
+ var sb strings.Builder
+ for i := 0; i < nTerms; i++ {
+ l := make([]byte, 1+rng.Intn(10))
+ for i := range l {
+ l[i] = alphabet[rng.Intn(len(alphabet))]
+ }
+ if rng.Intn(4) == 0 {
+ // Make it a negation query!
+ sb.WriteString("!")
+ }
+ sb.Write(l)
+ sb.WriteString(" ")
+ if i < nTerms-1 {
+ infixOp := rng.Intn(3)
+ var opstr string
+ switch infixOp {
+ case 0:
+ opstr = "&"
+ case 1:
+ opstr = "|"
+ case 2:
+ opstr = "<->"
+ }
+ sb.WriteString(opstr)
+ sb.WriteString(" ")
+ }
}
- query, err := ParseTSQuery(string(l))
+
+ query, err := ParseTSQuery(sb.String())
if err != nil {
continue
}
diff --git a/pkg/util/tsearch/tsquery.go b/pkg/util/tsearch/tsquery.go
index 296247af6583..8b8610b59384 100644
--- a/pkg/util/tsearch/tsquery.go
+++ b/pkg/util/tsearch/tsquery.go
@@ -14,6 +14,8 @@ import (
"fmt"
"strings"
+ "github.com/cockroachdb/cockroach/pkg/keysbase"
+ "github.com/cockroachdb/cockroach/pkg/sql/inverted"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/errors"
@@ -169,6 +171,105 @@ func (q TSQuery) String() string {
return q.root.String()
}
+// GetInvertedExpr returns the inverted expression that can be used to search
+// an index.
+func (q TSQuery) GetInvertedExpr() (expr inverted.Expression, err error) {
+ return q.root.getInvertedExpr()
+}
+
+func (n *tsNode) getInvertedExpr() (inverted.Expression, error) {
+ switch n.op {
+ case invalid:
+ // We're looking at a lexeme match.
+ // There are 3 options:
+ // 1. Normal match.
+ // In this case, we make a tight and unique span.
+ // 2. Prefix match.
+ // In this case, we make a non-unique, tight span that starts with the
+ // prefix.
+ // 3. Weighted match.
+ // In this case, we make the match non-tight, because we don't store the
+ // weights of the lexemes in the index, and are forced to re-check
+ // once we get the result from the inverted index.
+ // Note that options 2 and 3 can both be present.
+ var weight tsWeight
+ if len(n.term.positions) > 0 {
+ weight = n.term.positions[0].weight
+ }
+ key := EncodeInvertedIndexKey(nil /* inKey */, n.term.lexeme)
+ var span inverted.Span
+
+ prefixMatch := weight&weightStar != 0
+ if prefixMatch {
+ span = inverted.Span{
+ Start: key,
+ End: EncodeInvertedIndexKey(nil /* inKey */, string(keysbase.PrefixEnd([]byte(n.term.lexeme)))),
+ }
+ } else {
+ span = inverted.MakeSingleValSpan(key)
+ }
+ invertedExpr := inverted.ExprForSpan(span, true /* tight */)
+ if !prefixMatch {
+ // If we don't have a prefix match we also can set unique=true.
+ invertedExpr.Unique = true
+ }
+
+ if weight != 0 && weight != weightStar {
+ // Some weights are set.
+ invertedExpr.SetNotTight()
+ }
+ return invertedExpr, nil
+ case followedby:
+ fallthrough
+ case and:
+ l, lErr := n.l.getInvertedExpr()
+ r, rErr := n.r.getInvertedExpr()
+ if lErr != nil && rErr != nil {
+ // We need a positive match on at least one side.
+ return nil, lErr
+ } else if lErr != nil {
+ // An error on one side means we have to re-check that side's condition
+ // later.
+ r.SetNotTight()
+ //nolint:returnerrcheck
+ return r, nil
+ } else if rErr != nil {
+ // Ditto above.
+ l.SetNotTight()
+ //nolint:returnerrcheck
+ return l, nil
+ }
+ expr := inverted.And(l, r)
+ if n.op == followedby {
+ // If we have a followedby match, we have to re-check the results of the
+ // match after we get them from the inverted index - just because both
+ // terms are present doesn't mean they're properly next to each other,
+ // and the index doesn't store position information at all.
+ expr.SetNotTight()
+ }
+ return expr, nil
+ case or:
+ l, lErr := n.l.getInvertedExpr()
+ r, rErr := n.r.getInvertedExpr()
+ if lErr != nil {
+ // We need a positive match on both sides, so we return an error here.
+ // For example, searching for a | !b would require a full scan, since some
+ // documents could match that contain neither a nor b.
+ return nil, lErr
+ } else if rErr != nil {
+ return nil, rErr
+ }
+ return inverted.Or(l, r), nil
+ case not:
+ // A not would require more advanced machinery than we have, so for now
+ // we'll just assume we can't perform an inverted expression search on a
+ // not. Note that a nested not would make it possible, but we are ignoring
+ // this case for now as it seems marginal.
+ return nil, errors.New("unable to create inverted expr for not")
+ }
+ return nil, errors.AssertionFailedf("invalid operator %d", n.op)
+}
+
func lexTSQuery(input string) (TSVector, error) {
parser := tsVectorLexer{
input: input,
|