Skip to content

Commit

Permalink
Merge #93769
Browse files Browse the repository at this point in the history
93769: sql: add inverted indexing for tsvector and acceleration for @@ queries r=jordanlewis a=jordanlewis

Updates #41288 

This PR adds the ability to create an inverted index on a tsvector column, and adds inverted index acceleration for expressions that evaluate a tsquery against a tsvector using the ``@@`` operator.

Epic: CRDB-22357

Co-authored-by: Jordan Lewis <[email protected]>
  • Loading branch information
craig[bot] and jordanlewis committed Jan 21, 2023
2 parents 6f87274 + 5dd8ae2 commit b21379b
Show file tree
Hide file tree
Showing 22 changed files with 846 additions and 13 deletions.
2 changes: 2 additions & 0 deletions docs/generated/sql/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -3197,6 +3197,8 @@ active for the current transaction.</p>
</span></td><td>Stable</td></tr>
<tr><td><a name="crdb_internal.num_inverted_index_entries"></a><code>crdb_internal.num_inverted_index_entries(val: jsonb, version: <a href="int.html">int</a>) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>This function is used only by CockroachDB’s developers for testing purposes.</p>
</span></td><td>Stable</td></tr>
<tr><td><a name="crdb_internal.num_inverted_index_entries"></a><code>crdb_internal.num_inverted_index_entries(val: tsvector, version: <a href="int.html">int</a>) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>This function is used only by CockroachDB’s developers for testing purposes.</p>
</span></td><td>Stable</td></tr>
<tr><td><a name="crdb_internal.payloads_for_span"></a><code>crdb_internal.payloads_for_span(span_id: <a href="int.html">int</a>) &rarr; tuple{string AS payload_type, jsonb AS payload_jsonb}</code></td><td><span class="funcdesc"><p>Returns the payload(s) of the requested span and all its children.</p>
</span></td><td>Volatile</td></tr>
<tr><td><a name="crdb_internal.payloads_for_trace"></a><code>crdb_internal.payloads_for_trace(trace_id: <a href="int.html">int</a>) &rarr; tuple{int AS span_id, string AS payload_type, jsonb AS payload_jsonb}</code></td><td><span class="funcdesc"><p>Returns the payload(s) of the requested trace.</p>
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/catalog/colinfo/col_type_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ func ColumnTypeIsOnlyInvertedIndexable(t *types.T) bool {
case types.JsonFamily:
case types.GeographyFamily:
case types.GeometryFamily:
case types.TSVectorFamily:
default:
return false
}
Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/create_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,12 @@ func populateInvertedIndexDescriptor(
return newUndefinedOpclassError(invCol.OpClass)
}
indexDesc.InvertedColumnKinds[0] = catpb.InvertedIndexColumnKind_TRIGRAM
case types.TSVectorFamily:
switch invCol.OpClass {
case "tsvector_ops", "":
default:
return newUndefinedOpclassError(invCol.OpClass)
}
default:
return tabledesc.NewInvalidInvertedColumnError(column.GetName(), column.GetType().Name())
}
Expand Down
16 changes: 11 additions & 5 deletions pkg/sql/inverted/expression.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,11 +300,17 @@ type SpanExpression struct {
// Tight mirrors the definition of IsTight().
Tight bool

// Unique is true if the spans are guaranteed not to produce duplicate
// primary keys. Otherwise, Unique is false. Unique may be true for certain
// JSON or Array SpanExpressions, and it holds when unique SpanExpressions
// are combined with And. It does not hold when these SpanExpressions are
// combined with Or.
// Unique is true if the spans in FactoredUnionSpans are guaranteed not to
// produce duplicate primary keys. Otherwise, Unique is false. Unique may
// be true for certain JSON or Array SpanExpressions, and it holds when
// unique SpanExpressions are combined with And. It does not hold when
// these SpanExpressions are combined with Or.
//
// Once a SpanExpression is built, this field is relevant if the root
// SpanExpression has no children (i.e., Operator is None). In this case,
// Unique is used to determine whether an invertedFilter is needed on top
// of the inverted index scan to deduplicate keys (an invertedFilter is
// always necessary if Operator is not None).
Unique bool

// SpansToRead are the spans to read from the inverted index
Expand Down
96 changes: 96 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/tsvector
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,99 @@ query T
VALUES ( json_build_array($$'cat' & 'rat'$$:::TSQUERY)::JSONB)
----
["'cat' & 'rat'"]

# Test tsvector inverted indexes.
statement ok
DROP TABLE a;
CREATE TABLE a (
a TSVECTOR,
b TSQUERY,
INVERTED INDEX(a)
);
INSERT INTO a VALUES('foo:3 bar:4,5'), ('baz:1'), ('foo:3'), ('bar:2')

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'foo'
----
'bar':4,5 'foo':3
'foo':3

statement error index \"a_a_idx\" is inverted and cannot be used for this query
SELECT a FROM a@a_a_idx WHERE a @@ '!foo'

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'foo' OR a @@ 'bar'
----
'bar':4,5 'foo':3
'foo':3
'bar':2

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'foo | bar'
----
'bar':4,5 'foo':3
'foo':3
'bar':2

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'foo | bar' OR a @@ 'baz'
----
'bar':4,5 'foo':3
'baz':1
'foo':3
'bar':2

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'foo & bar'
----
'bar':4,5 'foo':3

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'foo <-> bar'
----
'bar':4,5 'foo':3

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'bar <-> foo'
----

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'foo <-> !bar'
----
'foo':3

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ '!baz <-> bar'
----
'bar':4,5 'foo':3
'bar':2

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'foo & !bar'
----
'foo':3

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'ba:*'
----
'bar':4,5 'foo':3
'baz':1
'bar':2

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'ba:* | foo'
----
'bar':4,5 'foo':3
'baz':1
'foo':3
'bar':2

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'ba:* & foo'
----
'bar':4,5 'foo':3

# Test that tsvector indexes can't accelerate the @@ operator with no constant
# columns.
statement error index \"a_a_idx\" is inverted and cannot be used for this query
EXPLAIN SELECT * FROM a@a_a_idx WHERE a @@ b
171 changes: 171 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/tsvector_index
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# LogicTest: local

statement ok
CREATE TABLE a (
a INT PRIMARY KEY,
b TSVECTOR,
c TSQUERY,
FAMILY (a,b,c),
INVERTED INDEX(b)
)

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'Foo'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo' OR b @@ 'bar'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 2
└── • scan
missing stats
table: a@a_b_idx
spans: 2 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 2
└── • scan
missing stats
table: a@a_b_idx
spans: 2 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar' OR b @@ 'baz'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 3
└── • scan
missing stats
table: a@a_b_idx
spans: 3 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & bar'
----
distribution: local
vectorized: true
·
• lookup join
│ table: a@a_pkey
│ equality: (a) = (a)
│ equality cols are key
│ pred: b @@ '''foo'' & ''bar'''
└── • zigzag join
left table: a@a_b_idx
left columns: (a, b_inverted_key)
left fixed values: 1 column
right table: a@a_b_idx
right columns: (a, b_inverted_key)
right fixed values: 1 column

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo <-> bar'
----
distribution: local
vectorized: true
·
• lookup join
│ table: a@a_pkey
│ equality: (a) = (a)
│ equality cols are key
│ pred: b @@ '''foo'' <-> ''bar'''
└── • zigzag join
left table: a@a_b_idx
left columns: (a, b_inverted_key)
left fixed values: 1 column
right table: a@a_b_idx
right columns: (a, b_inverted_key)
right fixed values: 1 column

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & !bar'
----
distribution: local
vectorized: true
·
• filter
│ filter: b @@ '''foo'' & !''bar'''
└── • index join
│ table: a@a_pkey
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span


query T
EXPLAIN SELECT a FROM a@a_b_idx WHERE b @@ 'ba:*'
----
distribution: local
vectorized: true
·
• inverted filter
│ inverted column: b_inverted_key
│ num spans: 1
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span


# Test that tsvector indexes can't accelerate the @@ operator with no constant
# columns.
statement error index \"a_b_idx\" is inverted and cannot be used for this query
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ c
7 changes: 7 additions & 0 deletions pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pkg/sql/opt/invertedidx/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ go_library(
"inverted_index_expr.go",
"json_array.go",
"trigram.go",
"tsearch.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx",
visibility = ["//visibility:public"],
Expand Down Expand Up @@ -49,6 +50,7 @@ go_test(
"geo_test.go",
"json_array_test.go",
"trigram_test.go",
"tsearch_test.go",
],
args = ["-test.timeout=55s"],
deps = [
Expand Down
13 changes: 11 additions & 2 deletions pkg/sql/opt/invertedidx/inverted_index_expr.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,18 +116,27 @@ func TryFilterInvertedIndex(
} else {
col := index.InvertedColumn().InvertedSourceColumnOrdinal()
typ = factory.Metadata().Table(tabID).Column(col).DatumType()
if typ.Family() == types.StringFamily {
switch typ.Family() {
case types.StringFamily:
filterPlanner = &trigramFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
} else {
case types.TSVectorFamily:
filterPlanner = &tsqueryFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
case types.JsonFamily, types.ArrayFamily:
filterPlanner = &jsonOrArrayFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
default:
return nil, nil, nil, nil, false
}
}

Expand Down
Loading

0 comments on commit b21379b

Please sign in to comment.