Skip to content

Commit

Permalink
sql: inv idx accelerate tsvector@@tsquery queries
Browse files Browse the repository at this point in the history
This commit adds inverted index acceleration for expressions that
evaluate a tsquery against a tsvector using the `@@` operator.

Release note (sql change): it's now possible to run efficient tsvector
@@ tsquery searches when there is an inverted index on the tsvector
column being searched.
  • Loading branch information
jordanlewis committed Jan 20, 2023
1 parent d07f41b commit 22f6553
Show file tree
Hide file tree
Showing 14 changed files with 800 additions and 11 deletions.
4 changes: 4 additions & 0 deletions pkg/sql/inverted/expression.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,10 @@ type SpanExpression struct {
// JSON or Array SpanExpressions, and it holds when unique SpanExpressions
// are combined with And. It does not hold when these SpanExpressions are
// combined with Or.
//
// Note that the uniqueness property represented here holds for the *output*
// of the invertedFilter operator that executes the And/Or, not the raw, input
// spans of data.
Unique bool

// SpansToRead are the spans to read from the inverted index
Expand Down
96 changes: 96 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/tsvector
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,99 @@ query T
VALUES ( json_build_array($$'cat' & 'rat'$$:::TSQUERY)::JSONB)
----
["'cat' & 'rat'"]

# Test tsvector inverted indexes.
statement ok
DROP TABLE a;
CREATE TABLE a (
a TSVECTOR,
b TSQUERY,
INVERTED INDEX(a)
);
INSERT INTO a VALUES('foo:3 bar:4,5'), ('baz:1'), ('foo:3'), ('bar:2')

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'foo'
----
'bar':4,5 'foo':3
'foo':3

statement error index \"a_a_idx\" is inverted and cannot be used for this query
SELECT a FROM a@a_a_idx WHERE a @@ '!foo'

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'foo' OR a @@ 'bar'
----
'bar':4,5 'foo':3
'foo':3
'bar':2

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'foo | bar'
----
'bar':4,5 'foo':3
'foo':3
'bar':2

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'foo | bar' OR a @@ 'baz'
----
'bar':4,5 'foo':3
'baz':1
'foo':3
'bar':2

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'foo & bar'
----
'bar':4,5 'foo':3

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'foo <-> bar'
----
'bar':4,5 'foo':3

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'bar <-> foo'
----

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'foo <-> !bar'
----
'foo':3

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ '!baz <-> bar'
----
'bar':4,5 'foo':3
'bar':2

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'foo & !bar'
----
'foo':3

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'ba:*'
----
'bar':4,5 'foo':3
'baz':1
'bar':2

query T rowsort
SELECT a FROM a@a_a_idx WHERE a @@ 'ba:* | foo'
----
'bar':4,5 'foo':3
'baz':1
'foo':3
'bar':2

query T
SELECT a FROM a@a_a_idx WHERE a @@ 'ba:* & foo'
----
'bar':4,5 'foo':3

# Test that tsvector indexes can't accelerate the @@ operator with no constant
# columns.
statement error index \"a_a_idx\" is inverted and cannot be used for this query
EXPLAIN SELECT * FROM a@a_a_idx WHERE a @@ b
171 changes: 171 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/tsvector_index
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# LogicTest: local

statement ok
CREATE TABLE a (
a INT PRIMARY KEY,
b TSVECTOR,
c TSQUERY,
FAMILY (a,b,c),
INVERTED INDEX(b)
)

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'Foo'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo' OR b @@ 'bar'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 2
└── • scan
missing stats
table: a@a_b_idx
spans: 2 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 2
└── • scan
missing stats
table: a@a_b_idx
spans: 2 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo | bar' OR b @@ 'baz'
----
distribution: local
vectorized: true
·
• index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 3
└── • scan
missing stats
table: a@a_b_idx
spans: 3 spans

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & bar'
----
distribution: local
vectorized: true
·
• lookup join
│ table: a@a_pkey
│ equality: (a) = (a)
│ equality cols are key
│ pred: b @@ '''foo'' & ''bar'''
└── • zigzag join
left table: a@a_b_idx
left columns: (a, b_inverted_key)
left fixed values: 1 column
right table: a@a_b_idx
right columns: (a, b_inverted_key)
right fixed values: 1 column

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo <-> bar'
----
distribution: local
vectorized: true
·
• lookup join
│ table: a@a_pkey
│ equality: (a) = (a)
│ equality cols are key
│ pred: b @@ '''foo'' <-> ''bar'''
└── • zigzag join
left table: a@a_b_idx
left columns: (a, b_inverted_key)
left fixed values: 1 column
right table: a@a_b_idx
right columns: (a, b_inverted_key)
right fixed values: 1 column

query T
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ 'foo & !bar'
----
distribution: local
vectorized: true
·
• filter
│ filter: b @@ '''foo'' & !''bar'''
└── • index join
│ table: a@a_pkey
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span


query T
EXPLAIN SELECT a FROM a@a_b_idx WHERE b @@ 'ba:*'
----
distribution: local
vectorized: true
·
• inverted filter
│ inverted column: b_inverted_key
│ num spans: 1
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span


# Test that tsvector indexes can't accelerate the @@ operator with no constant
# columns.
statement error index \"a_b_idx\" is inverted and cannot be used for this query
EXPLAIN SELECT * FROM a@a_b_idx WHERE b @@ c
7 changes: 7 additions & 0 deletions pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pkg/sql/opt/invertedidx/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ go_library(
"inverted_index_expr.go",
"json_array.go",
"trigram.go",
"tsearch.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/sql/opt/invertedidx",
visibility = ["//visibility:public"],
Expand Down Expand Up @@ -49,6 +50,7 @@ go_test(
"geo_test.go",
"json_array_test.go",
"trigram_test.go",
"tsearch_test.go",
],
args = ["-test.timeout=55s"],
deps = [
Expand Down
13 changes: 11 additions & 2 deletions pkg/sql/opt/invertedidx/inverted_index_expr.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,18 +116,27 @@ func TryFilterInvertedIndex(
} else {
col := index.InvertedColumn().InvertedSourceColumnOrdinal()
typ = factory.Metadata().Table(tabID).Column(col).DatumType()
if typ.Family() == types.StringFamily {
switch typ.Family() {
case types.StringFamily:
filterPlanner = &trigramFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
} else {
case types.TSVectorFamily:
filterPlanner = &tsqueryFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
case types.JsonFamily, types.ArrayFamily:
filterPlanner = &jsonOrArrayFilterPlanner{
tabID: tabID,
index: index,
computedColumns: computedColumns,
}
default:
return nil, nil, nil, nil, false
}
}

Expand Down
Loading

0 comments on commit 22f6553

Please sign in to comment.