Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
92966: builtins: add to_tsvector {phrase,plain,}to_tsquery r=jordanlewis a=jordanlewis

Updates: cockroachdb#41288
Epic: CRDB-22357

Release note (sql change): Added the to_tsvector, to_tsquery, phraseto_tsquery, and plainto_tsquery builtins. to_tsvector parses an input document into a tsvector; to_tsquery, phraseto_tsquery, and plainto_tsquery parse their input into a tsquery.

Co-authored-by: Jordan Lewis <[email protected]>
  • Loading branch information
craig[bot] and jordanlewis committed Feb 28, 2023
2 parents b08d705 + 67179af commit 3ed2c8f
Show file tree
Hide file tree
Showing 9 changed files with 459 additions and 4 deletions.
17 changes: 17 additions & 0 deletions docs/generated/sql/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -883,6 +883,23 @@ available replica will error.</p>
</span></td><td>Immutable</td></tr></tbody>
</table>

### Full Text Search functions

<table>
<thead><tr><th>Function &rarr; Returns</th><th>Description</th><th>Volatility</th></tr></thead>
<tbody>
<tr><td><a name="phraseto_tsquery"></a><code>phraseto_tsquery(config: <a href="string.html">string</a>, text: <a href="string.html">string</a>) &rarr; tsquery</code></td><td><span class="funcdesc"><p>Converts text to a tsquery, normalizing words according to the specified or default configuration. The &lt;-&gt; operator is inserted between each token in the input.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="plainto_tsquery"></a><code>plainto_tsquery(config: <a href="string.html">string</a>, text: <a href="string.html">string</a>) &rarr; tsquery</code></td><td><span class="funcdesc"><p>Converts text to a tsquery, normalizing words according to the specified or default configuration. The &amp; operator is inserted between each token in the input.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="to_tsquery"></a><code>to_tsquery(config: <a href="string.html">string</a>, text: <a href="string.html">string</a>) &rarr; tsquery</code></td><td><span class="funcdesc"><p>Converts the input text into a tsquery by normalizing each word in the input according to the specified or default configuration. The input must already be formatted like a tsquery, in other words, subsequent tokens must be connected by a tsquery operator (&amp;, |, &lt;-&gt;, !).</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="to_tsvector"></a><code>to_tsvector(config: <a href="string.html">string</a>, text: <a href="string.html">string</a>) &rarr; tsvector</code></td><td><span class="funcdesc"><p>Converts text to a tsvector, normalizing words according to the specified or default configuration. Position information is included in the result.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="ts_parse"></a><code>ts_parse(parser_name: <a href="string.html">string</a>, document: <a href="string.html">string</a>) &rarr; tuple{int AS tokid, string AS token}</code></td><td><span class="funcdesc"><p>ts_parse parses the given document and returns a series of records, one for each token produced by parsing. Each record includes a tokid showing the assigned token type and a token which is the text of the token.</p>
</span></td><td>Stable</td></tr></tbody>
</table>

### Fuzzy String Matching functions

<table>
Expand Down
35 changes: 35 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/tsvector
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,38 @@ EXPLAIN SELECT * FROM a@a_a_idx WHERE a @@ b
statement ok
CREATE TABLE t95680 (c1 FLOAT NOT NULL, c2 TSVECTOR NOT NULL, INVERTED INDEX (c1 ASC, c2 ASC));
INSERT INTO t95680 VALUES (1.0::FLOAT, e'\'kCrLZNl\' \'sVDj\' \'yO\' \'z\':54C,440B,519C,794B':::TSVECTOR);

# More tests for these functions live in pkg/util/tsearch/testdata
query IT
SELECT * FROM ts_parse('default', 'Hello this is a parsi-ng t.est 1.234 4 case324')
----
1 Hello
1 this
1 is
1 a
1 parsi
1 ng
1 t
1 est
1 1
1 234
1 4
1 case324

query T
SELECT * FROM to_tsvector('simple', 'Hello this is a parsi-ng t.est 1.234 4 case324')
----
'1':9 '234':10 '4':11 'a':4 'case324':12 'est':8 'hello':1 'is':3 'ng':6 'parsi':5 't':7 'this':2

query T
SELECT * FROM phraseto_tsquery('simple', 'Hello this is a parsi-ng t.est 1.234 4 case324')
----
'hello' <-> 'this' <-> 'is' <-> 'a' <-> 'parsi' <-> 'ng' <-> 't' <-> 'est' <-> '1' <-> '234' <-> '4' <-> 'case324'

query T
SELECT * FROM to_tsquery('simple', 'a | b & c <-> d')
----
'a' | 'b' & 'c' <-> 'd'

query error syntax
SELECT * FROM to_tsquery('simple', 'Hello this is a parsi-ng t.est 1.234 4 case324')
2 changes: 2 additions & 0 deletions pkg/sql/sem/builtins/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ go_library(
"show_create_all_tables_builtin.go",
"show_create_all_types_builtin.go",
"trigram_builtins.go",
"tsearch_builtins.go",
"window_builtins.go",
"window_frame_builtins.go",
],
Expand Down Expand Up @@ -126,6 +127,7 @@ go_library(
"//pkg/util/tracing",
"//pkg/util/tracing/tracingpb",
"//pkg/util/trigram",
"//pkg/util/tsearch",
"//pkg/util/ulid",
"//pkg/util/unaccent",
"//pkg/util/uuid",
Expand Down
4 changes: 0 additions & 4 deletions pkg/sql/sem/builtins/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -3791,13 +3791,9 @@ value if you rely on the HLC for accuracy.`,
"array_to_tsvector": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"get_current_ts_config": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"numnode": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"plainto_tsquery": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"phraseto_tsquery": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"querytree": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"setweight": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"strip": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"to_tsquery": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"to_tsvector": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"json_to_tsvector": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"jsonb_to_tsvector": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"ts_delete": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
Expand Down
5 changes: 5 additions & 0 deletions pkg/sql/sem/builtins/fixed_oids.go
Original file line number Diff line number Diff line change
Expand Up @@ -2045,6 +2045,11 @@ var builtinOidsArray = []string{
2069: `crdb_internal.create_tenant(parameters: jsonb) -> int`,
2070: `crdb_internal.num_inverted_index_entries(val: tsvector, version: int) -> int`,
2072: `crdb_internal.upsert_dropped_relation_gc_ttl(desc_id: int, gc_ttl: interval) -> bool`,
2073: `to_tsquery(config: string, text: string) -> tsquery`,
2074: `to_tsvector(config: string, text: string) -> tsvector`,
2075: `phraseto_tsquery(config: string, text: string) -> tsquery`,
2076: `plainto_tsquery(config: string, text: string) -> tsquery`,
2077: `ts_parse(parser_name: string, document: string) -> tuple{int AS tokid, string AS token}`,
}

var builtinOidsBySignature map[string]oid.Oid
Expand Down
169 changes: 169 additions & 0 deletions pkg/sql/sem/builtins/tsearch_builtins.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package builtins

import (
"context"

"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins/builtinconstants"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sem/volatility"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/tsearch"
)

// init registers every builtin defined in tsearchBuiltins. Before registering,
// each definition is tagged with the full text search category and marked as
// available on the public schema.
func init() {
	for name, def := range tsearchBuiltins {
		// def is a copy of the map value (the map stores builtinDefinition by
		// value), so these assignments affect only what is handed to
		// registerBuiltin, not the entry stored in tsearchBuiltins.
		def.props.Category = builtinconstants.CategoryFullTextSearch
		def.props.AvailableOnPublicSchema = true
		registerBuiltin(name, def)
	}
}

// tsParseGenerator is the value generator returned by the ts_parse builtin. It
// produces one row for each token that tsearch.TSParse yields for the input
// document.
type tsParseGenerator struct {
// input is the document text passed to ts_parse.
input string
// tokens holds the tokens that have not been emitted yet. It is populated by
// Start and consumed from the front by Next.
tokens []string
// nextToken is the token most recently removed from tokens by Next; Values
// reports it as the current row.
nextToken string
}

// ResolvedType returns the row type produced by the generator: the labeled
// tuple stored in tsParseType.
//
// A pointer receiver is used so that the method set matches Start and Next,
// which must take a pointer because they mutate the generator. The generator is
// only ever handed out as *tsParseGenerator.
func (t *tsParseGenerator) ResolvedType() *types.T {
	return tsParseType
}

// Start tokenizes the input document with tsearch.TSParse and stores the
// resulting tokens for later calls to Next. The context and transaction
// arguments are unused. It always returns a nil error.
func (g *tsParseGenerator) Start(_ context.Context, _ *kv.Txn) error {
	parsed := tsearch.TSParse(g.input)
	g.tokens = parsed
	return nil
}

// Next advances the generator to the following token. It returns true when a
// token was taken off the front of the pending list and is now available
// through Values, and false once no tokens remain. It never returns an error.
func (g *tsParseGenerator) Next(_ context.Context) (bool, error) {
	if len(g.tokens) > 0 {
		// Pop the head of the pending tokens and remember it as the current row.
		g.nextToken = g.tokens[0]
		g.tokens = g.tokens[1:]
		return true, nil
	}
	return false, nil
}

// Values returns the current row as a (tokid, token) pair. The token is the one
// most recently selected by Next; the tokid is always the constant 1.
//
// A pointer receiver is used so that the method set matches Start and Next and
// the generator struct is not copied on every row.
func (t *tsParseGenerator) Values() (tree.Datums, error) {
	return tree.Datums{tree.NewDInt(1), tree.NewDString(t.nextToken)}, nil
}

// Close is a no-op: the generator holds only in-memory slices and has nothing
// to release.
//
// A pointer receiver is used so that the method set matches Start and Next.
func (t *tsParseGenerator) Close(_ context.Context) {}

// tsParseType is the row type reported by tsParseGenerator.ResolvedType: a
// labeled tuple with an int column "tokid" and a string column "token".
var tsParseType = types.MakeLabeledTuple(
[]*types.T{types.Int, types.String},
[]string{"tokid", "token"},
)

// tsearchBuiltins holds the full text search builtins defined in this file,
// keyed by SQL function name. The init function in this file registers every
// entry.
var tsearchBuiltins = map[string]builtinDefinition{
	// ts_parse is a generator builtin: it returns one (tokid, token) row per
	// token of the document. Only the parser named "default" is accepted.
	"ts_parse": makeBuiltin(genProps(),
		makeGeneratorOverload(
			tree.ParamTypes{{Name: "parser_name", Typ: types.String}, {Name: "document", Typ: types.String}},
			// Reuse the tuple type that tsParseGenerator.ResolvedType reports so that
			// the declared return type and the generated row type cannot drift apart.
			tsParseType,
			func(_ context.Context, _ *eval.Context, args tree.Datums) (eval.ValueGenerator, error) {
				parserName := string(tree.MustBeDString(args[0]))
				if parserName != "default" {
					return nil, pgerror.Newf(pgcode.UndefinedObject, "text search parser %q does not exist", parserName)
				}
				return &tsParseGenerator{input: string(tree.MustBeDString(args[1]))}, nil
			},
			"ts_parse parses the given document and returns a series of records, "+
				"one for each token produced by parsing. "+
				"Each record includes a tokid showing the assigned token type and a token which is the text of the token.",
			volatility.Stable,
		),
	),

	// Scalar full text search functions. Each takes a configuration name and an
	// input text, delegates to the matching function in the tsearch package, and
	// wraps the result in the corresponding datum type. Errors from tsearch are
	// returned unchanged.

	// to_tsvector converts a document into a tsvector.
	"to_tsvector": makeBuiltin(
		tree.FunctionProperties{},
		tree.Overload{
			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
			ReturnType: tree.FixedReturnType(types.TSVector),
			Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
				// Parse, stem, and stopword the input.
				config := string(tree.MustBeDString(args[0]))
				document := string(tree.MustBeDString(args[1]))
				vector, err := tsearch.DocumentToTSVector(config, document)
				if err != nil {
					return nil, err
				}
				return &tree.DTSVector{TSVector: vector}, nil
			},
			Info: "Converts text to a tsvector, normalizing words according to the specified or default configuration. " +
				"Position information is included in the result.",
			Volatility: volatility.Immutable,
		},
	),
	// to_tsquery converts text that is already in tsquery syntax into a tsquery.
	"to_tsquery": makeBuiltin(
		tree.FunctionProperties{},
		tree.Overload{
			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
			ReturnType: tree.FixedReturnType(types.TSQuery),
			Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
				config := string(tree.MustBeDString(args[0]))
				input := string(tree.MustBeDString(args[1]))
				query, err := tsearch.ToTSQuery(config, input)
				if err != nil {
					return nil, err
				}
				return &tree.DTSQuery{TSQuery: query}, nil
			},
			Info: "Converts the input text into a tsquery by normalizing each word in the input according to " +
				"the specified or default configuration. The input must already be formatted like a tsquery, in other words, " +
				"subsequent tokens must be connected by a tsquery operator (&, |, <->, !).",
			Volatility: volatility.Immutable,
		},
	),
	// plainto_tsquery converts free-form text into a tsquery.
	"plainto_tsquery": makeBuiltin(
		tree.FunctionProperties{},
		tree.Overload{
			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
			ReturnType: tree.FixedReturnType(types.TSQuery),
			Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
				config := string(tree.MustBeDString(args[0]))
				input := string(tree.MustBeDString(args[1]))
				query, err := tsearch.PlainToTSQuery(config, input)
				if err != nil {
					return nil, err
				}
				return &tree.DTSQuery{TSQuery: query}, nil
			},
			Info: "Converts text to a tsquery, normalizing words according to the specified or default configuration." +
				" The & operator is inserted between each token in the input.",
			Volatility: volatility.Immutable,
		},
	),
	// phraseto_tsquery converts free-form text into a phrase tsquery.
	"phraseto_tsquery": makeBuiltin(
		tree.FunctionProperties{},
		tree.Overload{
			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
			ReturnType: tree.FixedReturnType(types.TSQuery),
			Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
				config := string(tree.MustBeDString(args[0]))
				input := string(tree.MustBeDString(args[1]))
				query, err := tsearch.PhraseToTSQuery(config, input)
				if err != nil {
					return nil, err
				}
				return &tree.DTSQuery{TSQuery: query}, nil
			},
			Info: "Converts text to a tsquery, normalizing words according to the specified or default configuration." +
				" The <-> operator is inserted between each token in the input.",
			Volatility: volatility.Immutable,
		},
	),
}
45 changes: 45 additions & 0 deletions pkg/sql/sem/eval/testdata/eval/tsearch
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,48 @@ eval
'bar:3 baz:5'::tsvector @@ 'baz <2> bar'::tsquery
----
false

eval
to_tsvector('simple', 'You have power over your mind – not outside events. Realize this, and you will find strength.')
----
'and':12 'events':9 'find':15 'have':2 'mind':6 'not':7 'outside':8 'over':4 'power':3 'realize':10 'strength':16 'this':11 'will':14 'you':1,13 'your':5

eval
to_tsquery('simple', 'hello')
----
'hello'

eval
to_tsquery('simple', 'hello | there')
----
'hello' | 'there'

eval
to_tsquery('simple', 'hello | the#re')
----
'hello' | 'the' <-> 're'

eval
plainto_tsquery('simple', 'hello there')
----
'hello' & 'there'

eval
plainto_tsquery('simple', 'hello the#re')
----
'hello' & 'the' & 're'

eval
phraseto_tsquery('simple', 'You have power over your mind – not outside events. Realize this, and you will find strength.')
----
'you' <-> 'have' <-> 'power' <-> 'over' <-> 'your' <-> 'mind' <-> 'not' <-> 'outside' <-> 'events' <-> 'realize' <-> 'this' <-> 'and' <-> 'you' <-> 'will' <-> 'find' <-> 'strength'

eval
phraseto_tsquery('simple', 'hello there')
----
'hello' <-> 'there'

eval
phraseto_tsquery('simple', 'hello the#re')
----
'hello' <-> 'the' <-> 're'
Loading

0 comments on commit 3ed2c8f

Please sign in to comment.