diff --git a/docs/generated/sql/functions.md b/docs/generated/sql/functions.md
index 12f3f5a85bc7..97a71432d98e 100644
--- a/docs/generated/sql/functions.md
+++ b/docs/generated/sql/functions.md
@@ -883,6 +883,23 @@ available replica will error.

 </span></td><td>Immutable</td></tr>
 </tbody></table>
+### Full Text Search functions
+
+<table>
+<thead><tr><th>Function &rarr; Returns</th><th>Description</th><th>Volatility</th></tr></thead>
+<tbody>
+<tr><td><a name="phraseto_tsquery"></a><code>phraseto_tsquery(config: string, text: string) &rarr; tsquery</code></td><td><span class="funcdesc"><p>Converts text to a tsquery, normalizing words according to the specified or default configuration. The &lt;-&gt; operator is inserted between each token in the input.</p>
+</span></td><td>Immutable</td></tr>
+<tr><td><a name="plainto_tsquery"></a><code>plainto_tsquery(config: string, text: string) &rarr; tsquery</code></td><td><span class="funcdesc"><p>Converts text to a tsquery, normalizing words according to the specified or default configuration. The &amp; operator is inserted between each token in the input.</p>
+</span></td><td>Immutable</td></tr>
+<tr><td><a name="to_tsquery"></a><code>to_tsquery(config: string, text: string) &rarr; tsquery</code></td><td><span class="funcdesc"><p>Converts the input text into a tsquery by normalizing each word in the input according to the specified or default configuration. The input must already be formatted like a tsquery, in other words, subsequent tokens must be connected by a tsquery operator (&amp;, |, &lt;-&gt;, !).</p>
+</span></td><td>Immutable</td></tr>
+<tr><td><a name="to_tsvector"></a><code>to_tsvector(config: string, text: string) &rarr; tsvector</code></td><td><span class="funcdesc"><p>Converts text to a tsvector, normalizing words according to the specified or default configuration. Position information is included in the result.</p>
+</span></td><td>Immutable</td></tr>
+<tr><td><a name="ts_parse"></a><code>ts_parse(parser_name: string, document: string) &rarr; tuple{int AS tokid, string AS token}</code></td><td><span class="funcdesc"><p>ts_parse parses the given document and returns a series of records, one for each token produced by parsing. Each record includes a tokid showing the assigned token type and a token which is the text of the token.</p>
+</span></td><td>Stable</td></tr>
+</tbody></table>
+
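For readers who want to try the new builtins end to end, here is a minimal client-side sketch (not part of this patch) that exercises them from Go via `database/sql`. The lib/pq driver, the connection string, and the `::string` casts used to scan the results as text are all assumptions, not requirements of the patch.

```go
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq"
)

func main() {
	// Hypothetical DSN; any cluster with these builtins available will do.
	db, err := sql.Open("postgres",
		"postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	var vec, q string
	// to_tsvector tokenizes, lowercases, and records positions.
	if err := db.QueryRow(
		`SELECT to_tsvector('simple', 'Hello hello world')::string`).Scan(&vec); err != nil {
		log.Fatal(err)
	}
	// plainto_tsquery joins the normalized tokens with &.
	if err := db.QueryRow(
		`SELECT plainto_tsquery('simple', 'hello world')::string`).Scan(&q); err != nil {
		log.Fatal(err)
	}
	fmt.Println(vec) // expected: 'hello':1,2 'world':3
	fmt.Println(q)   // expected: 'hello' & 'world'
}
```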
 ### Fuzzy String Matching functions
diff --git a/pkg/sql/logictest/testdata/logic_test/tsvector b/pkg/sql/logictest/testdata/logic_test/tsvector
index d080df2d04aa..1c1f096ec96c 100644
--- a/pkg/sql/logictest/testdata/logic_test/tsvector
+++ b/pkg/sql/logictest/testdata/logic_test/tsvector
@@ -240,3 +240,38 @@ EXPLAIN SELECT * FROM a@a_a_idx WHERE a @@ b
 statement ok
 CREATE TABLE t95680 (c1 FLOAT NOT NULL, c2 TSVECTOR NOT NULL, INVERTED INDEX (c1 ASC, c2 ASC));
 INSERT INTO t95680 VALUES (1.0::FLOAT, e'\'kCrLZNl\' \'sVDj\' \'yO\' \'z\':54C,440B,519C,794B':::TSVECTOR);
+
+# More tests for these functions live in pkg/util/tsearch/testdata
+query IT
+SELECT * FROM ts_parse('default', 'Hello this is a parsi-ng t.est 1.234 4 case324')
+----
+1  Hello
+1  this
+1  is
+1  a
+1  parsi
+1  ng
+1  t
+1  est
+1  1
+1  234
+1  4
+1  case324
+
+query T
+SELECT * FROM to_tsvector('simple', 'Hello this is a parsi-ng t.est 1.234 4 case324')
+----
+'1':9 '234':10 '4':11 'a':4 'case324':12 'est':8 'hello':1 'is':3 'ng':6 'parsi':5 't':7 'this':2
+
+query T
+SELECT * FROM phraseto_tsquery('simple', 'Hello this is a parsi-ng t.est 1.234 4 case324')
+----
+'hello' <-> 'this' <-> 'is' <-> 'a' <-> 'parsi' <-> 'ng' <-> 't' <-> 'est' <-> '1' <-> '234' <-> '4' <-> 'case324'
+
+query T
+SELECT * FROM to_tsquery('simple', 'a | b & c <-> d')
+----
+'a' | 'b' & 'c' <-> 'd'
+
+query error syntax
+SELECT * FROM to_tsquery('simple', 'Hello this is a parsi-ng t.est 1.234 4 case324')
diff --git a/pkg/sql/sem/builtins/BUILD.bazel b/pkg/sql/sem/builtins/BUILD.bazel
index 07a9f086a962..41b764f4d186 100644
--- a/pkg/sql/sem/builtins/BUILD.bazel
+++ b/pkg/sql/sem/builtins/BUILD.bazel
@@ -24,6 +24,7 @@ go_library(
         "show_create_all_tables_builtin.go",
         "show_create_all_types_builtin.go",
         "trigram_builtins.go",
+        "tsearch_builtins.go",
         "window_builtins.go",
         "window_frame_builtins.go",
     ],
@@ -126,6 +127,7 @@ go_library(
         "//pkg/util/tracing",
         "//pkg/util/tracing/tracingpb",
         "//pkg/util/trigram",
+        "//pkg/util/tsearch",
         "//pkg/util/ulid",
         "//pkg/util/unaccent",
         "//pkg/util/uuid",
diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go
index 08f89d47ca95..cf083f36475d 100644
--- a/pkg/sql/sem/builtins/builtins.go
+++ b/pkg/sql/sem/builtins/builtins.go
@@ -3791,13 +3791,9 @@ value if you rely on the HLC for accuracy.`,
 	"array_to_tsvector":     makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
 	"get_current_ts_config": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
 	"numnode":               makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
-	"plainto_tsquery":       makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
-	"phraseto_tsquery":      makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
 	"querytree":             makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
 	"setweight":             makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
 	"strip":                 makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
-	"to_tsquery":            makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
-	"to_tsvector":           makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
 	"json_to_tsvector":      makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
 	"jsonb_to_tsvector":     makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
 	"ts_delete":             makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
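The `query error syntax` case above falls directly out of to_tsquery's contract: the input must already be operator-separated. A minimal sketch (not part of the patch) using the new `pkg/util/tsearch` package, assuming TSQuery prints itself the way the testdata output suggests:

```go
package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/tsearch"
)

func main() {
	// Two adjacent tokens with no connecting operator: a syntax error,
	// matching the `query error syntax` logic test above.
	if _, err := tsearch.ToTSQuery("simple", "hello world"); err != nil {
		fmt.Println(err) // syntax error in TSQuery: hello world
	}

	// Operator-separated input parses into a query tree.
	q, err := tsearch.ToTSQuery("simple", "a | b & c <-> d")
	if err != nil {
		panic(err)
	}
	fmt.Println(q) // 'a' | 'b' & 'c' <-> 'd'
}
```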
diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go
index 1e1ae1faaaa8..42df35e3e48f 100644
--- a/pkg/sql/sem/builtins/fixed_oids.go
+++ b/pkg/sql/sem/builtins/fixed_oids.go
@@ -2045,6 +2045,11 @@ var builtinOidsArray = []string{
 	2069: `crdb_internal.create_tenant(parameters: jsonb) -> int`,
 	2070: `crdb_internal.num_inverted_index_entries(val: tsvector, version: int) -> int`,
 	2072: `crdb_internal.upsert_dropped_relation_gc_ttl(desc_id: int, gc_ttl: interval) -> bool`,
+	2073: `to_tsquery(config: string, text: string) -> tsquery`,
+	2074: `to_tsvector(config: string, text: string) -> tsvector`,
+	2075: `phraseto_tsquery(config: string, text: string) -> tsquery`,
+	2076: `plainto_tsquery(config: string, text: string) -> tsquery`,
+	2077: `ts_parse(parser_name: string, document: string) -> tuple{int AS tokid, string AS token}`,
 }
 
 var builtinOidsBySignature map[string]oid.Oid
diff --git a/pkg/sql/sem/builtins/tsearch_builtins.go b/pkg/sql/sem/builtins/tsearch_builtins.go
new file mode 100644
index 000000000000..e8e71ee7be98
--- /dev/null
+++ b/pkg/sql/sem/builtins/tsearch_builtins.go
@@ -0,0 +1,169 @@
+// Copyright 2022 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package builtins
+
+import (
+	"context"
+
+	"github.com/cockroachdb/cockroach/pkg/kv"
+	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
+	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
+	"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins/builtinconstants"
+	"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
+	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
+	"github.com/cockroachdb/cockroach/pkg/sql/sem/volatility"
+	"github.com/cockroachdb/cockroach/pkg/sql/types"
+	"github.com/cockroachdb/cockroach/pkg/util/tsearch"
+)
+
+func init() {
+	for k, v := range tsearchBuiltins {
+		v.props.Category = builtinconstants.CategoryFullTextSearch
+		v.props.AvailableOnPublicSchema = true
+		registerBuiltin(k, v)
+	}
+}
+
+type tsParseGenerator struct {
+	input     string
+	tokens    []string
+	nextToken string
+}
+
+func (t tsParseGenerator) ResolvedType() *types.T {
+	return tsParseType
+}
+
+func (t *tsParseGenerator) Start(_ context.Context, _ *kv.Txn) error {
+	t.tokens = tsearch.TSParse(t.input)
+	return nil
+}
+
+func (t *tsParseGenerator) Next(_ context.Context) (bool, error) {
+	if len(t.tokens) == 0 {
+		return false, nil
+	}
+	t.nextToken, t.tokens = t.tokens[0], t.tokens[1:]
+	return true, nil
+}
+
+func (t tsParseGenerator) Values() (tree.Datums, error) {
+	return tree.Datums{tree.NewDInt(1), tree.NewDString(t.nextToken)}, nil
+}
+
+func (t tsParseGenerator) Close(_ context.Context) {}
+
+var tsParseType = types.MakeLabeledTuple(
+	[]*types.T{types.Int, types.String},
+	[]string{"tokid", "token"},
+)
+
+var tsearchBuiltins = map[string]builtinDefinition{
+	"ts_parse": makeBuiltin(genProps(),
+		makeGeneratorOverload(
+			tree.ParamTypes{{Name: "parser_name", Typ: types.String}, {Name: "document", Typ: types.String}},
+			types.MakeLabeledTuple(
+				[]*types.T{types.Int, types.String},
+				[]string{"tokid", "token"},
+			),
+			func(_ context.Context, _ *eval.Context, args tree.Datums) (eval.ValueGenerator, error) {
+				parserName := string(tree.MustBeDString(args[0]))
+				if parserName != "default" {
+					return nil, pgerror.Newf(pgcode.UndefinedObject, "text search parser %q does not exist", parserName)
+				}
+				return &tsParseGenerator{input: string(tree.MustBeDString(args[1]))}, nil
+			},
+			"ts_parse parses the given document and returns a series of records, "+
+				"one for each token produced by parsing. "+
+				"Each record includes a tokid showing the assigned token type and a token which is the text of the token.",
+			volatility.Stable,
+		),
+	),
+	// Full text search functions.
+	"to_tsvector": makeBuiltin(
+		tree.FunctionProperties{},
+		tree.Overload{
+			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
+			ReturnType: tree.FixedReturnType(types.TSVector),
+			Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
+				// Parse, stem, and stopword the input.
+				config := string(tree.MustBeDString(args[0]))
+				document := string(tree.MustBeDString(args[1]))
+				vector, err := tsearch.DocumentToTSVector(config, document)
+				if err != nil {
+					return nil, err
+				}
+				return &tree.DTSVector{TSVector: vector}, nil
+			},
+			Info: "Converts text to a tsvector, normalizing words according to the specified or default configuration. " +
+				"Position information is included in the result.",
+			Volatility: volatility.Immutable,
+		},
+	),
+	"to_tsquery": makeBuiltin(
+		tree.FunctionProperties{},
+		tree.Overload{
+			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
+			ReturnType: tree.FixedReturnType(types.TSQuery),
+			Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
+				config := string(tree.MustBeDString(args[0]))
+				input := string(tree.MustBeDString(args[1]))
+				query, err := tsearch.ToTSQuery(config, input)
+				if err != nil {
+					return nil, err
+				}
+				return &tree.DTSQuery{TSQuery: query}, nil
+			},
+			Info: "Converts the input text into a tsquery by normalizing each word in the input according to " +
+				"the specified or default configuration. The input must already be formatted like a tsquery, " +
+				"in other words, subsequent tokens must be connected by a tsquery operator (&, |, <->, !).",
+			Volatility: volatility.Immutable,
+		},
+	),
+	"plainto_tsquery": makeBuiltin(
+		tree.FunctionProperties{},
+		tree.Overload{
+			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
+			ReturnType: tree.FixedReturnType(types.TSQuery),
+			Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
+				config := string(tree.MustBeDString(args[0]))
+				input := string(tree.MustBeDString(args[1]))
+				query, err := tsearch.PlainToTSQuery(config, input)
+				if err != nil {
+					return nil, err
+				}
+				return &tree.DTSQuery{TSQuery: query}, nil
+			},
+			Info: "Converts text to a tsquery, normalizing words according to the specified or default configuration." +
+				" The & operator is inserted between each token in the input.",
+			Volatility: volatility.Immutable,
+		},
+	),
+	"phraseto_tsquery": makeBuiltin(
+		tree.FunctionProperties{},
+		tree.Overload{
+			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
+			ReturnType: tree.FixedReturnType(types.TSQuery),
+			Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
+				config := string(tree.MustBeDString(args[0]))
+				input := string(tree.MustBeDString(args[1]))
+				query, err := tsearch.PhraseToTSQuery(config, input)
+				if err != nil {
+					return nil, err
+				}
+				return &tree.DTSQuery{TSQuery: query}, nil
+			},
+			Info: "Converts text to a tsquery, normalizing words according to the specified or default configuration." +
+				" The <-> operator is inserted between each token in the input.",
+			Volatility: volatility.Immutable,
+		},
+	),
+}
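tsParseGenerator follows the standard row-generator shape: Start builds the token list, each Next pops one token, and Values renders the current row, with tokid hard-coded to 1 because the simple parser only emits one token type. A standalone sketch of that queue-driven contract, using plain types in place of the tree datums:

```go
package main

import "fmt"

// miniGen mirrors the tsParseGenerator pattern: a token queue
// drained one Next call at a time.
type miniGen struct {
	tokens []string
	cur    string
}

func (g *miniGen) Start(tokens []string) { g.tokens = tokens }

func (g *miniGen) Next() bool {
	if len(g.tokens) == 0 {
		return false
	}
	g.cur, g.tokens = g.tokens[0], g.tokens[1:]
	return true
}

// Values returns the current row; tokid is always 1 for the simple parser.
func (g *miniGen) Values() (tokid int, token string) { return 1, g.cur }

func main() {
	g := &miniGen{}
	g.Start([]string{"Hello", "this", "is"})
	for g.Next() {
		id, tok := g.Values()
		fmt.Println(id, tok) // 1 Hello / 1 this / 1 is
	}
}
```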
" + + "Position information is included in the result.", + Volatility: volatility.Immutable, + }, + ), + "to_tsquery": makeBuiltin( + tree.FunctionProperties{}, + tree.Overload{ + Types: tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}}, + ReturnType: tree.FixedReturnType(types.TSQuery), + Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { + config := string(tree.MustBeDString(args[0])) + input := string(tree.MustBeDString(args[1])) + query, err := tsearch.ToTSQuery(config, input) + if err != nil { + return nil, err + } + return &tree.DTSQuery{TSQuery: query}, nil + }, + Info: "Converts the input text into a tsquery by normalizing each word in the input according to " + + "the specified or default configuration. The input must already be formatted like a tsquery, in other words, " + + "subsequent tokens must be connected by a tsquery operator (&, |, <->, !).", + Volatility: volatility.Immutable, + }, + ), + "plainto_tsquery": makeBuiltin( + tree.FunctionProperties{}, + tree.Overload{ + Types: tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}}, + ReturnType: tree.FixedReturnType(types.TSQuery), + Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { + config := string(tree.MustBeDString(args[0])) + input := string(tree.MustBeDString(args[1])) + query, err := tsearch.PlainToTSQuery(config, input) + if err != nil { + return nil, err + } + return &tree.DTSQuery{TSQuery: query}, nil + }, + Info: "Converts text to a tsquery, normalizing words according to the specified or default configuration." + + " The & operator is inserted between each token in the input.", + Volatility: volatility.Immutable, + }, + ), + "phraseto_tsquery": makeBuiltin( + tree.FunctionProperties{}, + tree.Overload{ + Types: tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}}, + ReturnType: tree.FixedReturnType(types.TSQuery), + Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { + config := string(tree.MustBeDString(args[0])) + input := string(tree.MustBeDString(args[1])) + query, err := tsearch.PhraseToTSQuery(config, input) + if err != nil { + return nil, err + } + return &tree.DTSQuery{TSQuery: query}, nil + }, + Info: "Converts text to a tsquery, normalizing words according to the specified or default configuration." + + " The <-> operator is inserted between each token in the input.", + Volatility: volatility.Immutable, + }, + ), +} diff --git a/pkg/sql/sem/eval/testdata/eval/tsearch b/pkg/sql/sem/eval/testdata/eval/tsearch index e259a326fffd..a2b8087d672e 100644 --- a/pkg/sql/sem/eval/testdata/eval/tsearch +++ b/pkg/sql/sem/eval/testdata/eval/tsearch @@ -60,3 +60,48 @@ eval 'bar:3 baz:5'::tsvector @@ 'baz <2> bar'::tsquery ---- false + +eval +to_tsvector('simple', 'You have power over your mind – not outside events. 
diff --git a/pkg/util/tsearch/tsquery.go b/pkg/util/tsearch/tsquery.go
index 8b8610b59384..5200e8947cb2 100644
--- a/pkg/util/tsearch/tsquery.go
+++ b/pkg/util/tsearch/tsquery.go
@@ -81,6 +81,24 @@ func (o tsOperator) pgwireEncoding() byte {
 	panic(errors.AssertionFailedf("no pgwire encoding for operator %d", o))
 }
 
+func (o tsOperator) String() string {
+	switch o {
+	case not:
+		return "!"
+	case and:
+		return "&"
+	case or:
+		return "|"
+	case followedby:
+		return "<->"
+	case lparen:
+		return "("
+	case rparen:
+		return ")"
+	}
+	panic(errors.AssertionFailedf("no string for operator %d", o))
+}
+
 func tsOperatorFromPgwireEncoding(b byte) (tsOperator, error) {
 	switch b {
 	case 1:
@@ -405,3 +423,107 @@ func (p *tsQueryParser) parseTSExpr(minBindingPower int) (*tsNode, error) {
 func (p *tsQueryParser) syntaxError() (*tsNode, error) {
 	return nil, pgerror.Newf(pgcode.Syntax, "syntax error in TSQuery: %s", p.input)
 }
+
+// ToTSQuery implements the to_tsquery builtin, which lexes an input, performs
+// stopwording and normalization on the tokens, and returns a parsed query.
+func ToTSQuery(config string, input string) (TSQuery, error) {
+	return toTSQuery(config, invalid, input)
+}
+
+// PlainToTSQuery implements the plainto_tsquery builtin, which lexes an input,
+// performs stopwording and normalization on the tokens, and returns a parsed
+// query, interposing the & operator between each token.
+func PlainToTSQuery(config string, input string) (TSQuery, error) {
+	return toTSQuery(config, and, input)
+}
+
+// PhraseToTSQuery implements the phraseto_tsquery builtin, which lexes an
+// input, performs stopwording and normalization on the tokens, and returns a
+// parsed query, interposing the <-> operator between each token.
+func PhraseToTSQuery(config string, input string) (TSQuery, error) {
+	return toTSQuery(config, followedby, input)
+}
+
+// toTSQuery implements the to_tsquery builtin, which lexes an input, performs
+// stopwording and normalization on the tokens, and returns a parsed query.
+// If the interpose operator is not invalid, it's interposed between each
+// token in the input.
+func toTSQuery(config string, interpose tsOperator, input string) (TSQuery, error) {
+	switch config {
+	case "simple":
+	default:
+		return TSQuery{}, pgerror.Newf(pgcode.UndefinedObject, "text search configuration %q does not exist", config)
+	}
+
+	vector, err := lexTSQuery(input)
+	if err != nil {
+		return TSQuery{}, err
+	}
+	tokens := make(TSVector, 0, len(vector))
+	for i := range vector {
+		tok := vector[i]
+
+		foundOperator := tok.operator != invalid
+		var lexemeTokens []string
+
+		if !foundOperator {
+			// Try parsing the token.
+			lexemeTokens = TSParse(tok.lexeme)
+		}
+
+		// If we found an operator or were able to parse lexemes from the token,
+		// add the interpose operator if there is one.
+		if interpose != invalid && i > 0 && (foundOperator || len(lexemeTokens) > 0) {
+			term := tsTerm{operator: interpose}
+			if interpose == followedby {
+				term.followedN = 1
+			}
+			tokens = append(tokens, term)
+		}
+
+		if foundOperator {
+			tokens = append(tokens, tok)
+			continue
+		}
+
+		if len(lexemeTokens) == 0 {
+			// We ate some whitespace or whitespace-like text with no tokens.
+			continue
+		}
+
+		// When we support more than just the simple configuration, we'll also
+		// want to remove stopwords, which will affect the interposing, but we
+		// can worry about that later.
+		// Additionally, if we're doing phraseto_tsquery and we remove a
+		// stopword, we need to make sure to increase the "followedN" of the
+		// followedby operator. For example, phraseto_tsquery('hello a deer')
+		// will return 'hello <2> deer', since the stopword 'a' would be
+		// removed.
+
+		tokInterpose := interpose
+		if tokInterpose == invalid {
+			tokInterpose = followedby
+		}
+		for j := range lexemeTokens {
+			if j > 0 {
+				// We found more than one lexeme in our token, so we need to
+				// add all of them to the query, connected by our interpose
+				// operator. If we aren't running with an interpose, like in
+				// to_tsquery, Postgres uses the <-> operator to connect
+				// multiple lexemes from a single token.
+				term := tsTerm{operator: tokInterpose}
+				if tokInterpose == followedby {
+					term.followedN = 1
+				}
+				tokens = append(tokens, term)
+			}
+			lexeme, err := TSLexize(config, lexemeTokens[j])
+			if err != nil {
+				return TSQuery{}, err
+			}
+			tokens = append(tokens, tsTerm{lexeme: lexeme})
+		}
+	}
+
+	// Now create the operator tree.
+	queryParser := tsQueryParser{terms: tokens, input: input}
+	return queryParser.parse()
+}
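The interposing step is the only difference between the three query builtins: to_tsquery passes `invalid` (no operator inserted between top-level tokens), plainto_tsquery passes `&`, and phraseto_tsquery passes `<->`. A standalone sketch of that joining behavior over already-normalized lexemes (a deliberate simplification; the real code builds tsTerm tokens and then parses them into an operator tree):

```go
package main

import (
	"fmt"
	"strings"
)

// interpose joins normalized lexemes with a tsquery operator, the way
// plainto_tsquery uses & and phraseto_tsquery uses <->.
func interpose(lexemes []string, op string) string {
	quoted := make([]string, len(lexemes))
	for i, l := range lexemes {
		quoted[i] = "'" + l + "'"
	}
	return strings.Join(quoted, " "+op+" ")
}

func main() {
	lexemes := []string{"hello", "there"}
	fmt.Println(interpose(lexemes, "&"))   // 'hello' & 'there'
	fmt.Println(interpose(lexemes, "<->")) // 'hello' <-> 'there'
}
```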
diff --git a/pkg/util/tsearch/tsvector.go b/pkg/util/tsearch/tsvector.go
index 7f3f51333b41..073269b057df 100644
--- a/pkg/util/tsearch/tsvector.go
+++ b/pkg/util/tsearch/tsvector.go
@@ -15,6 +15,7 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+	"unicode"
 
 	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
 	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
@@ -256,6 +257,10 @@ func ParseTSVector(input string) (TSVector, error) {
 		return ret, err
 	}
 
+	return normalizeTSVector(ret)
+}
+
+func normalizeTSVector(ret TSVector) (TSVector, error) {
 	if len(ret) > 1 {
 		// Sort and de-duplicate the resultant TSVector.
 		sort.Slice(ret, func(i, j int) bool {
@@ -289,3 +294,62 @@ func ParseTSVector(input string) (TSVector, error) {
 	}
 	return ret, nil
 }
+
+var validCharTables = []*unicode.RangeTable{unicode.Letter, unicode.Number}
+
+// TSParse is the function that splits an input text into a list of tokens.
+// For now, the parser that we use is very simple: it splits the input into
+// tokens, treating any non-letter, non-number character as whitespace.
+// Lowercasing of the tokens happens later, in TSLexize.
+//
+// The Postgres text search parser is much, much more sophisticated. The
+// documentation (https://www.postgresql.org/docs/current/textsearch-parsers.html)
+// gives more information, but roughly, each token is categorized into one of
+// about 20 different buckets, such as asciiword, url, email, host, float, int,
+// version, tag, etc. It uses very specific rules to produce these outputs.
+// Another interesting transformation is returning multiple tokens for a
+// hyphenated word, including a token that represents the entire hyphenated
+// word, as well as one for each of the hyphenated components.
+//
+// It's not clear whether we need to exactly mimic this functionality. Likely,
+// we will eventually want to do this.
+func TSParse(input string) []string {
+	return strings.FieldsFunc(input, func(r rune) bool {
+		return !unicode.IsOneOf(validCharTables, r)
+	})
+}
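TSParse's tokenization is just strings.FieldsFunc with the predicate inverted over the letter/number range tables. This standalone snippet reproduces the token stream the ts_parse logic test earlier in the patch expects:

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

var validCharTables = []*unicode.RangeTable{unicode.Letter, unicode.Number}

func main() {
	input := "Hello this is a parsi-ng t.est 1.234 4 case324"
	// Split wherever a rune is neither a letter nor a number.
	tokens := strings.FieldsFunc(input, func(r rune) bool {
		return !unicode.IsOneOf(validCharTables, r)
	})
	fmt.Println(tokens)
	// [Hello this is a parsi ng t est 1 234 4 case324]
}
```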
+
+// TSLexize implements the "dictionary" construct that's exposed via ts_lexize.
+// It gets invoked once per input token to produce an output lexeme during
+// routines like to_tsvector and to_tsquery.
+func TSLexize(config string, token string) (lexeme string, err error) {
+	if config != "simple" {
+		return "", pgerror.Newf(pgcode.UndefinedObject, "text search configuration %q does not exist", config)
+	}
+	return strings.ToLower(token), nil
+}
+
+// DocumentToTSVector parses an input document into lexemes, removes stop
+// words, stems and normalizes the lexemes, and returns a TSVector annotated
+// with lexeme positions according to a text search configuration passed by
+// name.
+func DocumentToTSVector(config string, input string) (TSVector, error) {
+	if config != "simple" {
+		return nil, pgerror.Newf(pgcode.UndefinedObject, "text search configuration %q does not exist", config)
+	}
+
+	tokens := TSParse(input)
+	vector := make(TSVector, len(tokens))
+	for i := range tokens {
+		lexeme, err := TSLexize(config, tokens[i])
+		if err != nil {
+			return nil, err
+		}
+		vector[i].lexeme = lexeme
+		pos := i + 1
+		if i > maxTSVectorPosition {
+			pos = maxTSVectorPosition
+		}
+		vector[i].positions = []tsPosition{{position: uint16(pos)}}
+	}
+	return normalizeTSVector(vector)
+}
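Putting the pieces together, DocumentToTSVector is TSParse → TSLexize → position assignment (capped at maxTSVectorPosition) → normalizeTSVector. A usage sketch (not part of the patch), assuming access to `pkg/util/tsearch` and that TSVector prints itself the way the testdata output suggests:

```go
package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/tsearch"
)

func main() {
	v, err := tsearch.DocumentToTSVector("simple", "Hello this is a parsi-ng t.est")
	if err != nil {
		panic(err)
	}
	// Lexemes are lowercased, positions are 1-based token indexes, and the
	// result is sorted and de-duplicated by normalizeTSVector.
	fmt.Println(v) // 'a':4 'est':8 'hello':1 'is':3 'ng':6 'parsi':5 't':7 'this':2
}
```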