Skip to content

Commit

Permalink
tsearch: add ts_rank functionality
Browse files Browse the repository at this point in the history
This commit adds ts_rank, the family of builtins that allow ranking of
text search results. The function takes a tsquery and a tsvector and
returns a float that indicates how good the match is. The ranking can
be customized by passing in an array of four weights, one for each of
the text search weights A, B, C, and D, and a normalization bitmask that
controls how the resulting rank is scaled.

See the excellent Postgres documentation here for details:
https://www.postgresql.org/docs/current/textsearch-controls.html

Release note (sql change): add the ts_rank function for ranking text
search query results
  • Loading branch information
jordanlewis committed Mar 22, 2023
1 parent 8ca25df commit f1ce8cf
Show file tree
Hide file tree
Showing 10 changed files with 528 additions and 17 deletions.
10 changes: 9 additions & 1 deletion docs/generated/sql/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -905,7 +905,15 @@ available replica will error.</p>
<tr><td><a name="to_tsvector"></a><code>to_tsvector(text: <a href="string.html">string</a>) &rarr; tsvector</code></td><td><span class="funcdesc"><p>Converts text to a tsvector, normalizing words according to the default configuration. Position information is included in the result.</p>
</span></td><td>Stable</td></tr>
<tr><td><a name="ts_parse"></a><code>ts_parse(parser_name: <a href="string.html">string</a>, document: <a href="string.html">string</a>) &rarr; tuple{int AS tokid, string AS token}</code></td><td><span class="funcdesc"><p>ts_parse parses the given document and returns a series of records, one for each token produced by parsing. Each record includes a tokid showing the assigned token type and a token which is the text of the token.</p>
</span></td><td>Stable</td></tr></tbody>
</span></td><td>Stable</td></tr>
<tr><td><a name="ts_rank"></a><code>ts_rank(vector: tsvector, query: tsquery) &rarr; float4</code></td><td><span class="funcdesc"><p>Ranks vectors based on the frequency of their matching lexemes.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="ts_rank"></a><code>ts_rank(vector: tsvector, query: tsquery, normalization: <a href="int.html">int</a>) &rarr; float4</code></td><td><span class="funcdesc"><p>Ranks vectors based on the frequency of their matching lexemes.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="ts_rank"></a><code>ts_rank(weights: <a href="float.html">float</a>[], vector: tsvector, query: tsquery) &rarr; float4</code></td><td><span class="funcdesc"><p>Ranks vectors based on the frequency of their matching lexemes.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="ts_rank"></a><code>ts_rank(weights: <a href="float.html">float</a>[], vector: tsvector, query: tsquery, normalization: <a href="int.html">int</a>) &rarr; float4</code></td><td><span class="funcdesc"><p>Ranks vectors based on the frequency of their matching lexemes.</p>
</span></td><td>Immutable</td></tr></tbody>
</table>

### Fuzzy String Matching functions
Expand Down
36 changes: 36 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/tsvector
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,39 @@ query T
SELECT to_tsvector('Hello I am a potato')
----
'am':3 'hell':1 'i':2 'potat':5

query TT
SELECT to_tsvector('english', ''), to_tsvector('english', 'and the')
----
· ·

statement error doesn't contain lexemes
SELECT to_tsquery('english', 'the')

statement ok
CREATE TABLE sentences (sentence text, v TSVECTOR AS (to_tsvector('english', sentence)) STORED, INVERTED INDEX (v));
INSERT INTO sentences VALUES
('Future users of large data banks must be protected from having to know how the data is organized in the machine (the internal representation).'),
('A prompting service which supplies such information is not a satisfactory solution.'),
('Activities of users at terminals and most application programs should remain unaffected when the internal representation of data is changed and even when some aspects of the external representation
are changed.'),
('Changes in data representation will often be needed as a result of changes in query, update, and report traffic and natural growth in the types of stored information.'),
('Existing noninferential, formatted data systems provide users with tree-structured files or slightly more general network models of the data.'),
('In Section 1, inadequacies of these models are discussed.'),
('A model based on n-ary relations, a normal form for data base relations, and the concept of a universal data sublanguage are introduced.'),
('In Section 2, certain operations on relations (other than logical inference) are discussed and applied to the problems of redundancy and consistency in the user’s model.')

query FFFFT
SELECT
ts_rank(v, query) AS rank,
ts_rank(ARRAY[0.2, 0.3, 0.5, 0.9]:::FLOAT[], v, query) AS wrank,
ts_rank(v, query, 2|8) AS nrank,
ts_rank(ARRAY[0.3, 0.4, 0.6, 0.95]:::FLOAT[], v, query, 1|2|4|8|16|32) AS wnrank,
v
FROM sentences, to_tsquery('english', 'relation') query
WHERE query @@ v
ORDER BY rank DESC
LIMIT 10
----
0.075990885 0.15198177 0.00042217158 8.555783e-05 'ari':6 'base':3,13 'concept':17 'data':12,21 'form':10 'introduc':24 'model':2 'n':5 'normal':9 'relat':7,14 'sublanguag':22 'univers':20
0.06079271 0.12158542 0.0003101669 6.095758e-05 '2':3 'appli':15 'certain':4 'consist':22 'discuss':13 'infer':11 'logic':10 'model':27 'oper':5 'problem':18 'redund':20 'relat':7 'section':2 'user':25
1 change: 0 additions & 1 deletion pkg/sql/sem/builtins/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -3765,7 +3765,6 @@ value if you rely on the HLC for accuracy.`,
"jsonb_to_tsvector": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"ts_delete": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"ts_filter": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"ts_rank": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"ts_rank_cd": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"ts_rewrite": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"tsquery_phrase": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
Expand Down
4 changes: 4 additions & 0 deletions pkg/sql/sem/builtins/fixed_oids.go
Original file line number Diff line number Diff line change
Expand Up @@ -2373,6 +2373,10 @@ var builtinOidsArray = []string{
2399: `to_tsvector(text: string) -> tsvector`,
2400: `phraseto_tsquery(text: string) -> tsquery`,
2401: `plainto_tsquery(text: string) -> tsquery`,
2402: `ts_rank(weights: float[], vector: tsvector, query: tsquery, normalization: int) -> float4`,
2403: `ts_rank(vector: tsvector, query: tsquery, normalization: int) -> float4`,
2404: `ts_rank(vector: tsvector, query: tsquery) -> float4`,
2405: `ts_rank(weights: float[], vector: tsvector, query: tsquery) -> float4`,
}

var builtinOidsBySignature map[string]oid.Oid
Expand Down
113 changes: 113 additions & 0 deletions pkg/sql/sem/builtins/tsearch_builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,4 +231,117 @@ var tsearchBuiltins = map[string]builtinDefinition{
Volatility: volatility.Stable,
},
),
"ts_rank": makeBuiltin(
tree.FunctionProperties{},
// Overload: ts_rank(weights, vector, query, normalization).
// The caller supplies both the weights array and the normalization value.
tree.Overload{
Types: tree.ParamTypes{
{Name: "weights", Typ: types.FloatArray},
{Name: "vector", Typ: types.TSVector},
{Name: "query", Typ: types.TSQuery},
{Name: "normalization", Typ: types.Int},
},
// NOTE(review): the declared return type is Float4 but the datum is built
// with tree.NewDFloat; presumably float4 values are represented by DFloat
// here - confirm.
ReturnType: tree.FixedReturnType(types.Float4),
Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
// getWeights rejects arrays with fewer than 4 elements or containing NULLs.
weights, err := getWeights(tree.MustBeDArray(args[0]))
if err != nil {
return nil, err
}
rank, err := tsearch.Rank(
weights,
tree.MustBeDTSVector(args[1]).TSVector,
tree.MustBeDTSQuery(args[2]).TSQuery,
int(tree.MustBeDInt(args[3])),
)
if err != nil {
return nil, err
}
return tree.NewDFloat(tree.DFloat(rank)), nil
},
Info: "Ranks vectors based on the frequency of their matching lexemes.",
Volatility: volatility.Immutable,
},
// Overload: ts_rank(weights, vector, query).
// Same as the four-argument form, but passes 0 as the final argument to
// tsearch.Rank instead of a caller-supplied normalization value.
tree.Overload{
Types: tree.ParamTypes{
{Name: "weights", Typ: types.FloatArray},
{Name: "vector", Typ: types.TSVector},
{Name: "query", Typ: types.TSQuery},
},
ReturnType: tree.FixedReturnType(types.Float4),
Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
// getWeights rejects arrays with fewer than 4 elements or containing NULLs.
weights, err := getWeights(tree.MustBeDArray(args[0]))
if err != nil {
return nil, err
}
rank, err := tsearch.Rank(
weights,
tree.MustBeDTSVector(args[1]).TSVector,
tree.MustBeDTSQuery(args[2]).TSQuery,
0, /* method */
)
if err != nil {
return nil, err
}
return tree.NewDFloat(tree.DFloat(rank)), nil
},
Info: "Ranks vectors based on the frequency of their matching lexemes.",
Volatility: volatility.Immutable,
},
// Overload: ts_rank(vector, query, normalization).
// No weights array is accepted; nil is passed as the weights to tsearch.Rank.
// NOTE(review): presumably tsearch.Rank substitutes default weights when
// given nil - confirm against its implementation.
tree.Overload{
Types: tree.ParamTypes{
{Name: "vector", Typ: types.TSVector},
{Name: "query", Typ: types.TSQuery},
{Name: "normalization", Typ: types.Int},
},
ReturnType: tree.FixedReturnType(types.Float4),
Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
rank, err := tsearch.Rank(
nil, /* weights */
tree.MustBeDTSVector(args[0]).TSVector,
tree.MustBeDTSQuery(args[1]).TSQuery,
int(tree.MustBeDInt(args[2])),
)
if err != nil {
return nil, err
}
return tree.NewDFloat(tree.DFloat(rank)), nil
},
Info: "Ranks vectors based on the frequency of their matching lexemes.",
Volatility: volatility.Immutable,
},
// Overload: ts_rank(vector, query).
// Simplest form: nil weights and 0 as the final argument to tsearch.Rank.
tree.Overload{
Types: tree.ParamTypes{
{Name: "vector", Typ: types.TSVector},
{Name: "query", Typ: types.TSQuery},
},
ReturnType: tree.FixedReturnType(types.Float4),
Fn: func(_ context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) {
rank, err := tsearch.Rank(
nil, /* weights */
tree.MustBeDTSVector(args[0]).TSVector,
tree.MustBeDTSQuery(args[1]).TSQuery,
0, /* method */
)
if err != nil {
return nil, err
}
return tree.NewDFloat(tree.DFloat(rank)), nil
},
Info: "Ranks vectors based on the frequency of their matching lexemes.",
Volatility: volatility.Immutable,
},
),
}

// getWeights converts a SQL float array into the four-element weight slice
// that the ts_rank overloads pass to tsearch.Rank.
//
// The array must contain at least four elements and must not contain NULLs;
// otherwise an error is returned. Only the first four elements are used as
// weights. Any additional elements are ignored (after the NULL check) instead
// of being written past the end of the result slice, which previously caused
// an index-out-of-range panic for arrays longer than four elements.
//
// The returned slice always has length 4, including on error, where it may be
// zero-valued or only partially filled.
func getWeights(arr *tree.DArray) ([]float32, error) {
	ret := make([]float32, 4)
	if arr.Len() < len(ret) {
		return ret, pgerror.New(pgcode.ArraySubscript, "array of weight is too short (must be at least 4)")
	}
	for i, d := range arr.Array {
		// NULLs are rejected anywhere in the array, not just in the first
		// four positions.
		if d == tree.DNull {
			return ret, pgerror.New(pgcode.NullValueNotAllowed, "array of weight must not contain null")
		}
		// Only the first len(ret) elements are weights; skip the rest so that
		// longer arrays do not index past the end of ret.
		if i < len(ret) {
			ret[i] = float32(tree.MustBeDFloat(d))
		}
	}
	return ret, nil
}
2 changes: 2 additions & 0 deletions pkg/util/tsearch/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ go_library(
"eval.go",
"lex.go",
"random.go",
"rank.go",
"snowball.go",
"stopwords.go",
"tsquery.go",
Expand Down Expand Up @@ -63,6 +64,7 @@ go_test(
srcs = [
"encoding_test.go",
"eval_test.go",
"rank_test.go",
"tsquery_test.go",
"tsvector_test.go",
],
Expand Down
Loading

0 comments on commit f1ce8cf

Please sign in to comment.