Skip to content

Commit

Permalink
Merge #57615
Browse files Browse the repository at this point in the history
57615: builtins: add fuzzystrmatch soundex and difference builtin functions r=rafiss a=b41sh

add fuzzystrmatch `soundex` and `difference` builtin functions

Resolves: [#56820](#56820)

Release note (sql change): The soundex() and difference() builtin functions were added.

Co-authored-by: b41sh <[email protected]>
  • Loading branch information
craig[bot] and b41sh committed Dec 11, 2020
2 parents 8561bd4 + 012a90b commit b325be3
Show file tree
Hide file tree
Showing 6 changed files with 264 additions and 4 deletions.
4 changes: 4 additions & 0 deletions docs/generated/sql/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -2267,6 +2267,8 @@ The swap_ordinate_string parameter is a 2-character string naming the ordinates
</span></td></tr>
<tr><td><a name="decode"></a><code>decode(text: <a href="string.html">string</a>, format: <a href="string.html">string</a>) &rarr; <a href="bytes.html">bytes</a></code></td><td><span class="funcdesc"><p>Decodes <code>data</code> using <code>format</code> (<code>hex</code> / <code>escape</code> / <code>base64</code>).</p>
</span></td></tr>
<tr><td><a name="difference"></a><code>difference(source: <a href="string.html">string</a>, target: <a href="string.html">string</a>) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Convert two strings to their Soundex codes and then reports the number of matching code positions.</p>
</span></td></tr>
<tr><td><a name="encode"></a><code>encode(data: <a href="bytes.html">bytes</a>, format: <a href="string.html">string</a>) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Encodes <code>data</code> using <code>format</code> (<code>hex</code> / <code>escape</code> / <code>base64</code>).</p>
</span></td></tr>
<tr><td><a name="from_ip"></a><code>from_ip(val: <a href="bytes.html">bytes</a>) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Converts the byte string representation of an IP to its character string representation.</p>
Expand Down Expand Up @@ -2447,6 +2449,8 @@ The swap_ordinate_string parameter is a 2-character string naming the ordinates
</span></td></tr>
<tr><td><a name="sha512"></a><code>sha512(<a href="string.html">string</a>...) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Calculates the SHA512 hash value of a set of values.</p>
</span></td></tr>
<tr><td><a name="soundex"></a><code>soundex(source: <a href="string.html">string</a>) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Convert a string to its Soundex code.</p>
</span></td></tr>
<tr><td><a name="split_part"></a><code>split_part(input: <a href="string.html">string</a>, delimiter: <a href="string.html">string</a>, return_index_pos: <a href="int.html">int</a>) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Splits <code>input</code> on <code>delimiter</code> and return the value in the <code>return_index_pos</code> position (starting at 1).</p>
<p>For example, <code>split_part('123.456.789.0','.',3)</code>returns <code>789</code>.</p>
</span></td></tr>
Expand Down
26 changes: 26 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/fuzzystrmatch
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,29 @@ apple banana 5 18
NULL a NULL NULL
a NULL NULL NULL
NULL NULL NULL NULL

query T
SELECT soundex('hello world!')
----
H464

query TTT
SELECT soundex('Anne'), soundex('Ann'), difference('Anne', 'Ann');
----
A500 A500 4

query TTT
SELECT soundex('Anne'), soundex('Andrew'), difference('Anne', 'Andrew');
----
A500 A536 2

query TTT
SELECT soundex('Anne'), soundex('Margaret'), difference('Anne', 'Margaret');
----
A500 M626 0

query TTT
SELECT soundex('Anne'), soundex(NULL), difference('Anne', NULL);
----
A500 · ·

39 changes: 37 additions & 2 deletions pkg/sql/sem/builtins/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -2929,8 +2929,43 @@ may increase either contention or retry errors, or both.`,
"tsvector_update_trigger_column": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: categoryFullTextSearch}),

// Fuzzy String Matching
"soundex": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 56820, Category: categoryFuzzyStringMatching}),
"difference": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 56820, Category: categoryFuzzyStringMatching}),
"soundex": makeBuiltin(
	tree.FunctionProperties{NullableArgs: true, Category: categoryString},
	tree.Overload{
		Types:      tree.ArgTypes{{"source", types.String}},
		ReturnType: tree.FixedReturnType(types.String),
		Fn: func(evalCtx *tree.EvalContext, args tree.Datums) (tree.Datum, error) {
			// A NULL argument is answered with an empty string rather
			// than NULL.
			source := args[0]
			if source == tree.DNull {
				return tree.NewDString(""), nil
			}
			code := fuzzystrmatch.Soundex(string(tree.MustBeDString(source)))
			return tree.NewDString(code), nil
		},
		Info:       "Convert a string to its Soundex code.",
		Volatility: tree.VolatilityImmutable,
	},
),
// difference reports how many positions of the two arguments' Soundex codes
// match, so a larger result means the strings sound more alike. `similarity`
// would have been a better name, but this one matches PostgreSQL.
// See https://www.postgresql.org/docs/current/fuzzystrmatch.html
//
// NOTE(review): PostgreSQL declares difference() as returning an integer;
// here the count is rendered with strconv.Itoa and returned as a string.
"difference": makeBuiltin(
	tree.FunctionProperties{NullableArgs: true, Category: categoryString},
	tree.Overload{
		Types:      tree.ArgTypes{{"source", types.String}, {"target", types.String}},
		ReturnType: tree.FixedReturnType(types.String),
		Fn: func(evalCtx *tree.EvalContext, args tree.Datums) (tree.Datum, error) {
			// If either argument is NULL the result is an empty string
			// rather than NULL.
			source, target := args[0], args[1]
			if source == tree.DNull || target == tree.DNull {
				return tree.NewDString(""), nil
			}
			matches := fuzzystrmatch.Difference(
				string(tree.MustBeDString(source)),
				string(tree.MustBeDString(target)),
			)
			return tree.NewDString(strconv.Itoa(matches)), nil
		},
		Info:       "Convert two strings to their Soundex codes and then reports the number of matching code positions.",
		Volatility: tree.VolatilityImmutable,
	},
),
"levenshtein": makeBuiltin(defProps(),
tree.Overload{
Types: tree.ArgTypes{{"source", types.String}, {"target", types.String}},
Expand Down
10 changes: 8 additions & 2 deletions pkg/util/fuzzystrmatch/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,19 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

go_library(
name = "fuzzystrmatch",
srcs = ["leven.go"],
srcs = [
"leven.go",
"soundex.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/util/fuzzystrmatch",
visibility = ["//visibility:public"],
)

go_test(
name = "fuzzystrmatch_test",
srcs = ["leven_test.go"],
srcs = [
"leven_test.go",
"soundex_test.go",
],
embed = [":fuzzystrmatch"],
)
100 changes: 100 additions & 0 deletions pkg/util/fuzzystrmatch/soundex.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package fuzzystrmatch

import (
"strings"
"unicode"
)

const (
	// soundexLen is the number of bytes in a Soundex code.
	soundexLen = 4

	// soundexTable holds the Soundex digit of every upper-case ASCII letter;
	// the digit for a letter sits at index letter-'A'. Letters mapped to '0'
	// are never written into a code.
	//              ABCDEFGHIJKLMNOPQRSTUVWXYZ
	soundexTable = "01230120022455012623010202"
)

// soundexCode returns the Soundex digit for r.
//
// ASCII letters, in either case, are looked up in soundexTable. Any other
// ASCII rune has no digit and yields 0x0.
//
// Runes outside ASCII yield '0', the digit soundex never writes into a code.
// They must not be narrowed to a byte first: the conversion keeps only the
// low eight bits, so a rune such as 'Ń' (U+0143) would be looked up as
// 'C' (0x43) and receive a digit it has no claim to.
func soundexCode(r rune) byte {
	switch {
	case r > unicode.MaxASCII:
		return '0'
	case r >= 'a' && r <= 'z':
		return soundexTable[r-'a']
	case r >= 'A' && r <= 'Z':
		return soundexTable[r-'A']
	default:
		return 0x0
	}
}

// soundex computes the fixed-width, soundexLen-byte Soundex code of source.
//
// Leading ASCII characters that are not letters are skipped. If the first
// remaining rune is an ASCII letter it is upper-cased into position 0. Every
// later ASCII letter contributes its soundexCode digit, unless that digit is
// '0' or equals the digit of the rune directly before it. Positions still
// unused at the end are padded with '0'.
//
// Only ASCII letters are ever encoded. A non-ASCII rune is never converted to
// a byte (the conversion would keep just its low eight bits and could yield
// an arbitrary, even invalid-UTF-8, byte). Instead it leaves the position it
// occupies as 0x0. Soundex strips those bytes; Difference compares them like
// any other byte.
//
// An empty source, or one consisting solely of skipped characters, yields
// soundexLen 0x0 bytes.
func soundex(source string) string {
	isASCIILetter := func(r rune) bool {
		return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
	}

	// Skip leading ASCII non-letters. The trim stops at the first ASCII
	// letter or at the first non-ASCII rune, whichever comes first.
	source = strings.TrimLeftFunc(source, func(r rune) bool {
		return r <= unicode.MaxASCII && !isASCIILetter(r)
	})
	code := make([]byte, soundexLen)
	// No string left.
	if len(source) == 0 {
		return string(code)
	}
	runes := []rune(source)
	if isASCIILetter(runes[0]) {
		// Convert the first character to upper case. The byte conversion is
		// lossless because the rune is known to be ASCII.
		code[0] = byte(unicode.ToUpper(runes[0]))
	}
	j := 1
	// Checking j in the loop condition guarantees that no write below can
	// index past the end of code, whatever mix of runes follows.
	for i := 1; i < len(runes) && j < soundexLen; i++ {
		r := runes[i]
		if r > unicode.MaxASCII {
			// A non-ASCII rune uses up one position, left as 0x0.
			j++
			continue
		}
		if !isASCIILetter(r) {
			continue
		}
		c := soundexCode(r)
		if c == '0' {
			// Letters coded '0' are never emitted.
			continue
		}
		// Adjacent runes with the same digit are encoded once. A non-ASCII
		// predecessor has no digit, so it never suppresses the current one.
		if prev := runes[i-1]; prev <= unicode.MaxASCII && soundexCode(prev) == c {
			continue
		}
		code[j] = c
		j++
	}
	// Fill with 0's at the end.
	for ; j < soundexLen; j++ {
		code[j] = '0'
	}
	return string(code)
}

// Soundex returns the Soundex code of source.
//
// It is the fixed-width result of soundex with every 0x0 byte removed, so the
// returned string may be shorter than soundexLen and is empty when soundex
// produced nothing but 0x0 bytes.
func Soundex(source string) string {
	return strings.ReplaceAll(soundex(source), "\x00", "")
}

// Difference converts source and target to their fixed-width Soundex codes
// and returns how many of the soundexLen positions hold the same byte in
// both. The result ranges from 0 (no position matches) to soundexLen (the
// codes are identical).
func Difference(source, target string) int {
	lhs, rhs := soundex(source), soundex(target)

	matches := 0
	for pos := soundexLen - 1; pos >= 0; pos-- {
		if lhs[pos] != rhs[pos] {
			continue
		}
		matches++
	}
	return matches
}
89 changes: 89 additions & 0 deletions pkg/util/fuzzystrmatch/soundex_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package fuzzystrmatch

import "testing"

// TestSoundex checks Soundex against a table of inputs and their expected
// codes, including inputs made up of non-ASCII runes.
func TestSoundex(t *testing.T) {
	type testCase struct {
		Source   string
		Expected string
	}
	cases := []testCase{
		{Source: "hello world!", Expected: "H464"},
		{Source: "Anne", Expected: "A500"},
		{Source: "Ann", Expected: "A500"},
		{Source: "Andrew", Expected: "A536"},
		{Source: "Margaret", Expected: "M626"},
		{Source: "🌞", Expected: "000"},
		{Source: "😄 🐃 🐯 🕣 💲 🏜 👞 🔠 🌟 📌", Expected: ""},
	}

	for i := range cases {
		tc := cases[i]
		// The first mismatch aborts the whole test.
		if got := Soundex(tc.Source); got != tc.Expected {
			t.Fatalf("error convert string to its Soundex code with source=%q"+
				" expected %s got %s", tc.Source, tc.Expected, got)
		}
	}
}

// TestDifference checks Difference against pairs of names whose Soundex codes
// agree in all, some, and none of their positions.
func TestDifference(t *testing.T) {
	type testCase struct {
		Source   string
		Target   string
		Expected int
	}
	cases := []testCase{
		{Source: "Anne", Target: "Ann", Expected: 4},
		{Source: "Anne", Target: "Andrew", Expected: 2},
		{Source: "Anne", Target: "Margaret", Expected: 0},
	}

	for i := range cases {
		tc := cases[i]
		// The first mismatch aborts the whole test.
		if got := Difference(tc.Source, tc.Target); got != tc.Expected {
			t.Fatalf("error reports the number of matching code positions with source=%q"+
				" target=%q: expected %d got %d", tc.Source, tc.Target, tc.Expected, got)
		}
	}
}

0 comments on commit b325be3

Please sign in to comment.