-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
57615: builtins: add fuzzystrmatch soundex and difference builtin functions r=rafiss a=b41sh Add the fuzzystrmatch `soundex` and `difference` builtin functions. Resolves: [#56820](#56820) Release note (sql change): The soundex() and difference() builtin functions were added. Co-authored-by: b41sh <[email protected]>
- Loading branch information
Showing
6 changed files
with
264 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
// Copyright 2020 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the Business Source License | ||
// included in the file licenses/BSL.txt. | ||
// | ||
// As of the Change Date specified in that file, in accordance with | ||
// the Business Source License, use of this software will be governed | ||
// by the Apache License, Version 2.0, included in the file | ||
// licenses/APL.txt. | ||
|
||
package fuzzystrmatch | ||
|
||
import ( | ||
"strings" | ||
"unicode" | ||
) | ||
|
||
// soundexLen is the number of characters in a Soundex code.
const soundexLen = 4

// soundexTable maps each ASCII letter to its Soundex digit. It is indexed by
// the upper-cased letter's offset from 'A':
//
//	ABCDEFGHIJKLMNOPQRSTUVWXYZ
const soundexTable = "01230120022455012623010202"

// isASCIILetter reports whether r is one of a-z or A-Z.
func isASCIILetter(r rune) bool {
	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
}

// soundexCode returns the Soundex digit ('0'-'6') for the ASCII letter r,
// ignoring case. It returns 0x0 for every other rune.
//
// The range check is done on the rune itself, before any narrowing to a byte,
// so a non-ASCII rune can never alias an ASCII letter (for example U+0141,
// whose low byte is 'A', or U+017F, whose upper-case form is 'S').
func soundexCode(r rune) byte {
	if !isASCIILetter(r) {
		return 0x0
	}
	return soundexTable[unicode.ToUpper(r)-'A']
}

// soundex computes the raw, fixed-width Soundex code of source. The result is
// always soundexLen bytes long.
//
// Leading ASCII characters that are not letters are skipped. If nothing is
// left, the result is soundexLen 0x0 bytes. Otherwise the first byte is the
// upper-cased first letter, followed by the digits of the subsequent letters:
// letters coded '0' are dropped, as is a letter whose digit equals the code of
// the rune immediately before it. The code is padded with '0' up to
// soundexLen.
//
// Only ASCII letters take part in the encoding. Each non-ASCII rune (letter or
// not) instead occupies one position of the code as a 0x0 placeholder byte,
// including the first position when the input starts with such a rune.
// Callers that need a printable value must strip these bytes.
func soundex(source string) string {
	// Skip leading ASCII non-letters. Non-ASCII runes are deliberately kept so
	// that they are turned into placeholders below.
	source = strings.TrimLeftFunc(source, func(r rune) bool {
		return r <= unicode.MaxASCII && !isASCIILetter(r)
	})
	code := make([]byte, soundexLen)
	// No string left.
	if len(source) == 0 {
		return string(code)
	}
	runes := []rune(source)
	// Only an ASCII letter fits in a single byte; narrowing any other rune
	// would yield a wrong letter or an invalid UTF-8 byte, so the slot keeps
	// its 0x0 placeholder instead.
	if isASCIILetter(runes[0]) {
		code[0] = byte(unicode.ToUpper(runes[0]))
	}
	j := 1
	// Bounding the loop on j guarantees that code[j] is always in range, no
	// matter which branch advanced j last.
	for i := 1; i < len(runes) && j < soundexLen; i++ {
		r := runes[i]
		if r > unicode.MaxASCII {
			// A non-ASCII rune consumes one position as a 0x0 placeholder.
			j++
			continue
		}
		if !isASCIILetter(r) {
			continue
		}
		// Skip letters without a digit ('0') and letters that repeat the code
		// of the preceding rune.
		if c := soundexCode(r); c != '0' && c != soundexCode(runes[i-1]) {
			code[j] = c
			j++
		}
	}
	// Fill with 0's at the end.
	for ; j < soundexLen; j++ {
		code[j] = '0'
	}
	return string(code)
}
|
||
// Soundex convert source to its Soundex code. | ||
func Soundex(source string) string { | ||
code := soundex(source) | ||
resCode := make([]byte, 0) | ||
for _, b := range []byte(code) { | ||
if b != 0x0 { | ||
resCode = append(resCode, b) | ||
} | ||
} | ||
return string(resCode) | ||
} | ||
|
||
// Difference convert source and target to their Soundex codes | ||
// and then reports the number of matching code positions. | ||
func Difference(source, target string) int { | ||
sourceCode := soundex(source) | ||
targetCode := soundex(target) | ||
|
||
diff := 0 | ||
for i := 0; i < soundexLen; i++ { | ||
if sourceCode[i] == targetCode[i] { | ||
diff++ | ||
} | ||
} | ||
return diff | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
// Copyright 2020 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the Business Source License | ||
// included in the file licenses/BSL.txt. | ||
// | ||
// As of the Change Date specified in that file, in accordance with | ||
// the Business Source License, use of this software will be governed | ||
// by the Apache License, Version 2.0, included in the file | ||
// licenses/APL.txt. | ||
|
||
package fuzzystrmatch | ||
|
||
import "testing" | ||
|
||
func TestSoundex(t *testing.T) { | ||
tt := []struct { | ||
Source string | ||
Expected string | ||
}{ | ||
{ | ||
Source: "hello world!", | ||
Expected: "H464", | ||
}, | ||
{ | ||
Source: "Anne", | ||
Expected: "A500", | ||
}, | ||
{ | ||
Source: "Ann", | ||
Expected: "A500", | ||
}, | ||
{ | ||
Source: "Andrew", | ||
Expected: "A536", | ||
}, | ||
{ | ||
Source: "Margaret", | ||
Expected: "M626", | ||
}, | ||
{ | ||
Source: "🌞", | ||
Expected: "000", | ||
}, | ||
{ | ||
Source: "😄 🐃 🐯 🕣 💲 🏜 👞 🔠 🌟 📌", | ||
Expected: "", | ||
}, | ||
} | ||
|
||
for _, tc := range tt { | ||
got := Soundex(tc.Source) | ||
if tc.Expected != got { | ||
t.Fatalf("error convert string to its Soundex code with source=%q"+ | ||
" expected %s got %s", tc.Source, tc.Expected, got) | ||
} | ||
} | ||
} | ||
|
||
func TestDifference(t *testing.T) { | ||
tt := []struct { | ||
Source string | ||
Target string | ||
Expected int | ||
}{ | ||
{ | ||
Source: "Anne", | ||
Target: "Ann", | ||
Expected: 4, | ||
}, | ||
{ | ||
Source: "Anne", | ||
Target: "Andrew", | ||
Expected: 2, | ||
}, | ||
{ | ||
Source: "Anne", | ||
Target: "Margaret", | ||
Expected: 0, | ||
}, | ||
} | ||
|
||
for _, tc := range tt { | ||
got := Difference(tc.Source, tc.Target) | ||
if tc.Expected != got { | ||
t.Fatalf("error reports the number of matching code positions with source=%q"+ | ||
" target=%q: expected %d got %d", tc.Source, tc.Target, tc.Expected, got) | ||
} | ||
} | ||
} |