Skip to content

Commit

Permalink
trigram: performance improvements
Browse files Browse the repository at this point in the history
```
name           old time/op    new time/op    delta
Similarity-10    1.05µs ± 0%    0.77µs ± 0%  -26.81%  (p=0.000 n=7+10)

name           old alloc/op   new alloc/op   delta
Similarity-10      824B ± 0%      376B ± 0%  -54.37%  (p=0.000 n=10+10)

name           old allocs/op  new allocs/op  delta
Similarity-10      13.0 ± 0%       7.0 ± 0%  -46.15%  (p=0.000 n=10+10)
```

Release note (sql change): improve the performance of trigram operations
  • Loading branch information
jordanlewis committed Dec 22, 2022
1 parent 5ea0f1c commit e4abe68
Showing 1 changed file with 80 additions and 44 deletions.
124 changes: 80 additions & 44 deletions pkg/util/trigram/trigram.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@
package trigram

import (
"fmt"
"sort"
"strings"
"unicode"
"unicode/utf8"
)

// MakeTrigrams returns the downcased, sorted and de-duplicated trigrams for an
// input string. Non-alphanumeric characters are treated as word boundaries.
// input string. Non-alphanumeric characters (calculated via unicode's Letter
// and Number designations) are treated as word boundaries.
// Words are separately trigrammed. If pad is true, the string will be padded
// with 2 spaces at the front and 1 at the back, producing 3 extra trigrams.
func MakeTrigrams(s string, pad bool) []string {
Expand All @@ -30,54 +30,43 @@ func MakeTrigrams(s string, pad bool) []string {
// Downcase the initial string.
s = strings.ToLower(s)

// Find words.
words := strings.FieldsFunc(s, func(r rune) bool {
return !unicode.IsLetter(r) && !unicode.IsNumber(r)
})

// Approximately pre-size as if the string is all 1 big word.
output := make([]string, 0, len(s))
output := make([]string, 0, len(s)+2)

for _, word := range words {
if pad {
word = fmt.Sprintf(" %s ", word)
}
nRunes := utf8.RuneCountInString(word)
if nRunes == len(word) {
// Fast path for words that have no wide characters.
// If not padding, n will be less than 0, so we'll leave the loop as
// desired, since words less than length 3 have no trigrams.
n := len(word) - 2
for i := 0; i < n; i++ {
output = append(output, word[i:i+3])
start := -1
oneByteCharsOnly := true
// Loop through the input string searching for word boundaries. For each found
// word, generate trigrams and add to the output list. The start and end
// variables are used to track the beginning and end of the current word
// throughout the loop.
// This loop would be more ergonomic with strings.FieldsFunc, but doing so
// costs twice the allocations.
for end, r := range s {
if !unicode.IsLetter(r) && !unicode.IsNumber(r) {
// Non-word char.
if start < 0 {
// Keep going until we find a word char to start the run.
continue
}
// We found a word span. Reset the span and handle it below.
} else {
// There are some wide characters, so we need to assemble trigrams
// in a more careful way than just taking 3-byte windows: we have to
// decode each code point to find its width so we can make
// windows of 3 codepoints.
//
// Note that this behavior differs from Postgres: Postgres computes
// a hash of the 3 codepoint windows and takes the first 3 bytes of
// the hash as the trigram. This is due to limitations in Postgres
// and is a dubious way of computing a trigram.
// Our method should provide fewer false positives, but note that
// users shouldn't see any differences due to this change.
nFound := 0
charWidths := []int{0, 0}
for i, w := 0, 0; i < len(word); i += w {
_, w = utf8.DecodeRuneInString(word[i:])
if nFound < 2 {
charWidths[nFound] = w
nFound += 1
continue
}
// Now that we've found our first 2 widths, we can begin assembling the
// trigrams.
output = append(output, word[i-charWidths[0]-charWidths[1]:i+w])
charWidths[0], charWidths[1] = charWidths[1], w
// Word char.
if start < 0 {
start = end
}
if oneByteCharsOnly && r >= utf8.RuneSelf {
oneByteCharsOnly = false
}
continue
}

output = generateTrigrams(output, s[start:end], pad, oneByteCharsOnly)
oneByteCharsOnly = true
start = -1
}
if start >= 0 {
// Collect final word.
output = generateTrigrams(output, s[start:], pad, oneByteCharsOnly)
}

if len(output) == 0 {
Expand All @@ -103,6 +92,53 @@ func MakeTrigrams(s string, pad bool) []string {
return output
}

// generateTrigrams appends every trigram (window of 3 consecutive code points)
// of word to appendTo and returns the extended slice.
//
// If pad is true, the word is first surrounded with two leading spaces and one
// trailing space, so that a padded word yields 3 more trigrams than the same
// word unpadded (as documented on MakeTrigrams).
//
// onlyOneByteChars must be true only if every code point in word is encoded as
// a single byte; it selects a fast path that takes fixed 3-byte windows. When
// it is false, each code point is decoded to find its width.
func generateTrigrams(appendTo []string, word string, pad bool, onlyOneByteChars bool) []string {
	if pad {
		// 2 bytes of leading padding + 1 byte of trailing padding.
		var sb strings.Builder
		sb.Grow(len(word) + 3)
		sb.WriteString("  ")
		sb.WriteString(word)
		sb.WriteByte(' ')
		word = sb.String()
	}
	if onlyOneByteChars {
		// Fast path for words that have no wide characters: every trigram is
		// exactly 3 bytes. Words shorter than 3 bytes make n <= 0, so the loop
		// body never runs and no trigrams are produced.
		n := len(word) - 2
		for i := 0; i < n; i++ {
			appendTo = append(appendTo, word[i:i+3])
		}
		return appendTo
	}

	// There are some wide characters, so we need to assemble trigrams
	// in a more careful way than just taking 3-byte windows: we have to
	// decode each code point to find its width so we can make
	// windows of 3 codepoints.
	//
	// Note that this behavior differs from Postgres: Postgres computes
	// a hash of the 3 codepoint windows and takes the first 3 bytes of
	// the hash as the trigram. This is due to limitations in Postgres
	// and is a dubious way of computing a trigram.
	// Our method should provide fewer false positives, but note that
	// users shouldn't see any differences due to this change.
	//
	// prev2 and prev1 hold the byte widths of the two code points that
	// immediately precede the one at offset i (prev2 is the older one).
	nFound := 0
	prev2, prev1 := 0, 0
	for i, w := 0, 0; i < len(word); i += w {
		_, w = utf8.DecodeRuneInString(word[i:])
		if nFound < 2 {
			// Still priming the window with the first two widths.
			prev2, prev1 = prev1, w
			nFound++
			continue
		}
		// Now that we've found our first 2 widths, we can begin assembling the
		// trigrams: the window starts two code points back and ends after the
		// current one.
		appendTo = append(appendTo, word[i-prev2-prev1:i+w])
		prev2, prev1 = prev1, w
	}
	return appendTo
}

// Similarity returns a trigram similarity measure between two strings. 1.0
// means the trigrams are identical, 0.0 means no trigrams were shared.
func Similarity(l string, r string) float64 {
Expand Down

0 comments on commit e4abe68

Please sign in to comment.