trigram: performance improvements

```
name           old time/op    new time/op    delta
Similarity-10  1.05µs ± 0%    0.77µs ± 0%    -26.81%  (p=0.000 n=7+10)

name           old alloc/op   new alloc/op   delta
Similarity-10  824B ± 0%      376B ± 0%      -54.37%  (p=0.000 n=10+10)

name           old allocs/op  new allocs/op  delta
Similarity-10  13.0 ± 0%      7.0 ± 0%       -46.15%  (p=0.000 n=10+10)
```

Release note (sql change): improve the performance of trigram operations
cockroachdb · Dec 22, 2022 · e4abe68 · e4abe68
1 parent 5ea0f1c
commit e4abe68
Showing 1 changed file with 80 additions and 44 deletions.
diff --git a/pkg/util/trigram/trigram.go b/pkg/util/trigram/trigram.go
@@ -11,15 +11,15 @@
 package trigram
 
 import (
-	"fmt"
 	"sort"
 	"strings"
 	"unicode"
 	"unicode/utf8"
 )
 
 // MakeTrigrams returns the downcased, sorted and de-duplicated trigrams for an
-// input string. Non-alphanumeric characters are treated as word boundaries.
+// input string. Non-alphanumeric characters (as determined by Unicode's Letter
+// and Number categories) are treated as word boundaries.
 // Words are separately trigrammed. If pad is true, the string will be padded
 // with 2 spaces at the front and 1 at the back, producing 3 extra trigrams.
 func MakeTrigrams(s string, pad bool) []string {
@@ -30,54 +30,43 @@ func MakeTrigrams(s string, pad bool) []string {
 	// Downcase the initial string.
 	s = strings.ToLower(s)
 
-	// Find words.
-	words := strings.FieldsFunc(s, func(r rune) bool {
-		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
-	})
-
 	// Approximately pre-size as if the string is all 1 big word.
-	output := make([]string, 0, len(s))
+	output := make([]string, 0, len(s)+2)
 
-	for _, word := range words {
-		if pad {
-			word = fmt.Sprintf("  %s ", word)
-		}
-		nRunes := utf8.RuneCountInString(word)
-		if nRunes == len(word) {
-			// Fast path for words that have no wide characters.
-			// If not padding, n will be less than 0, so we'll leave the loop as
-			// desired, since words less than length 3 have no trigrams.
-			n := len(word) - 2
-			for i := 0; i < n; i++ {
-				output = append(output, word[i:i+3])
+	start := -1
+	oneByteCharsOnly := true
+	// Loop through the input string searching for word boundaries. For each found
+	// word, generate trigrams and add to the output list. The start and end
+	// variables are used to track the beginning and end of the current word
+	// throughout the loop.
+	// This loop would be more ergonomic with strings.FieldsFunc, but doing so
+	// costs twice the allocations.
+	for end, r := range s {
+		if !unicode.IsLetter(r) && !unicode.IsNumber(r) {
+			// Non-word char.
+			if start < 0 {
+				// Keep going until we find a word char to start the run.
+				continue
 			}
+			// We found a word span. Reset the span and handle it below.
 		} else {
-			// There are some wide characters, so we need to assemble trigrams
-			// in a more careful way than just taking 3-byte windows: we have to
-			// decode each code point to find its width so we can make
-			// windows of 3 codepoints.
-			//
-			// Note that this behavior differs from Postgres: Postgres computes
-			// a hash of the 3 codepoint windows and takes the first 3 bytes of
-			// the hash as the trigram. This is due to limitations in Postgres
-			// and is a dubious way of computing a trigram.
-			// Our method should provide fewer false positives, but note that
-			// users shouldn't see any differences due to this change.
-			nFound := 0
-			charWidths := []int{0, 0}
-			for i, w := 0, 0; i < len(word); i += w {
-				_, w = utf8.DecodeRuneInString(word[i:])
-				if nFound < 2 {
-					charWidths[nFound] = w
-					nFound += 1
-					continue
-				}
-				// Now that we've found our first 2 widths, we can begin assembling the
-				// trigrams.
-				output = append(output, word[i-charWidths[0]-charWidths[1]:i+w])
-				charWidths[0], charWidths[1] = charWidths[1], w
+			// Word char.
+			if start < 0 {
+				start = end
+			}
+			if oneByteCharsOnly && r >= utf8.RuneSelf {
+				oneByteCharsOnly = false
 			}
+			continue
 		}
+
+		output = generateTrigrams(output, s[start:end], pad, oneByteCharsOnly)
+		oneByteCharsOnly = true
+		start = -1
+	}
+	if start >= 0 {
+		// Collect final word.
+		output = generateTrigrams(output, s[start:], pad, oneByteCharsOnly)
 	}
 
 	if len(output) == 0 {
@@ -103,6 +92,53 @@ func MakeTrigrams(s string, pad bool) []string {
 	return output
 }
 
+func generateTrigrams(appendTo []string, word string, pad bool, onlyOneByteChars bool) []string {
+	if pad { // Surround the word with 2 leading spaces and 1 trailing space.
+		var sb strings.Builder
+		sb.Grow(len(word) + 3) // Exact final size: the word plus the 3 pad bytes.
+		sb.WriteString("  ")
+		sb.WriteString(word)
+		sb.WriteByte(' ')
+		word = sb.String()
+	}
+	if onlyOneByteChars {
+		// Fast path: every character is 1 byte, so 3-byte windows are trigrams.
+		// An unpadded word shorter than 3 bytes makes n <= 0, so the loop body
+		// never runs, as desired: such words have no trigrams.
+		n := len(word) - 2
+		for i := 0; i < n; i++ {
+			appendTo = append(appendTo, word[i:i+3])
+		}
+	} else {
+		// There are some wide characters, so we need to assemble trigrams
+		// in a more careful way than just taking 3-byte windows: we have to
+		// decode each code point to find its width so we can make
+		// windows of 3 codepoints.
+		//
+		// Note that this behavior differs from Postgres: Postgres computes
+		// a hash of the 3 codepoint windows and takes the first 3 bytes of
+		// the hash as the trigram. This is due to limitations in Postgres
+		// and is a dubious way of computing a trigram.
+		// Our method should provide fewer false positives, but note that
+		// users shouldn't see any differences due to this change.
+		nFound := 0 // Number of leading code points decoded so far; stops at 2.
+		charWidths := []int{0, 0} // Byte widths of the 2 code points before offset i.
+		for i, w := 0, 0; i < len(word); i += w { // i is a byte offset; w is the width decoded at i.
+			_, w = utf8.DecodeRuneInString(word[i:])
+			if nFound < 2 {
+				charWidths[nFound] = w
+				nFound += 1
+				continue
+			}
+			// With the previous 2 widths known, emit the 3-code-point window that
+			// ends at the rune just decoded, then slide the window forward by one.
+			appendTo = append(appendTo, word[i-charWidths[0]-charWidths[1]:i+w])
+			charWidths[0], charWidths[1] = charWidths[1], w
+		}
+	}
+	return appendTo
+}
+
 // Similarity returns a trigram similarity measure between two strings. 1.0
 // means the trigrams are identical, 0.0 means no trigrams were shared.
 func Similarity(l string, r string) float64 {