Skip to content

Commit

Permalink
util: avoid allocations when escaping multibyte characters
Browse files Browse the repository at this point in the history
EncodeEscapedChar (which is called in EncodeSQLStringWithFlags)
is pretty optimized, but for escaping a multibyte character it
was using fmt.Fprintf, which means every multibyte character
ended up on the heap due to golang/go#8618.
This had a noticeable impact in changefeed benchmarking.

This commit just hand-compiles the two format strings that
were being used into reasonably efficient Go, eliminating the allocs.

A benchmark encoding the first 10000 runes shows a 4x speedup:

Before: BenchmarkEncodeNonASCIISQLString-16    	     944	   1216130 ns/op
After: BenchmarkEncodeNonASCIISQLString-16    	    3468	    300777 ns/op

Release note: None
  • Loading branch information
HonoreDB committed Sep 24, 2022
1 parent 7b98d9c commit cb7a002
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 5 deletions.
11 changes: 11 additions & 0 deletions pkg/sql/lexbase/encode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,14 @@ func BenchmarkEncodeSQLString(b *testing.B) {
lexbase.EncodeSQLStringWithFlags(bytes.NewBuffer(nil), str, lexbase.EncBareStrings)
}
}

// BenchmarkEncodeNonASCIISQLString measures EncodeSQLStringWithFlags on a
// string made of the first 10000 code points (rune values 0 through 9999,
// which includes the ASCII range as well as two- and three-byte runes).
func BenchmarkEncodeNonASCIISQLString(b *testing.B) {
	builder := strings.Builder{}
	for r := rune(0); r < 10000; r++ {
		builder.WriteRune(r)
	}
	str := builder.String()

	// Report allocations alongside timings, and keep the cost of building
	// the input string out of the measured region.
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		lexbase.EncodeSQLStringWithFlags(bytes.NewBuffer(nil), str, lexbase.EncBareStrings)
	}
}
24 changes: 19 additions & 5 deletions pkg/util/stringencoding/string_encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ package stringencoding

import (
"bytes"
"fmt"
"unicode/utf8"
)

Expand Down Expand Up @@ -109,14 +108,29 @@ func EncodeEscapedChar(
// Escape non-printable characters.
buf.Write(HexMap[currentByte])
}
} else if ln == 2 {
// For multi-byte runes, print them based on their width.
fmt.Fprintf(buf, `\u%04X`, currentRune)
} else {
fmt.Fprintf(buf, `\U%08X`, currentRune)
writeMultibyteRuneAsHex(buf, currentRune, ln)
}
}

// uppercaseHex maps a nibble value (0-15) to its uppercase hexadecimal digit.
const uppercaseHex = `0123456789ABCDEF`

// writeMultibyteRuneAsHex appends r to buf as an escaped code point. When
// ln == 2 the output is a backslash, 'u' and at least 4 uppercase hex digits;
// for any other ln it is a backslash, 'U' and at least 8 uppercase hex digits.
//
// For non-negative r this is equivalent to fmt.Fprintf(buf, `\u%04X`, r) or
// fmt.Fprintf(buf, `\U%08X`, r) respectively, including the case where r
// needs more digits than the minimum width (the field grows rather than
// being truncated). A negative r is written as an all-zero field.
//
// We can't quite just use strconv since we need uppercase hex.
func writeMultibyteRuneAsHex(buf *bytes.Buffer, r rune, ln int) {
	width, marker := 8, byte('U')
	if ln == 2 {
		width, marker = 4, 'u'
	}

	// Room for the backslash, the marker and the 8 hex digits that the
	// largest non-negative rune (an int32) can need.
	var scratch [10]byte
	pos := len(scratch)

	// Emit digits from least to most significant, filling scratch backwards.
	for ; r > 0; r >>= 4 {
		pos--
		scratch[pos] = uppercaseHex[r&0x0f]
	}
	// Zero-pad on the left up to the minimum field width.
	for pos > len(scratch)-width {
		pos--
		scratch[pos] = '0'
	}
	pos--
	scratch[pos] = marker
	pos--
	scratch[pos] = '\\'

	// Append the finished escape in one call; bytes already in buf are never
	// touched, so a rune wider than the field cannot clobber earlier output.
	buf.Write(scratch[pos:])
}

func writeHexDigit(buf *bytes.Buffer, v int) {
if v < 10 {
buf.WriteByte('0' + byte(v))
Expand Down

0 comments on commit cb7a002

Please sign in to comment.