Skip to content

Commit

Permalink
tsearch: add optimized StringSize method
Browse files Browse the repository at this point in the history
This commit introduces the `TSVector.StringSize` method, an optimized
equivalent of `len(TSVector.String())` that avoids constructing the
string. This method is now used by `DTSVector.Size`. This results in
some code duplication, but it seems worth it:
```
                       │ /tmp/tmp.zD8FYqMrdo/bench.HEAD^ │   /tmp/tmp.zD8FYqMrdo/bench.HEAD    │
                       │             sec/op              │   sec/op     vs base                │
TSVector/String-24                           8.404m ± 0%   8.135m ± 0%   -3.20% (p=0.000 n=10)
TSVector/StringSize-24                       8.381m ± 0%   3.547m ± 0%  -57.67% (p=0.000 n=10)
geomean                                      8.393m        5.372m       -35.99%

                       │ /tmp/tmp.zD8FYqMrdo/bench.HEAD^ │    /tmp/tmp.zD8FYqMrdo/bench.HEAD    │
                       │              B/op               │     B/op      vs base                │
TSVector/String-24                          2.349Mi ± 0%   2.323Mi ± 0%   -1.12% (p=0.000 n=10)
TSVector/StringSize-24                     2405.6Ki ± 0%   192.1Ki ± 0%  -92.02% (p=0.000 n=10)
geomean                                     2.349Mi        675.9Ki       -71.90%

                       │ /tmp/tmp.zD8FYqMrdo/bench.HEAD^ │   /tmp/tmp.zD8FYqMrdo/bench.HEAD    │
                       │            allocs/op            │  allocs/op   vs base                │
TSVector/String-24                           106.7k ± 0%   105.8k ± 0%   -0.88% (p=0.000 n=10)
TSVector/StringSize-24                      106.72k ± 0%   61.46k ± 0%  -42.41% (p=0.000 n=10)
geomean                                      106.7k        80.63k       -24.44%
```

Release note: None
  • Loading branch information
yuzefovich committed Mar 15, 2023
1 parent 13d99ec commit 1dcb587
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pkg/sql/sem/tree/datum.go
Original file line number Diff line number Diff line change
Expand Up @@ -4059,7 +4059,7 @@ func (d *DTSVector) Min(_ CompareContext) (Datum, bool) {

// Size implements the Datum interface. It reports the length of the string
// form of the wrapped TSVector, using TSVector.StringSize so that the string
// itself is never constructed.
func (d *DTSVector) Size() uintptr {
	return uintptr(d.TSVector.StringSize())
}

// AsDTSVector attempts to retrieve a DTSVector from an Expr, returning a
Expand Down
65 changes: 65 additions & 0 deletions pkg/util/tsearch/tsvector.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
package tsearch

import (
"math/bits"
"sort"
"strconv"
"strings"
"unicode"
"unicode/utf8"

"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
Expand Down Expand Up @@ -72,6 +74,7 @@ const (
weightAny = weightA | weightB | weightC | weightD
)

// NB: must be kept in sync with stringSize().
func (w tsWeight) writeString(buf *strings.Builder) {
if w&weightStar != 0 {
buf.WriteByte('*')
Expand All @@ -90,6 +93,14 @@ func (w tsWeight) writeString(buf *strings.Builder) {
}
}

// stringSize returns the length of the string that corresponds to this
// tsWeight, without building that string.
// NB: must be kept in sync with writeString().
func (w tsWeight) stringSize() int {
	// The size is the number of bits set among the five low-order bits.
	const lowFiveBits = 0x1f
	return bits.OnesCount8(uint8(w) & lowFiveBits)
}

// TSVectorPGEncoding returns the PG-compatible wire protocol encoding for a
// given weight. Note that this is only allowable for TSVector tsweights, which
// can't have more than one weight set at the same time. In a TSQuery, you might
Expand Down Expand Up @@ -164,6 +175,7 @@ func newLexemeTerm(lexeme string) (tsTerm, error) {
return tsTerm{lexeme: lexeme}, nil
}

// NB: must be kept in sync with stringSize().
func (t tsTerm) writeString(buf *strings.Builder) {
if t.operator != 0 {
switch t.operator {
Expand Down Expand Up @@ -217,6 +229,46 @@ func (t tsTerm) writeString(buf *strings.Builder) {
}
}

// stringSize returns the length of the string representation of this tsTerm
// without constructing that string or any temporary strings.
// NB: must be kept in sync with writeString().
func (t tsTerm) stringSize() int {
	if t.operator != 0 {
		switch t.operator {
		case and, or, not, lparen, rparen:
			// Each of these operators is accounted as a single byte.
			return 1
		case followedby:
			if t.followedN == 1 {
				return 3 // '<->'
			}
			// '<' and '>' around the decimal form of followedN.
			return 2 + decimalStringSize(int(t.followedN))
		}
		// Any other operator value falls through and is sized as a lexeme.
	}
	size := 2 // opening and closing '\''
	for _, r := range t.lexeme {
		switch {
		case r == '\'':
			// Single quotes are escaped as double single quotes inside of a
			// TSVector.
			size += 2
		case uint32(r) < utf8.RuneSelf:
			// Compared as uint32 to correctly handle negative runes.
			size++
		default:
			size += utf8.RuneLen(r)
		}
	}
	size += len(t.positions) // ':' or ',' for each position
	for _, pos := range t.positions {
		if pos.position > 0 {
			size += decimalStringSize(int(pos.position))
		}
		size += pos.weight.stringSize()
	}
	return size
}

// decimalStringSize returns the number of bytes in the base-10 rendering of n,
// i.e. the same value as len(strconv.Itoa(n)), without allocating a string.
func decimalStringSize(n int) int {
	size := 1
	// Count digits on the unsigned magnitude so that the most negative int is
	// handled without overflow.
	mag := uint64(n)
	if n < 0 {
		size++ // '-'
		mag = -mag
	}
	for mag >= 10 {
		mag /= 10
		size++
	}
	return size
}

func (t tsTerm) matchesWeight(targetWeight tsWeight) bool {
if targetWeight == weightAny {
return true
Expand Down Expand Up @@ -249,6 +301,19 @@ func (t TSVector) String() string {
return buf.String()
}

// StringSize reports the length of the string that a String() call would
// produce, without actually constructing that string.
func (t TSVector) StringSize() int {
	if len(t) == 0 {
		return 0
	}
	// Start with the spaces: one fewer than the number of terms.
	total := len(t) - 1
	for _, term := range t {
		total += term.stringSize()
	}
	return total
}

// ParseTSVector produces a TSVector from an input string. The input will be
// sorted by lexeme, but will not be automatically stemmed or stop-worded.
func ParseTSVector(input string) (TSVector, error) {
Expand Down
10 changes: 9 additions & 1 deletion pkg/util/tsearch/tsvector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,14 @@ func TestParseTSRandom(t *testing.T) {
}
}

// TestTSVectorStringSize checks, on randomly generated TSVectors, that
// StringSize() agrees with the length of the string returned by String().
func TestTSVectorStringSize(t *testing.T) {
	const numVectors = 1000
	rng, _ := randutil.NewTestRand()
	for n := 0; n < numVectors; n++ {
		vec := RandomTSVector(rng)
		expected := len(vec.String())
		actual := vec.StringSize()
		require.Equal(t, expected, actual)
	}
}

func BenchmarkTSVector(b *testing.B) {
r, _ := randutil.NewTestRand()
tsVectors := make([]TSVector, 10000)
Expand All @@ -205,7 +213,7 @@ func BenchmarkTSVector(b *testing.B) {
b.Run("StringSize", func(b *testing.B) {
for i := 0; i < b.N; i++ {
for _, v := range tsVectors {
_ = len(v.String())
_ = v.StringSize()
}
}
})
Expand Down

0 comments on commit 1dcb587

Please sign in to comment.