coldata: do not allocate wasteful memory for UUIDs
Previously, whenever we allocated a new `Bytes` vector, we used 64 bytes as the
average element size when deciding how large a flat byte slice to allocate up
front. This is wasteful in the case of UUIDs, where we know that each element
is exactly 16 bytes. This commit uses that knowledge to avoid the wasteful
allocations.

Release note: None
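To put a rough number on the savings (an illustration, not part of the commit): for, say, a 1,024-row batch of UUIDs, the old sizing reserved 1,024 × 64 = 65,536 bytes of flat data up front, while the new sizing reserves 1,024 × 16 = 16,384 bytes, since a UUID is always exactly `uuid.Size` (16) bytes.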
yuzefovich committed Apr 7, 2021
1 parent 651184b commit 107faca
Showing 7 changed files with 54 additions and 29 deletions.
1 change: 1 addition & 0 deletions pkg/col/coldata/BUILD.bazel
@@ -19,6 +19,7 @@ go_library(
"//pkg/sql/types",
"//pkg/util",
"//pkg/util/duration",
"//pkg/util/uuid",
"@com_github_cockroachdb_apd_v2//:apd",
"@com_github_cockroachdb_errors//:errors",
"@com_github_stretchr_testify//require",
10 changes: 8 additions & 2 deletions pkg/col/coldata/bytes.go
@@ -45,12 +45,18 @@ const BytesInitialAllocationFactor = 64
// NewBytes returns a Bytes struct with enough capacity for n zero-length
// []byte values. It is legal to call Set on the returned Bytes at this point,
// but Get is undefined until at least one element is Set.
func NewBytes(n int) *Bytes {
// - avgElementLength determines the average length of a single []byte element
// that will be added to this Bytes. It can be 0 when the length is unknown, in
// which case BytesInitialAllocationFactor will be used as a guess.
func NewBytes(n int, avgElementLength int) *Bytes {
if avgElementLength <= 0 {
avgElementLength = BytesInitialAllocationFactor
}
return &Bytes{
// Given that the []byte slices are of variable length, we multiply the
// number of elements by some constant factor.
// TODO(asubiotto): Make this tunable.
data: make([]byte, 0, n*BytesInitialAllocationFactor),
data: make([]byte, 0, n*avgElementLength),
offsets: make([]int32, n+1),
}
}
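For readers, a minimal, self-contained sketch (not part of the commit) of how a caller inside the CockroachDB tree might use the new signature; `n` is a hypothetical element count chosen for illustration:

```go
package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

func main() {
	const n = 1024 // hypothetical number of elements

	// Generic byte strings: passing 0 falls back to
	// coldata.BytesInitialAllocationFactor (64), so the flat buffer is
	// pre-sized to n*64 bytes.
	generic := coldata.NewBytes(n, 0 /* avgElementLength */)

	// UUIDs: every element is exactly uuid.Size (16) bytes, so the flat
	// buffer can be pre-sized to n*16 bytes -- a 4x smaller initial
	// allocation.
	uuids := coldata.NewBytes(n, uuid.Size)

	fmt.Println(generic.Len(), uuids.Len()) // both report n zero-length elements
}
```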
28 changes: 14 additions & 14 deletions pkg/col/coldata/bytes_test.go
@@ -202,7 +202,7 @@ func TestBytesRefImpl(t *testing.T) {
for nRun := 0; nRun < nRuns; nRun++ {
n := 1 + rng.Intn(maxLength)

flat := NewBytes(n)
flat := NewBytes(n, 0)
reference := make([][]byte, n)
for i := 0; i < n; i++ {
v := make([]byte, rng.Intn(16))
@@ -219,7 +219,7 @@ func TestBytesRefImpl(t *testing.T) {
if rng.Float64() < 0.5 {
selfReferencingSources = false
sourceN := 1 + rng.Intn(maxLength)
flatSource = NewBytes(sourceN)
flatSource = NewBytes(sourceN, 0)
referenceSource = make([][]byte, sourceN)
for i := 0; i < sourceN; i++ {
v := make([]byte, rng.Intn(16))
@@ -249,7 +249,7 @@ func TestBytes(t *testing.T) {
defer leaktest.AfterTest(t)()

t.Run("Simple", func(t *testing.T) {
b1 := NewBytes(0)
b1 := NewBytes(0, 0)
b1.AppendVal([]byte("hello"))
require.Equal(t, "hello", string(b1.Get(0)))
b1.AppendVal(nil)
@@ -281,8 +281,8 @@ func TestBytes(t *testing.T) {
})

t.Run("Append", func(t *testing.T) {
b1 := NewBytes(0)
b2 := NewBytes(0)
b1 := NewBytes(0, 0)
b2 := NewBytes(0, 0)
b2.AppendVal([]byte("source bytes value"))
b1.AppendVal([]byte("one"))
b1.AppendVal([]byte("two"))
@@ -303,7 +303,7 @@ func TestBytes(t *testing.T) {
}
}

b2 = NewBytes(0)
b2 = NewBytes(0, 0)
b2.AppendVal([]byte("hello again"))
b2.AppendVal([]byte("hello again"))
b2.AppendVal([]byte("hello again"))
@@ -321,7 +321,7 @@ func TestBytes(t *testing.T) {
// truncate the vector. The expected behavior is that offsets must be
// backfilled, and once a new value is appended, it should be
// retrievable.
b := NewBytes(5)
b := NewBytes(5, 0)
b.Set(0, []byte("zero"))
require.Equal(t, 5, b.Len())
b.AppendSlice(b, 3, 0, 0)
@@ -334,8 +334,8 @@ func TestBytes(t *testing.T) {
})

t.Run("Copy", func(t *testing.T) {
b1 := NewBytes(0)
b2 := NewBytes(0)
b1 := NewBytes(0, 0)
b2 := NewBytes(0, 0)
b1.AppendVal([]byte("one"))
b1.AppendVal([]byte("two"))
b1.AppendVal([]byte("three"))
@@ -374,7 +374,7 @@ func TestBytes(t *testing.T) {
})

t.Run("Window", func(t *testing.T) {
b1 := NewBytes(0)
b1 := NewBytes(0, 0)
b1.AppendVal([]byte("one"))
b1.AppendVal([]byte("two"))
b1.AppendVal([]byte("three"))
@@ -390,7 +390,7 @@ func TestBytes(t *testing.T) {
})

t.Run("String", func(t *testing.T) {
b1 := NewBytes(0)
b1 := NewBytes(0, 0)
vals := [][]byte{
[]byte("one"),
[]byte("two"),
@@ -421,12 +421,12 @@ func TestBytes(t *testing.T) {
})

t.Run("InvariantSimple", func(t *testing.T) {
b1 := NewBytes(8)
b1 := NewBytes(8, 0)
b1.Set(0, []byte("zero"))
other := b1.Window(0, 2)
other.AssertOffsetsAreNonDecreasing(2)

b2 := NewBytes(8)
b2 := NewBytes(8, 0)
b2.Set(0, []byte("zero"))
b2.Set(2, []byte("two"))
other = b2.Window(0, 4)
@@ -443,7 +443,7 @@ func TestProportionalSize(t *testing.T) {
rng, _ := randutil.NewPseudoRand()
// We need a number divisible by 4.
fullCapacity := (1 + rng.Intn(100)) * 4
b := NewBytes(fullCapacity)
b := NewBytes(fullCapacity, BytesInitialAllocationFactor)
for i := 0; i < fullCapacity; i++ {
b.Set(i, value)
}
6 changes: 5 additions & 1 deletion pkg/col/coldata/vec.go
@@ -15,6 +15,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/col/typeconv"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

// Column is an interface that represents a raw array of a Go native type.
@@ -162,7 +163,10 @@ func (cf *defaultColumnFactory) MakeColumn(t *types.T, length int) Column {
case types.BoolFamily:
return make(Bools, length)
case types.BytesFamily:
return NewBytes(length)
if t.Family() == types.UuidFamily {
return NewBytes(length, uuid.Size)
}
return NewBytes(length, BytesInitialAllocationFactor)
case types.IntFamily:
switch t.Width() {
case 16:
2 changes: 1 addition & 1 deletion pkg/col/colserde/arrowbatchconverter_test.go
@@ -170,7 +170,7 @@ func BenchmarkArrowBatchConverter(b *testing.B) {
// TODO(asubiotto): We should probably create some random spec struct that
// we pass in to RandomBatch.
bytes := batch.ColVec(0).Bytes()
newBytes := coldata.NewBytes(bytes.Len())
newBytes := coldata.NewBytes(bytes.Len(), 0 /* avgElementLength */)
for i := 0; i < bytes.Len(); i++ {
diff := len(bytes.Get(i)) - fixedLen
if diff < 0 {
1 change: 1 addition & 0 deletions pkg/sql/colmem/BUILD.bazel
@@ -13,6 +13,7 @@ go_library(
"//pkg/sql/types",
"//pkg/util/duration",
"//pkg/util/mon",
"//pkg/util/uuid",
"@com_github_cockroachdb_apd_v2//:apd",
"@com_github_cockroachdb_errors//:errors",
],
35 changes: 24 additions & 11 deletions pkg/sql/colmem/allocator.go
@@ -23,6 +23,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/duration"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
)

@@ -370,12 +371,19 @@ func EstimateBatchSizeBytes(vecTypes []*types.T, batchLength int) int {
// (excluding any Bytes vectors, those are tracked separately).
acc := 0
numBytesVectors := 0
// We track Uuid vectors separately because they use a smaller initial
// allocation factor.
numUUIDVectors := 0
for _, t := range vecTypes {
switch typeconv.TypeFamilyToCanonicalTypeFamily(t.Family()) {
case types.BoolFamily:
acc += sizeOfBool
case types.BytesFamily:
numBytesVectors++
if t.Family() == types.UuidFamily {
numUUIDVectors++
} else {
numBytesVectors++
}
case types.IntFamily:
switch t.Width() {
case 16:
@@ -416,15 +424,20 @@ func EstimateBatchSizeBytes(vecTypes []*types.T, batchLength int) int {
colexecerror.InternalError(errors.AssertionFailedf("unhandled type %s", t))
}
}
// For byte arrays, we initially allocate BytesInitialAllocationFactor
// number of bytes (plus an int32 for the offset) for each row, so we use
// the sum of two values as the estimate. However, later, the exact
// memory footprint will be used: whenever a modification of Bytes takes
// place, the Allocator will measure the old footprint and the updated
// one and will update the memory account accordingly. We also account for
// the overhead and for the additional offset value that are needed for
// Bytes vectors (to be in line with coldata.Bytes.Size() method).
bytesVectorsSize := numBytesVectors * (int(coldata.FlatBytesOverhead) +
coldata.BytesInitialAllocationFactor*batchLength + sizeOfInt32*(batchLength+1))
// For byte arrays, we initially allocate a constant number of bytes (plus
// an int32 for the offset) for each row, so we use the sum of two values as
// the estimate. However, later, the exact memory footprint will be used:
// whenever a modification of Bytes takes place, the Allocator will measure
// the old footprint and the updated one and will update the memory account
// accordingly. We also account for the overhead and for the additional
// offset value that are needed for Bytes vectors (to be in line with
// coldata.Bytes.Size() method).
var bytesVectorsSize int
// Add the overhead.
bytesVectorsSize += (numBytesVectors + numUUIDVectors) * (int(coldata.FlatBytesOverhead))
// Add the data for both Bytes and Uuids.
bytesVectorsSize += (numBytesVectors*coldata.BytesInitialAllocationFactor + numUUIDVectors*uuid.Size) * batchLength
// Add the offsets.
bytesVectorsSize += (numBytesVectors + numUUIDVectors) * sizeOfInt32 * (batchLength + 1)
return acc*batchLength + bytesVectorsSize
}
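To see what the new arithmetic adds up to, here is a hedged worked example (not part of the commit) for a batch with one generic Bytes column and one UUID column; `flatOverhead` is a hypothetical stand-in for coldata.FlatBytesOverhead, whose exact value is not shown in this diff:

```go
package main

import "fmt"

func main() {
	const (
		batchLength     = 1024
		flatOverhead    = 24 // hypothetical stand-in for coldata.FlatBytesOverhead
		bytesFactor     = 64 // coldata.BytesInitialAllocationFactor
		uuidSize        = 16 // uuid.Size
		sizeOfInt32     = 4
		numBytesVectors = 1 // one generic Bytes column
		numUUIDVectors  = 1 // one UUID column
	)

	bytesVectorsSize := 0
	// Struct overhead for every flat Bytes vector, UUID-backed or not.
	bytesVectorsSize += (numBytesVectors + numUUIDVectors) * flatOverhead
	// Flat data: 64 bytes per row for generic strings, 16 bytes per row for UUIDs.
	bytesVectorsSize += (numBytesVectors*bytesFactor + numUUIDVectors*uuidSize) * batchLength
	// One int32 offset per row plus a trailing offset, per vector.
	bytesVectorsSize += (numBytesVectors + numUUIDVectors) * sizeOfInt32 * (batchLength + 1)

	fmt.Println(bytesVectorsSize) // 48 + 81,920 + 8,200 = 90,168 with these assumed numbers
}
```

Under the old estimate, the UUID column would also have been charged 64 bytes per row of flat data, so the per-batch estimate drops by (64 - 16) * batchLength bytes for each UUID vector.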
