Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
93400: coldata: add native support of enums r=yuzefovich a=yuzefovich

This commit adds native support for enum types to the vectorized
engine. We store them via their physical representation, so we can
easily reuse the `Bytes` vector for almost all operations, and, thus, we
just mark the enum family as having the bytes family as its canonical
representation. There are only a handful of places where we need to go
from the physical representation to either the logical one or to the
`DEnum`:
- when constructing the pgwire message to the client (in both text and
binary format the logical representation is used)
- when converting from columnar to row-by-row format (fully-fledged
`DEnum` is constructed)
- casts.

In all of these places we already have access to the precise typing
information (similar to what we have for UUIDs which are supported via
the bytes canonical type family already).

I can really see only one downside to such an implementation - in some
places the resolution based on the canonical (rather than actual) type
family might be too coarse. For example, we have the `<bytes> || <bytes>`
binary operator (`concat`). As it currently stands, the execution will
proceed to perform the concatenation between two UUIDs or between a
BYTES value and a UUID, and now we'll be adding enums into the mix.
However, the type checking is performed earlier on the query execution
path, so I think it is acceptable since the execution should never
reach such a setup.

An additional benefit of this work is that we'll be able to support the
KV projection pushdown in the presence of enums - on the KV server side
we'll just operate on the physical representations and won't need to
have access to the hydrated type, whereas on the client side we'll have
the hydrated type, so we'll be able to do all operations.

Addresses: cockroachdb#42043.
Informs: cockroachdb#92954.

Epic: CRDB-14837

Release note: None

Co-authored-by: Yahor Yuzefovich <[email protected]>
  • Loading branch information
craig[bot] and yuzefovich committed Dec 14, 2022
2 parents 7506ef3 + 5a0d4b2 commit 54b950d
Show file tree
Hide file tree
Showing 20 changed files with 751 additions and 39 deletions.
33 changes: 22 additions & 11 deletions pkg/col/coldatatestutils/random_testutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,19 +92,30 @@ func RandomVec(args RandomVecArgs) {
}
case types.BytesFamily:
bytes := args.Vec.Bytes()
isUUID := args.Vec.Type().Family() == types.UuidFamily
for i := 0; i < args.N; i++ {
bytesLen := args.BytesFixedLength
if bytesLen <= 0 {
bytesLen = args.Rand.Intn(maxVarLen)
if args.Vec.Type().Family() == types.EnumFamily {
enumMeta := args.Vec.Type().TypeMeta.EnumData
if enumMeta == nil {
colexecerror.InternalError(errors.AssertionFailedf("unexpectedly empty enum metadata in RandomVec"))
}
reps := enumMeta.PhysicalRepresentations
for i := 0; i < args.N; i++ {
bytes.Set(i, reps[args.Rand.Intn(len(reps))])
}
if isUUID {
bytesLen = uuid.Size
} else {
isUUID := args.Vec.Type().Family() == types.UuidFamily
for i := 0; i < args.N; i++ {
bytesLen := args.BytesFixedLength
if bytesLen <= 0 {
bytesLen = args.Rand.Intn(maxVarLen)
}
if isUUID {
bytesLen = uuid.Size
}
randBytes := make([]byte, bytesLen)
// Read always returns len(bytes[i]) and nil.
_, _ = rand.Read(randBytes)
bytes.Set(i, randBytes)
}
randBytes := make([]byte, bytesLen)
// Read always returns len(bytes[i]) and nil.
_, _ = rand.Read(randBytes)
bytes.Set(i, randBytes)
}
case types.DecimalFamily:
decs := args.Vec.Decimal()
Expand Down
1 change: 1 addition & 0 deletions pkg/col/colserde/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ go_test(
"@com_github_apache_arrow_go_arrow//array",
"@com_github_apache_arrow_go_arrow//memory",
"@com_github_cockroachdb_apd_v3//:apd",
"@com_github_cockroachdb_errors//:errors",
"@com_github_stretchr_testify//require",
],
)
Expand Down
15 changes: 15 additions & 0 deletions pkg/col/colserde/record_batch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/randutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -105,6 +106,20 @@ func randomDataFromType(rng *rand.Rand, t *types.T, n int, nullProbability float
}
builder.(*array.Float64Builder).AppendValues(data, valid)
case types.BytesFamily:
if t.Family() == types.EnumFamily {
enumMeta := t.TypeMeta.EnumData
if enumMeta == nil {
panic(errors.AssertionFailedf("unexpectedly empty enum metadata in RandomVec"))
}
builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary)
data := make([][]byte, n)
reps := enumMeta.PhysicalRepresentations
for i := range data {
data[i] = reps[rng.Intn(len(reps))]
}
builder.(*array.BinaryBuilder).AppendValues(data, valid)
break
}
// Bytes can be represented 3 different ways. As variable-length bytes,
// variable-length strings, or fixed-width bytes.
representation := rng.Intn(2)
Expand Down
8 changes: 7 additions & 1 deletion pkg/col/typeconv/typeconv.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,13 @@ func TypeFamilyToCanonicalTypeFamily(family types.Family) types.Family {
switch family {
case types.BoolFamily:
return types.BoolFamily
case types.BytesFamily, types.StringFamily, types.UuidFamily, types.EncodedKeyFamily:
case types.BytesFamily, types.StringFamily, types.UuidFamily, types.EncodedKeyFamily, types.EnumFamily:
// Note that by using Bytes family as the canonical one for other type
// families we allow the execution engine to evaluate invalid operations
// (e.g. the concat binary operation between a UUID and an enum "has"
// the execution engine support). However, it's not a big deal since the
// type-checking for validity of operations is done before the query
// reaches the execution engine.
return types.BytesFamily
case types.DecimalFamily:
return types.DecimalFamily
Expand Down
9 changes: 9 additions & 0 deletions pkg/sql/colconv/datum_to_vec.eg.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

150 changes: 150 additions & 0 deletions pkg/sql/colconv/vec_to_datum.eg.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions pkg/sql/colencoding/key_encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ func decodeTableKeyToCol(
rkey, d, err = encoding.DecodeDecimalDescending(key, scratch[:0])
}
vecs.DecimalCols[colIdx][rowIdx] = d
case types.BytesFamily, types.StringFamily, types.UuidFamily:
case types.BytesFamily, types.StringFamily, types.UuidFamily, types.EnumFamily:
if dir == catpb.IndexColumn_ASC {
// We ask for the deep copy to be made so that scratch doesn't
// reference the memory of key - this allows us to return scratch
Expand All @@ -163,6 +163,7 @@ func decodeTableKeyToCol(
// GCed.
rkey, scratch, err = encoding.DecodeBytesAscendingDeepCopy(key, scratch[:0])
} else {
// DecodeBytesDescending always performs a deep copy.
rkey, scratch, err = encoding.DecodeBytesDescending(key, scratch[:0])
}
// Set() performs a deep copy, so it is safe to return the scratch slice
Expand Down Expand Up @@ -259,7 +260,7 @@ func UnmarshalColumnValueToCol(
vecs.Float64Cols[colIdx][rowIdx] = v
case types.DecimalFamily:
err = value.GetDecimalInto(&vecs.DecimalCols[colIdx][rowIdx])
case types.BytesFamily, types.StringFamily, types.UuidFamily:
case types.BytesFamily, types.StringFamily, types.UuidFamily, types.EnumFamily:
var v []byte
v, err = value.GetBytes()
vecs.BytesCols[colIdx].Set(rowIdx, v)
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/colencoding/value_encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ func DecodeTableValueToCol(
// original buffer.
buf, b, err = encoding.DecodeBoolValue(origBuf)
vecs.BoolCols[colIdx][rowIdx] = b
case types.BytesFamily, types.StringFamily:
case types.BytesFamily, types.StringFamily, types.EnumFamily:
var data []byte
buf, data, err = encoding.DecodeUntaggedBytesValue(buf)
vecs.BytesCols[colIdx].Set(rowIdx, data)
Expand Down
Loading

0 comments on commit 54b950d

Please sign in to comment.