Skip to content

Commit

Permalink
util/parquet: refactor random testing types
Browse files Browse the repository at this point in the history
This change refactors randomized testing to use `randgen.RandType`.
`randgen.RandType` is better as it takes into account all allowable
types which can appear in CRDB (ex. array of tuple). The previous
code only generated random types which are supported by the writer
which leaves a gap when new types are added. Now, the code defaults
to all types and filters out unsupported ones.

The previous code also unnecessarily duplicates code from `randgen`.
For example, generating a random tuple can be done by calling one
method in `randgen`. Generating a random tuple using the
previous code would require more complex work.

Informs: cockroachdb#99028
Epic: https://cockroachlabs.atlassian.net/browse/CRDB-15071
Release note: None
  • Loading branch information
jayshrivastava committed Jun 21, 2023
1 parent 2ce1d54 commit 8c36045
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 56 deletions.
32 changes: 16 additions & 16 deletions pkg/sql/randgen/type.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ var (
// SeedTypes includes the following types that form the basis of randomly
// generated types:
// - All scalar types, except UNKNOWN and ANY
// - ARRAY of ANY, where the ANY will be replaced with one of the legal
// array element types in RandType
// - ARRAY of ANY and TUPLE of ANY, where the ANY will be replaced with
// one of the legal array element types in RandType
// - OIDVECTOR and INT2VECTOR types
SeedTypes []*types.T

Expand Down Expand Up @@ -126,26 +126,26 @@ func RandTypeFromSlice(rng *rand.Rand, typs []*types.T) *types.T {
return types.MakeArray(inner)
}
if typ.ArrayContents().Family() == types.TupleFamily {
// Generate tuples between 0 and 4 datums in length
len := rng.Intn(5)
contents := make([]*types.T, len)
for i := range contents {
contents[i] = RandTypeFromSlice(rng, typs)
}
return types.MakeArray(types.MakeTuple(contents))
return types.MakeArray(RandTupleFromSlice(rng, typs))
}
case types.TupleFamily:
// Generate tuples between 0 and 4 datums in length
len := rng.Intn(5)
contents := make([]*types.T, len)
for i := range contents {
contents[i] = RandTypeFromSlice(rng, typs)
}
return types.MakeTuple(contents)
return RandTupleFromSlice(rng, typs)
}
return typ
}

// RandTupleFromSlice returns a random tuple type. Each field type is produced
// by calling RandTypeFromSlice with the same rng and input slice of types.
func RandTupleFromSlice(rng *rand.Rand, typs []*types.T) *types.T {
	// Tuples have between 0 and 4 fields, inclusive. The field count is drawn
	// before any field type so the sequence of rng calls stays fixed.
	numFields := rng.Intn(5)
	fields := make([]*types.T, 0, numFields)
	for len(fields) < numFields {
		fields = append(fields, RandTypeFromSlice(rng, typs))
	}
	return types.MakeTuple(fields)
}

// RandColumnType returns a random type that is a legal column type (e.g. no
// nested arrays or tuples).
func RandColumnType(rng *rand.Rand) *types.T {
Expand Down
19 changes: 13 additions & 6 deletions pkg/util/parquet/writer_bench_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"os"
"testing"

"github.com/cockroachdb/cockroach/pkg/sql/randgen"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/stretchr/testify/require"
Expand All @@ -33,7 +34,10 @@ func BenchmarkParquetWriter(b *testing.B) {
require.NoError(b, err)

// Slice a single type out of supportedTypes.
sch := makeRandSchema(numCols, benchmarkTypes[i:i+1], rng)
sch := makeRandSchema(numCols,
func(rng *rand.Rand) *types.T {
return benchmarkTypes[i]
}, rng)
datums := makeRandDatums(1, sch, rng)

schemaDef, err := NewSchema(sch.columnNames, sch.columnTypes)
Expand All @@ -60,13 +64,16 @@ func BenchmarkParquetWriter(b *testing.B) {

func getBenchmarkTypes() []*types.T {
var typs []*types.T
for _, typ := range supportedTypes {
// NB: This depends on randgen.SeedTypes containing all scalar
// types supported by the writer, and all the types below once.
for _, typ := range randgen.SeedTypes {
switch typ.Family() {
case types.AnyFamily, types.TSQueryFamily, types.TSVectorFamily,
types.VoidFamily, types.TupleFamily:
// Remove Any Tuple.
case types.ArrayFamily:
// Pick out one array type to benchmark arrays.
if typ.ArrayContents() == types.Int {
typs = append(typs, typ)
}
// Replace Any Array with Int Array.
typs = append(typs, types.IntArray)
default:
typs = append(typs, typ)
}
Expand Down
63 changes: 29 additions & 34 deletions pkg/util/parquet/writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,41 +45,34 @@ func newColSchema(numCols int) *colSchema {
}
}

// supportedTypes contains all types supported by the writer,
// which is all types that pass randomized testing below.
var supportedTypes []*types.T

func init() {
for _, typ := range randgen.SeedTypes {
switch typ.Family() {
// The types below are unsupported. They will fail randomized tests.
case types.AnyFamily:
case types.TSQueryFamily, types.TSVectorFamily:
case types.VoidFamily:
case types.TupleFamily:
case types.ArrayFamily:
// We will manually add array types which are supported below.
// Excluding types.TupleFamily and types.ArrayFamily leaves us with only
// scalar types so far.
default:
supportedTypes = append(supportedTypes, typ)
// typSupported reports whether typ, as returned by randgen.RandType, should be
// kept for testing. It returns false for the families the writer does not
// support and true for everything else.
func typSupported(typ *types.T) bool {
	switch typ.Family() {
	case types.AnyFamily, types.TSQueryFamily, types.TSVectorFamily,
		types.TupleFamily, types.VoidFamily:
		return false
	case types.ArrayFamily:
		elem := typ.ArrayContents()
		// Nested arrays and arrays of tuples are rejected outright.
		switch elem.Family() {
		case types.ArrayFamily, types.TupleFamily:
			return false
		}
		// Otherwise an array is supported exactly when its element type is.
		return typSupported(elem)
	}
	// It is better to let an unexpected type pass the filter and fail the test
	// because we can observe and document such failures.
	return true
}

// randgen.SeedTypes does not include types.Json, so we add it manually here.
supportedTypes = append(supportedTypes, types.Json)

// Include all array types which are arrays of the scalar types above.
var arrayTypes []*types.T
for oid := range types.ArrayOids {
arrayTyp := types.OidToType[oid]
for _, typ := range supportedTypes {
if arrayTyp.InternalType.ArrayContents == typ {
arrayTypes = append(arrayTypes, arrayTyp)
}
}
// randTestingType returns a random type for testing.
func randTestingType(rng *rand.Rand) *types.T {
supported := false
var typ *types.T
for !supported {
typ = randgen.RandType(rng)
supported = typSupported(typ)
}
supportedTypes = append(supportedTypes, arrayTypes...)
return typ
}

func makeRandDatums(numRows int, sch *colSchema, rng *rand.Rand) [][]tree.Datum {
Expand All @@ -93,10 +86,12 @@ func makeRandDatums(numRows int, sch *colSchema, rng *rand.Rand) [][]tree.Datum
return datums
}

func makeRandSchema(numCols int, allowedTypes []*types.T, rng *rand.Rand) *colSchema {
func makeRandSchema(
numCols int, randType func(rng *rand.Rand) *types.T, rng *rand.Rand,
) *colSchema {
sch := newColSchema(numCols)
for i := 0; i < numCols; i++ {
sch.columnTypes[i] = allowedTypes[rng.Intn(len(allowedTypes))]
sch.columnTypes[i] = randType(rng)
sch.columnNames[i] = fmt.Sprintf("%s%d", sch.columnTypes[i].Name(), i)
}
return sch
Expand All @@ -111,7 +106,7 @@ func TestRandomDatums(t *testing.T) {
numCols := 128
maxRowGroupSize := int64(8)

sch := makeRandSchema(numCols, supportedTypes, rng)
sch := makeRandSchema(numCols, randTestingType, rng)
datums := makeRandDatums(numRows, sch, rng)

fileName := "TestRandomDatums.parquet"
Expand Down

0 comments on commit 8c36045

Please sign in to comment.