util/parquet: refactor random testing types

This change refactors randomized testing to use `randgen.RandType`. `randgen.RandType` is better because it takes into account all allowable types which can appear in CRDB (e.g. array of tuple). The previous code only generated random types which are supported by the writer, which leaves a gap when new types are added. Now, the code defaults to all types and filters out unsupported ones. The previous code also unnecessarily duplicated code from `randgen`. For example, generating a random tuple can be done by calling one method in `randgen`, whereas generating a random tuple using the previous code would require more complex work. Informs: cockroachdb#99028 Epic: https://cockroachlabs.atlassian.net/browse/CRDB-15071 Release note: None
jayshrivastava · Jun 21, 2023 · 8c36045 · 8c36045
1 parent 2ce1d54
commit 8c36045
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 56 deletions.
diff --git a/pkg/sql/randgen/type.go b/pkg/sql/randgen/type.go
@@ -26,8 +26,8 @@ var (
 	// SeedTypes includes the following types that form the basis of randomly
 	// generated types:
 	//   - All scalar types, except UNKNOWN and ANY
-	//   - ARRAY of ANY, where the ANY will be replaced with one of the legal
-	//     array element types in RandType
+	//   - ARRAY of ANY and TUPLE of ANY, where the ANY will be replaced with
+	//     one of the legal array element types in RandType
 	//   - OIDVECTOR and INT2VECTOR types
 	SeedTypes []*types.T
 
@@ -126,26 +126,26 @@ func RandTypeFromSlice(rng *rand.Rand, typs []*types.T) *types.T {
 			return types.MakeArray(inner)
 		}
 		if typ.ArrayContents().Family() == types.TupleFamily {
-			// Generate tuples between 0 and 4 datums in length
-			len := rng.Intn(5)
-			contents := make([]*types.T, len)
-			for i := range contents {
-				contents[i] = RandTypeFromSlice(rng, typs)
-			}
-			return types.MakeArray(types.MakeTuple(contents))
+			return types.MakeArray(RandTupleFromSlice(rng, typs))
 		}
 	case types.TupleFamily:
-		// Generate tuples between 0 and 4 datums in length
-		len := rng.Intn(5)
-		contents := make([]*types.T, len)
-		for i := range contents {
-			contents[i] = RandTypeFromSlice(rng, typs)
-		}
-		return types.MakeTuple(contents)
+		return RandTupleFromSlice(rng, typs)
 	}
 	return typ
 }
 
+// RandTupleFromSlice returns a random tuple whose fields are chosen randomly
+// from the input slice of types.
+func RandTupleFromSlice(rng *rand.Rand, typs []*types.T) *types.T {
+	// Generate tuples between 0 and 4 datums in length
+	len := rng.Intn(5)
+	contents := make([]*types.T, len)
+	for i := range contents {
+		contents[i] = RandTypeFromSlice(rng, typs)
+	}
+	return types.MakeTuple(contents)
+}
+
 // RandColumnType returns a random type that is a legal column type (e.g. no
 // nested arrays or tuples).
 func RandColumnType(rng *rand.Rand) *types.T {

diff --git a/pkg/util/parquet/writer_bench_test.go b/pkg/util/parquet/writer_bench_test.go
@@ -15,6 +15,7 @@ import (
 	"os"
 	"testing"
 
+	"github.com/cockroachdb/cockroach/pkg/sql/randgen"
 	"github.com/cockroachdb/cockroach/pkg/sql/types"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/stretchr/testify/require"
@@ -33,7 +34,10 @@ func BenchmarkParquetWriter(b *testing.B) {
 			require.NoError(b, err)
 
 			// Slice a single type out of supportedTypes.
-			sch := makeRandSchema(numCols, benchmarkTypes[i:i+1], rng)
+			sch := makeRandSchema(numCols,
+				func(rng *rand.Rand) *types.T {
+					return benchmarkTypes[i]
+				}, rng)
 			datums := makeRandDatums(1, sch, rng)
 
 			schemaDef, err := NewSchema(sch.columnNames, sch.columnTypes)
@@ -60,13 +64,16 @@ func BenchmarkParquetWriter(b *testing.B) {
 
 func getBenchmarkTypes() []*types.T {
 	var typs []*types.T
-	for _, typ := range supportedTypes {
+	// NB: This depends on randgen.SeedTypes containing all scalar types
+	// supported by the writer, and each of the types handled below exactly once.
+	for _, typ := range randgen.SeedTypes {
 		switch typ.Family() {
+		case types.AnyFamily, types.TSQueryFamily, types.TSVectorFamily,
+			types.VoidFamily, types.TupleFamily:
+			// Remove Any Tuple.
 		case types.ArrayFamily:
-			// Pick out one array type to benchmark arrays.
-			if typ.ArrayContents() == types.Int {
-				typs = append(typs, typ)
-			}
+			// Replace Any Array with Int Array.
+			typs = append(typs, types.IntArray)
 		default:
 			typs = append(typs, typ)
 		}

diff --git a/pkg/util/parquet/writer_test.go b/pkg/util/parquet/writer_test.go
@@ -45,41 +45,34 @@ func newColSchema(numCols int) *colSchema {
 	}
 }
 
-// supportedTypes contains all types supported by the writer,
-// which is all types that pass randomized testing below.
-var supportedTypes []*types.T
-
-func init() {
-	for _, typ := range randgen.SeedTypes {
-		switch typ.Family() {
-		// The types below are unsupported. They will fail randomized tests.
-		case types.AnyFamily:
-		case types.TSQueryFamily, types.TSVectorFamily:
-		case types.VoidFamily:
-		case types.TupleFamily:
-		case types.ArrayFamily:
-			// We will manually add array types which are supported below.
-			// Excluding types.TupleFamily and types.ArrayFamily leaves us with only
-			// scalar types so far.
-		default:
-			supportedTypes = append(supportedTypes, typ)
+// typSupported filters out types which can be returned by randgen.RandType
+// that are not supported by the writer.
+func typSupported(typ *types.T) bool {
+	switch typ.Family() {
+	case types.AnyFamily, types.TSQueryFamily, types.TSVectorFamily,
+		types.TupleFamily, types.VoidFamily:
+		return false
+	case types.ArrayFamily:
+		if typ.ArrayContents().Family() == types.ArrayFamily || typ.ArrayContents().Family() == types.TupleFamily {
+			return false
 		}
+		return typSupported(typ.ArrayContents())
+	default:
+		// It is better to let an unexpected type pass the filter and fail the test
+		// because we can observe and document such failures.
+		return true
 	}
+}
 
-	// randgen.SeedTypes does not include types.Json, so we add it manually here.
-	supportedTypes = append(supportedTypes, types.Json)
-
-	// Include all array types which are arrays of the scalar types above.
-	var arrayTypes []*types.T
-	for oid := range types.ArrayOids {
-		arrayTyp := types.OidToType[oid]
-		for _, typ := range supportedTypes {
-			if arrayTyp.InternalType.ArrayContents == typ {
-				arrayTypes = append(arrayTypes, arrayTyp)
-			}
-		}
+// randTestingType returns a random type for testing.
+func randTestingType(rng *rand.Rand) *types.T {
+	supported := false
+	var typ *types.T
+	for !supported {
+		typ = randgen.RandType(rng)
+		supported = typSupported(typ)
 	}
-	supportedTypes = append(supportedTypes, arrayTypes...)
+	return typ
 }
 
 func makeRandDatums(numRows int, sch *colSchema, rng *rand.Rand) [][]tree.Datum {
@@ -93,10 +86,12 @@ func makeRandDatums(numRows int, sch *colSchema, rng *rand.Rand) [][]tree.Datum
 	return datums
 }
 
-func makeRandSchema(numCols int, allowedTypes []*types.T, rng *rand.Rand) *colSchema {
+func makeRandSchema(
+	numCols int, randType func(rng *rand.Rand) *types.T, rng *rand.Rand,
+) *colSchema {
 	sch := newColSchema(numCols)
 	for i := 0; i < numCols; i++ {
-		sch.columnTypes[i] = allowedTypes[rng.Intn(len(allowedTypes))]
+		sch.columnTypes[i] = randType(rng)
 		sch.columnNames[i] = fmt.Sprintf("%s%d", sch.columnTypes[i].Name(), i)
 	}
 	return sch
@@ -111,7 +106,7 @@ func TestRandomDatums(t *testing.T) {
 	numCols := 128
 	maxRowGroupSize := int64(8)
 
-	sch := makeRandSchema(numCols, supportedTypes, rng)
+	sch := makeRandSchema(numCols, randTestingType, rng)
 	datums := makeRandDatums(numRows, sch, rng)
 
 	fileName := "TestRandomDatums.parquet"