Skip to content

Commit

Permalink
schema: data sizes are configurable
Browse files Browse the repository at this point in the history
The data sizes are configurable through the SchemaConfig object.
It propagates most of the config through to the runtime during the
initial schema generation. However, the sizes of actual data such as strings
and blobs are carried through to the GenValue functions via the
PartitionRange objects.

The default dataset size is "large", which is suitable for a real large-scale
testing setup, while the development scripts use the new
"dataset-size" switch to set it to "small" during development.
  • Loading branch information
Henrik Johansson committed Jun 20, 2019
1 parent 4681919 commit ac43aea
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 60 deletions.
67 changes: 55 additions & 12 deletions cmd/gemini/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ var (
maxPartitionKeys int
maxClusteringKeys int
maxColumns int
datasetSize string
)

const (
Expand Down Expand Up @@ -184,6 +185,7 @@ func run(cmd *cobra.Command, args []string) {
}
defer outFile.Sync()

schemaConfig := createSchemaConfig()
var schema *gemini.Schema
if len(schemaFile) > 0 {
var err error
Expand All @@ -193,13 +195,7 @@ func run(cmd *cobra.Command, args []string) {
return
}
} else {
sc := gemini.SchemaConfig{
CompactionStrategy: getCompactionStrategy(compactionStrategy),
MaxPartitionKeys: maxPartitionKeys,
MaxClusteringKeys: maxClusteringKeys,
MaxColumns: maxColumns,
}
schema = gemini.GenSchema(sc)
schema = gemini.GenSchema(schemaConfig)
}

jsonSchema, _ := json.MarshalIndent(schema, "", " ")
Expand Down Expand Up @@ -230,7 +226,49 @@ func run(cmd *cobra.Command, args []string) {
}
}

runJob(Job, schema, store, mode, outFile)
runJob(Job, schema, schemaConfig, store, mode, outFile)
}

// createSchemaConfig builds the SchemaConfig used for schema generation and
// runtime data sizing, honoring the --dataset-size flag.
//
// It starts from the "large" defaults and, for "small", shrinks only the
// data-size knobs (UDT/tuple parts, blob/string lengths). Starting from the
// default config — instead of copying fields one by one — guarantees that
// every SchemaConfig field (including any added later) keeps its default
// unless explicitly overridden here.
func createSchemaConfig() *gemini.SchemaConfig {
	cfg := createDefaultSchemaConfig()
	if strings.ToLower(datasetSize) == "small" {
		cfg.MaxUDTParts = 2
		cfg.MaxTupleParts = 2
		cfg.MaxBlobLength = 20
		cfg.MaxStringLength = 20
	}
	// Any value other than "small" (including the default "large") uses the
	// unmodified defaults.
	return cfg
}

// createDefaultSchemaConfig returns the "large" dataset configuration.
// Structural limits (partition keys, clustering keys, columns, compaction
// strategy) come from the corresponding CLI flags; the data-size limits
// below are the generous defaults suitable for full-scale test runs.
func createDefaultSchemaConfig() *gemini.SchemaConfig {
	cfg := &gemini.SchemaConfig{
		CompactionStrategy: getCompactionStrategy(compactionStrategy),
		MaxPartitionKeys:   maxPartitionKeys,
		MaxClusteringKeys:  maxClusteringKeys,
		MaxColumns:         maxColumns,
		MaxUDTParts:        20,    // fields per generated UDT
		MaxTupleParts:      20,    // elements per generated tuple
		MaxBlobLength:      10000, // upper bound for blob payloads
		MinBlobLength:      0,
		MaxStringLength:    1000, // upper bound for text/varchar values
		MinStringLength:    0,
	}
	return cfg
}

func createClusters(consistency gocql.Consistency) (*gocql.ClusterConfig, *gocql.ClusterConfig) {
Expand Down Expand Up @@ -270,7 +308,7 @@ func getCompactionStrategy(cs string) *gemini.CompactionStrategy {
}
}

func runJob(f testJob, schema *gemini.Schema, s store.Store, mode string, out *os.File) {
func runJob(f testJob, schema *gemini.Schema, schemaConfig *gemini.SchemaConfig, s store.Store, mode string, out *os.File) {
defer out.Sync()
c := make(chan Status, 10000)
minRange := 0
Expand All @@ -290,9 +328,13 @@ func runJob(f testJob, schema *gemini.Schema, s store.Store, mode string, out *o
for _, table := range schema.Tables {
for i := 0; i < concurrency; i++ {
p := gemini.PartitionRange{
Min: minRange + i*maxRange,
Max: maxRange + i*maxRange,
Rand: rand.New(rand.NewSource(int64(seed))),
Min: minRange + i*maxRange,
Max: maxRange + i*maxRange,
Rand: rand.New(rand.NewSource(int64(seed))),
MaxBlobLength: schemaConfig.MaxBlobLength,
MinBlobLength: schemaConfig.MinBlobLength,
MaxStringLength: schemaConfig.MaxStringLength,
MinStringLength: schemaConfig.MinStringLength,
}
go f(workerCtx, pump.ch, &workers, schema, table, s, p, c, mode, out, warmup)
}
Expand Down Expand Up @@ -474,6 +516,7 @@ func init() {
rootCmd.Flags().IntVarP(&maxPartitionKeys, "max-partition-keys", "", 2, "Maximum number of generated partition keys")
rootCmd.Flags().IntVarP(&maxClusteringKeys, "max-clustering-keys", "", 4, "Maximum number of generated clustering keys")
rootCmd.Flags().IntVarP(&maxColumns, "max-columns", "", 16, "Maximum number of generated columns")
rootCmd.Flags().StringVarP(&datasetSize, "dataset-size", "", "large", "Specify the type of dataset size to use, small|large")
}

func printSetup() error {
Expand Down
20 changes: 15 additions & 5 deletions schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ type SchemaConfig struct {
MaxPartitionKeys int
MaxClusteringKeys int
MaxColumns int
MaxUDTParts int
MaxTupleParts int
MaxBlobLength int
MaxStringLength int
MinBlobLength int
MinStringLength int
}

type Keyspace struct {
Expand Down Expand Up @@ -178,9 +184,13 @@ type Schema struct {
}

type PartitionRange struct {
Min int `default:0`
Max int `default:100`
Rand *rand.Rand
Min int `default:0`
Max int `default:100`
Rand *rand.Rand
MaxBlobLength int
MinBlobLength int
MaxStringLength int
MinStringLength int
}

func (s *Schema) GetDropSchema() []string {
Expand All @@ -189,7 +199,7 @@ func (s *Schema) GetDropSchema() []string {
}
}

func GenSchema(sc SchemaConfig) *Schema {
func GenSchema(sc *SchemaConfig) *Schema {
builder := NewSchemaBuilder()
keyspace := Keyspace{
Name: "ks1",
Expand All @@ -208,7 +218,7 @@ func GenSchema(sc SchemaConfig) *Schema {
var columns []ColumnDef
numColumns := rand.Intn(sc.MaxColumns)
for i := 0; i < numColumns; i++ {
columns = append(columns, ColumnDef{Name: genColumnName("col", i), Type: genColumnType(numColumns)})
columns = append(columns, ColumnDef{Name: genColumnName("col", i), Type: genColumnType(numColumns, sc)})
}
var indexes []IndexDef
if numColumns > 0 {
Expand Down
1 change: 1 addition & 0 deletions scripts/gemini-launcher
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ until docker logs ${TEST_NAME} | grep "Starting listening for CQL clients" > /de
$GEMINI_CMD \
--duration=10m \
--fail-fast \
--dataset-size=small \
--test-cluster=${TEST_IP} \
--oracle-cluster=${ORACLE_IP} \
"$@"
Expand Down
57 changes: 25 additions & 32 deletions types.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,6 @@ const (
TYPE_UUID = SimpleType("uuid")
TYPE_VARCHAR = SimpleType("varchar")
TYPE_VARINT = SimpleType("varint")

MaxBlobLength = 1e4
MinBlobLength = 0
MaxStringLength = 1000
MinStringLength = 0
MaxTupleParts = 20
MaxUDTParts = 20
)

// TODO: Add support for time when gocql bug is fixed.
Expand Down Expand Up @@ -117,10 +110,10 @@ func (st SimpleType) GenValue(p *PartitionRange) []interface{} {
var val interface{}
switch st {
case TYPE_ASCII, TYPE_TEXT, TYPE_VARCHAR:
ln := p.Rand.Intn(MaxStringLength) + MinStringLength
ln := p.Rand.Intn(p.MaxStringLength) + p.MinStringLength
val = randStringWithTime(p.Rand, ln, randTime(p.Rand))
case TYPE_BLOB:
ln := p.Rand.Intn(MaxBlobLength) + MinBlobLength
ln := p.Rand.Intn(p.MaxBlobLength) + p.MinBlobLength
val = hex.EncodeToString([]byte(randStringWithTime(p.Rand, ln, randTime(p.Rand))))
case TYPE_BIGINT:
val = p.Rand.Int63()
Expand Down Expand Up @@ -517,50 +510,50 @@ func genColumnName(prefix string, idx int) string {
return fmt.Sprintf("%s%d", prefix, idx)
}

func genColumnType(numColumns int) Type {
func genColumnType(numColumns int, sc *SchemaConfig) Type {
n := rand.Intn(numColumns + 5)
switch n {
case numColumns:
return genTupleType()
return genTupleType(sc)
case numColumns + 1:
return genUDTType()
return genUDTType(sc)
case numColumns + 2:
return genSetType()
return genSetType(sc)
case numColumns + 3:
return genListType()
return genListType(sc)
case numColumns + 4:
return genMapType()
return genMapType(sc)
default:
return genSimpleType()
return genSimpleType(sc)
}
}

func genSimpleType() SimpleType {
func genSimpleType(sc *SchemaConfig) SimpleType {
return types[rand.Intn(len(types))]
}

func genTupleType() Type {
n := rand.Intn(MaxTupleParts)
func genTupleType(sc *SchemaConfig) Type {
n := rand.Intn(sc.MaxTupleParts)
if n < 2 {
n = 2
}
typeList := make([]SimpleType, n, n)
for i := 0; i < n; i++ {
typeList[i] = genSimpleType()
typeList[i] = genSimpleType(sc)
}
return TupleType{
Types: typeList,
Frozen: rand.Uint32()%2 == 0,
}
}

func genUDTType() UDTType {
func genUDTType(sc *SchemaConfig) UDTType {
udtNum := rand.Uint32()
typeName := fmt.Sprintf("udt_%d", udtNum)
ts := make(map[string]SimpleType)

for i := 0; i < rand.Intn(MaxUDTParts)+1; i++ {
ts[typeName+fmt.Sprintf("_%d", i)] = genSimpleType()
for i := 0; i < rand.Intn(sc.MaxUDTParts)+1; i++ {
ts[typeName+fmt.Sprintf("_%d", i)] = genSimpleType(sc)
}

return UDTType{
Expand All @@ -570,18 +563,18 @@ func genUDTType() UDTType {
}
}

func genSetType() BagType {
return genBagType("set")
func genSetType(sc *SchemaConfig) BagType {
return genBagType("set", sc)
}

func genListType() BagType {
return genBagType("list")
func genListType(sc *SchemaConfig) BagType {
return genBagType("list", sc)
}

func genBagType(kind string) BagType {
func genBagType(kind string, sc *SchemaConfig) BagType {
var t SimpleType
for {
t = genSimpleType()
t = genSimpleType(sc)
if t != TYPE_DURATION {
break
}
Expand All @@ -593,17 +586,17 @@ func genBagType(kind string) BagType {
}
}

func genMapType() MapType {
func genMapType(sc *SchemaConfig) MapType {
var t SimpleType
for {
t = genSimpleType()
t = genSimpleType(sc)
if t != TYPE_DURATION {
break
}
}
return MapType{
KeyType: t,
ValueType: genSimpleType(),
ValueType: genSimpleType(sc),
Frozen: rand.Uint32()%2 == 0,
}
}
Expand Down
26 changes: 15 additions & 11 deletions types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,26 +211,30 @@ func TestCQLPretty(t *testing.T) {
}

func TestMarshalUnmarshal(t *testing.T) {
sc := &SchemaConfig{
MaxTupleParts: 2,
MaxUDTParts: 2,
}
columns := Columns{
{
Name: genColumnName("col", 0),
Type: genMapType(),
Type: genMapType(sc),
},
{
Name: genColumnName("col", 1),
Type: genSetType(),
Type: genSetType(sc),
},
{
Name: genColumnName("col", 2),
Type: genListType(),
Type: genListType(sc),
},
{
Name: genColumnName("col", 3),
Type: genTupleType(),
Type: genTupleType(sc),
},
{
Name: genColumnName("col", 4),
Type: genUDTType(),
Type: genUDTType(sc),
},
}
s1 := &Schema{
Expand All @@ -240,13 +244,13 @@ func TestMarshalUnmarshal(t *testing.T) {
PartitionKeys: Columns{
{
Name: genColumnName("pk", 0),
Type: genSimpleType(),
Type: genSimpleType(sc),
},
},
ClusteringKeys: Columns{
{
Name: genColumnName("ck", 0),
Type: genSimpleType(),
Type: genSimpleType(sc),
},
},
Columns: columns,
Expand All @@ -266,21 +270,21 @@ func TestMarshalUnmarshal(t *testing.T) {
PartitionKeys: []ColumnDef{
{
Name: "pk_mv_0",
Type: genListType(),
Type: genListType(sc),
},
{
Name: "pk_mv_1",
Type: genTupleType(),
Type: genTupleType(sc),
},
},
ClusteringKeys: []ColumnDef{
{
Name: "ck_mv_0",
Type: genSetType(),
Type: genSetType(sc),
},
{
Name: "ck_mv_1",
Type: genUDTType(),
Type: genUDTType(sc),
},
},
},
Expand Down

0 comments on commit ac43aea

Please sign in to comment.