From 996cf44cc3b653c63c824acd4e280af662dca4cb Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Thu, 11 Jul 2019 11:12:45 -0400 Subject: [PATCH] exec: Add support for vectorized engine to use builtin functions. Added support for the vectorized engine to use all builtin functions that are supported by distsql. The mechanism for doing this is relatively ineffecient however -- it involves converting the needed row elements into datums, and the result back from a datum once the function application is complete. Future work could specialize these operators to specific functions to avoid these type switching operations. Implementation of this feature includes refactoring the materializer's batch to row code into a separate, reusable function. Release note: None --- pkg/sql/distsqlrun/column_exec_setup.go | 24 +++ pkg/sql/distsqlrun/materializer.go | 45 +---- pkg/sql/exec/builtin_funcs.go | 127 ++++++++++++++ pkg/sql/exec/builtin_funcs_test.go | 165 ++++++++++++++++++ pkg/sql/exec/coldata/vec_tmpl.go | 12 ++ pkg/sql/exec/vec_elem_to_datum.go | 65 +++++++ .../logictest/testdata/logic_test/vectorize | 36 ++++ pkg/sql/sem/tree/expr.go | 5 + 8 files changed, 436 insertions(+), 43 deletions(-) create mode 100644 pkg/sql/exec/builtin_funcs.go create mode 100644 pkg/sql/exec/builtin_funcs_test.go create mode 100644 pkg/sql/exec/vec_elem_to_datum.go diff --git a/pkg/sql/distsqlrun/column_exec_setup.go b/pkg/sql/distsqlrun/column_exec_setup.go index e7bf967915fd..096622778fea 100644 --- a/pkg/sql/distsqlrun/column_exec_setup.go +++ b/pkg/sql/distsqlrun/column_exec_setup.go @@ -647,6 +647,30 @@ func planProjectionOperators( return planProjectionExpr(ctx, t.Operator, t.TypedLeft(), t.TypedRight(), columnTypes, input) case *tree.BinaryExpr: return planProjectionExpr(ctx, t.Operator, t.TypedLeft(), t.TypedRight(), columnTypes, input) + case *tree.FuncExpr: + var ( + inputCols []int + projectionMem int + ) + ct = columnTypes + op = input + for _, e := range t.Exprs { + var err error + // TODO(rohany): This could be done better, especially in the case of + // constant arguments, because the vectorized engine right now + // creates a new column full of the constant value. + op, resultIdx, ct, projectionMem, err = planProjectionOperators(ctx, e.(tree.TypedExpr), ct, op) + if err != nil { + return nil, resultIdx, nil, memUsed, err + } + inputCols = append(inputCols, resultIdx) + memUsed += projectionMem + } + funcOutputType := t.ResolvedType() + resultIdx = len(ct) + ct = append(ct, *funcOutputType) + op = exec.NewBuiltinFunctionOperator(ctx, t, ct, inputCols, resultIdx, op) + return op, resultIdx, ct, memUsed, nil case tree.Datum: datumType := t.ResolvedType() ct := columnTypes diff --git a/pkg/sql/distsqlrun/materializer.go b/pkg/sql/distsqlrun/materializer.go index fe2dd72b5d1d..b6471708997f 100644 --- a/pkg/sql/distsqlrun/materializer.go +++ b/pkg/sql/distsqlrun/materializer.go @@ -12,8 +12,6 @@ package distsqlrun import ( "context" - "fmt" - "unsafe" "github.com/cockroachdb/cockroach/pkg/sql/distsqlpb" "github.com/cockroachdb/cockroach/pkg/sql/exec" @@ -21,8 +19,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" "github.com/cockroachdb/cockroach/pkg/sql/types" - "github.com/cockroachdb/cockroach/pkg/util/timeutil/pgdate" - "github.com/lib/pq/oid" ) // materializer converts an exec.Operator input into a RowSource. @@ -140,46 +136,9 @@ func (m *materializer) Next() (sqlbase.EncDatumRow, *distsqlpb.ProducerMetadata) continue } } - ct := typs[outIdx] - switch ct.Family() { - case types.BoolFamily: - if col.Bool()[rowIdx] { - m.row[outIdx].Datum = tree.DBoolTrue - } else { - m.row[outIdx].Datum = tree.DBoolFalse - } - case types.IntFamily: - switch ct.Width() { - case 8: - m.row[outIdx].Datum = m.da.NewDInt(tree.DInt(col.Int8()[rowIdx])) - case 16: - m.row[outIdx].Datum = m.da.NewDInt(tree.DInt(col.Int16()[rowIdx])) - case 32: - m.row[outIdx].Datum = m.da.NewDInt(tree.DInt(col.Int32()[rowIdx])) - default: - m.row[outIdx].Datum = m.da.NewDInt(tree.DInt(col.Int64()[rowIdx])) - } - case types.FloatFamily: - m.row[outIdx].Datum = m.da.NewDFloat(tree.DFloat(col.Float64()[rowIdx])) - case types.DecimalFamily: - m.row[outIdx].Datum = m.da.NewDDecimal(tree.DDecimal{Decimal: col.Decimal()[rowIdx]}) - case types.DateFamily: - m.row[outIdx].Datum = tree.NewDDate(pgdate.MakeCompatibleDateFromDisk(col.Int64()[rowIdx])) - case types.StringFamily: - b := col.Bytes()[rowIdx] - if ct.Oid() == oid.T_name { - m.row[outIdx].Datum = m.da.NewDString(tree.DString(*(*string)(unsafe.Pointer(&b)))) - } else { - m.row[outIdx].Datum = m.da.NewDName(tree.DString(*(*string)(unsafe.Pointer(&b)))) - } - case types.BytesFamily: - m.row[outIdx].Datum = m.da.NewDBytes(tree.DBytes(col.Bytes()[rowIdx])) - case types.OidFamily: - m.row[outIdx].Datum = m.da.NewDOid(tree.MakeDOid(tree.DInt(col.Int64()[rowIdx]))) - default: - panic(fmt.Sprintf("Unsupported column type %s", ct.String())) - } + + m.row[outIdx].Datum = exec.PhysicalTypeColElemToDatum(col, rowIdx, m.da, ct) } return m.ProcessRowHelper(m.row), nil } diff --git a/pkg/sql/exec/builtin_funcs.go b/pkg/sql/exec/builtin_funcs.go new file mode 100644 index 000000000000..14bfa606916c --- /dev/null +++ b/pkg/sql/exec/builtin_funcs.go @@ -0,0 +1,127 @@ +// Copyright 2019 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package exec + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/sql/exec/coldata" + "github.com/cockroachdb/cockroach/pkg/sql/exec/types" + "github.com/cockroachdb/cockroach/pkg/sql/exec/types/conv" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" + semtypes "github.com/cockroachdb/cockroach/pkg/sql/types" +) + +type defaultBuiltinFuncOperator struct { + input Operator + evalCtx *tree.EvalContext + funcExpr *tree.FuncExpr + columnTypes []semtypes.T + inputCols []int + outputIdx int + outputType *semtypes.T + outputPhysType types.T + converter func(tree.Datum) (interface{}, error) + + row tree.Datums + da sqlbase.DatumAlloc +} + +func (b *defaultBuiltinFuncOperator) Init() { + b.input.Init() +} + +func (b *defaultBuiltinFuncOperator) Next(ctx context.Context) coldata.Batch { + batch := b.input.Next(ctx) + n := batch.Length() + if n == 0 { + return batch + } + + if b.outputIdx == batch.Width() { + batch.AppendCol(b.outputPhysType) + } + + sel := batch.Selection() + for i := uint16(0); i < n; i++ { + rowIdx := i + if sel != nil { + rowIdx = sel[i] + } + + hasNulls := false + + for j := range b.inputCols { + col := batch.ColVec(b.inputCols[j]) + if col.MaybeHasNulls() && col.Nulls().NullAt(rowIdx) { + hasNulls = true + b.row[j] = tree.DNull + } else { + b.row[j] = PhysicalTypeColElemToDatum(col, rowIdx, b.da, b.columnTypes[b.inputCols[j]]) + } + } + + var ( + res tree.Datum + err error + ) + // Some functions cannot handle null arguments. + if hasNulls && !b.funcExpr.CanHandleNulls() { + res = tree.DNull + } else { + res, err = b.funcExpr.ResolvedOverload().Fn(b.evalCtx, b.row) + if err != nil { + panic(err) + } + } + + // Convert the datum into a physical type and write it out. + if res == tree.DNull { + batch.ColVec(b.outputIdx).Nulls().SetNull(uint16(rowIdx)) + } else { + converted, err := b.converter(res) + if err != nil { + panic(err) + } + coldata.SetValueAt(batch.ColVec(b.outputIdx), converted, rowIdx, b.outputPhysType) + } + } + return batch +} + +// NewBuiltinFunctionOperator returns an operator that applies builtin functions. +func NewBuiltinFunctionOperator( + evalCtx *tree.EvalContext, + funcExpr *tree.FuncExpr, + columnTypes []semtypes.T, + inputCols []int, + outputIdx int, + input Operator, +) Operator { + + outputType := funcExpr.ResolvedType() + + // For now, return the default builtin operator. Future work can specialize + // out the operators to efficient implementations of specific builtins. + return &defaultBuiltinFuncOperator{ + input: input, + evalCtx: evalCtx, + funcExpr: funcExpr, + outputIdx: outputIdx, + columnTypes: columnTypes, + outputType: outputType, + outputPhysType: conv.FromColumnType(outputType), + converter: conv.GetDatumToPhysicalFn(outputType), + row: make(tree.Datums, len(inputCols)), + inputCols: inputCols, + } +} diff --git a/pkg/sql/exec/builtin_funcs_test.go b/pkg/sql/exec/builtin_funcs_test.go new file mode 100644 index 000000000000..440962df53cd --- /dev/null +++ b/pkg/sql/exec/builtin_funcs_test.go @@ -0,0 +1,165 @@ +// Copyright 2019 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package exec + +import ( + "context" + "fmt" + "math/rand" + "testing" + + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/sql/exec/coldata" + "github.com/cockroachdb/cockroach/pkg/sql/exec/types" + "github.com/cockroachdb/cockroach/pkg/sql/parser" + "github.com/cockroachdb/cockroach/pkg/sql/sem/builtins" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + semtypes "github.com/cockroachdb/cockroach/pkg/sql/types" +) + +// Mock typing context for the typechecker. +type mockTypeContext struct { + typs []semtypes.T +} + +func (p *mockTypeContext) IndexedVarEval(idx int, ctx *tree.EvalContext) (tree.Datum, error) { + return tree.DNull.Eval(ctx) +} + +func (p *mockTypeContext) IndexedVarResolvedType(idx int) *semtypes.T { + return &p.typs[idx] +} + +func (p *mockTypeContext) IndexedVarNodeFormatter(idx int) tree.NodeFormatter { + n := tree.Name(fmt.Sprintf("$%d", idx)) + return &n +} + +func TestBasicBuiltinFunctions(t *testing.T) { + // Trick to get the init() for the builtins package to run. + _ = builtins.AllBuiltinNames + + testCases := []struct { + desc string + expr string + inputCols []int + inputTuples tuples + inputTypes []semtypes.T + outputTypes []semtypes.T + outputTuples tuples + }{ + { + desc: "Absolute value", + expr: "abs(@1)", + inputCols: []int{0}, + inputTuples: tuples{{1}, {-1}}, + inputTypes: []semtypes.T{*semtypes.Int}, + outputTuples: tuples{{1}, {1}}, + outputTypes: []semtypes.T{*semtypes.Int, *semtypes.Int}, + }, + { + desc: "String length", + expr: "length(@1)", + inputCols: []int{0}, + inputTuples: tuples{{"Hello"}, {"The"}}, + inputTypes: []semtypes.T{*semtypes.String}, + outputTuples: tuples{{5}, {3}}, + outputTypes: []semtypes.T{*semtypes.String, *semtypes.Int}, + }, + } + + tctx := tree.NewTestingEvalContext(cluster.MakeTestingClusterSettings()) + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + runTests(t, []tuples{tc.inputTuples}, tc.outputTuples, orderedVerifier, []int{1}, + func(input []Operator) (Operator, error) { + expr, err := parser.ParseExpr(tc.expr) + if err != nil { + t.Fatal(err) + } + + p := &mockTypeContext{typs: tc.inputTypes} + typedExpr, err := tree.TypeCheck(expr, &tree.SemaContext{IVarContainer: p}, semtypes.Any) + if err != nil { + t.Fatal(err) + } + + return NewBuiltinFunctionOperator(tctx, typedExpr.(*tree.FuncExpr), tc.outputTypes, tc.inputCols, 1, input[0]), nil + }) + }) + } +} + +func benchmarkBuiltinFunctions(b *testing.B, useSelectionVector bool, hasNulls bool) { + ctx := context.Background() + tctx := tree.NewTestingEvalContext(cluster.MakeTestingClusterSettings()) + + batch := coldata.NewMemBatch([]types.T{types.Int64}) + col := batch.ColVec(0).Int64() + + for i := int64(0); i < coldata.BatchSize; i++ { + if float64(i) < coldata.BatchSize*selectivity { + col[i] = -1 + } else { + col[i] = 1 + } + } + + if hasNulls { + for i := 0; i < coldata.BatchSize; i++ { + if rand.Float64() < nullProbability { + batch.ColVec(0).Nulls().SetNull(uint16(i)) + } + } + } + + batch.SetLength(coldata.BatchSize) + + if useSelectionVector { + batch.SetSelection(true) + sel := batch.Selection() + for i := int64(0); i < coldata.BatchSize; i++ { + sel[i] = uint16(i) + } + } + + source := NewRepeatableBatchSource(batch) + source.Init() + + expr, err := parser.ParseExpr("abs(@1)") + if err != nil { + b.Fatal(err) + } + p := &mockTypeContext{typs: []semtypes.T{*semtypes.Int}} + typedExpr, err := tree.TypeCheck(expr, &tree.SemaContext{IVarContainer: p}, semtypes.Any) + if err != nil { + b.Fatal(err) + } + op := NewBuiltinFunctionOperator(tctx, typedExpr.(*tree.FuncExpr), []semtypes.T{*semtypes.Int}, []int{0}, 1, source) + + b.SetBytes(int64(8 * coldata.BatchSize)) + b.ResetTimer() + for i := 0; i < b.N; i++ { + op.Next(ctx) + } +} + +func BenchmarkBuiltinFunctions(b *testing.B) { + _ = builtins.AllBuiltinNames + for _, useSel := range []bool{true, false} { + for _, hasNulls := range []bool{true, false} { + b.Run(fmt.Sprintf("useSel=%t,hasNulls=%t", useSel, hasNulls), func(b *testing.B) { + benchmarkBuiltinFunctions(b, useSel, hasNulls) + }) + } + } +} diff --git a/pkg/sql/exec/coldata/vec_tmpl.go b/pkg/sql/exec/coldata/vec_tmpl.go index 847e64973491..b3b08c7f7df4 100644 --- a/pkg/sql/exec/coldata/vec_tmpl.go +++ b/pkg/sql/exec/coldata/vec_tmpl.go @@ -190,3 +190,15 @@ func (m *memColumn) PrettyValueAt(colIdx uint16, colType types.T) string { panic(fmt.Sprintf("unhandled type %d", colType)) } } + +// Helper to set the value in a Vec when the type is unknown. +func SetValueAt(v Vec, elem interface{}, rowIdx uint16, colType types.T) { + switch colType { + // {{range .}} + case _TYPES_T: + v._TemplateType()[rowIdx] = elem.(_GOTYPE) + // {{end}} + default: + panic(fmt.Sprintf("unhandled type %d", colType)) + } +} diff --git a/pkg/sql/exec/vec_elem_to_datum.go b/pkg/sql/exec/vec_elem_to_datum.go new file mode 100644 index 000000000000..3b661edefad0 --- /dev/null +++ b/pkg/sql/exec/vec_elem_to_datum.go @@ -0,0 +1,65 @@ +// Copyright 2019 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package exec + +import ( + "fmt" + "unsafe" + + "github.com/cockroachdb/cockroach/pkg/sql/exec/coldata" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" + semtypes "github.com/cockroachdb/cockroach/pkg/sql/types" + "github.com/cockroachdb/cockroach/pkg/util/timeutil/pgdate" + "github.com/lib/pq/oid" +) + +// PhysicalTypeColElemToDatum converts an element in a colvec to a datum of semtype ct. +func PhysicalTypeColElemToDatum( + col coldata.Vec, rowIdx uint16, da sqlbase.DatumAlloc, ct semtypes.T, +) tree.Datum { + switch ct.Family() { + case semtypes.BoolFamily: + if col.Bool()[rowIdx] { + return tree.DBoolTrue + } + return tree.DBoolFalse + case semtypes.IntFamily: + switch ct.Width() { + case 8: + return da.NewDInt(tree.DInt(col.Int8()[rowIdx])) + case 16: + return da.NewDInt(tree.DInt(col.Int16()[rowIdx])) + case 32: + return da.NewDInt(tree.DInt(col.Int32()[rowIdx])) + default: + return da.NewDInt(tree.DInt(col.Int64()[rowIdx])) + } + case semtypes.FloatFamily: + return da.NewDFloat(tree.DFloat(col.Float64()[rowIdx])) + case semtypes.DecimalFamily: + return da.NewDDecimal(tree.DDecimal{Decimal: col.Decimal()[rowIdx]}) + case semtypes.DateFamily: + return tree.NewDDate(pgdate.MakeCompatibleDateFromDisk(col.Int64()[rowIdx])) + case semtypes.StringFamily: + b := col.Bytes()[rowIdx] + if ct.Oid() == oid.T_name { + return da.NewDName(tree.DString(*(*string)(unsafe.Pointer(&b)))) + } + return da.NewDString(tree.DString(*(*string)(unsafe.Pointer(&b)))) + case semtypes.BytesFamily: + return da.NewDBytes(tree.DBytes(col.Bytes()[rowIdx])) + case semtypes.OidFamily: + return da.NewDOid(tree.MakeDOid(tree.DInt(col.Int64()[rowIdx]))) + default: + panic(fmt.Sprintf("Unsupported column type %s", ct.String())) + } +} diff --git a/pkg/sql/logictest/testdata/logic_test/vectorize b/pkg/sql/logictest/testdata/logic_test/vectorize index 59d77558d3d9..4fdba2fd09e4 100644 --- a/pkg/sql/logictest/testdata/logic_test/vectorize +++ b/pkg/sql/logictest/testdata/logic_test/vectorize @@ -479,3 +479,39 @@ query III SELECT 0, 1 + 2, 3 * 4 FROM a HAVING true ---- 0 3 12 + +# Testing some builtin functions. +statement ok +CREATE TABLE builtin_test (x STRING, y INT) + +statement ok +INSERT INTO builtin_test VALUES ('Hello', 3), ('There', 2) + +query T +SELECT substring(x, 1, y) FROM builtin_test +---- +Hel +Th + +query T +SELECT substring(x, 1, abs(y)) FROM builtin_test +---- +Hel +Th + +query I +SELECT abs(y) FROM builtin_test +---- +3 +2 + +statement ok +CREATE TABLE extract_test (x DATE) + +statement ok +INSERT INTO extract_test VALUES ('2017-01-01') + +query I +SELECT EXTRACT(YEAR FROM x) FROM extract_test +---- +2017 diff --git a/pkg/sql/sem/tree/expr.go b/pkg/sql/sem/tree/expr.go index 1a29fb04e2d1..b5a31f38d932 100644 --- a/pkg/sql/sem/tree/expr.go +++ b/pkg/sql/sem/tree/expr.go @@ -1324,6 +1324,11 @@ func (node *FuncExpr) IsDistSQLBlacklist() bool { return node.fnProps != nil && node.fnProps.DistsqlBlacklist } +// CanHandleNulls returns whether or not +func (node *FuncExpr) CanHandleNulls() bool { + return node.fnProps != nil && node.fnProps.NullableArgs +} + type funcType int // FuncExpr.Type