From 4539072e48942b77721b1e3044235c1013266dc4 Mon Sep 17 00:00:00 2001 From: rharding6373 Date: Thu, 23 Feb 2023 13:08:25 -0800 Subject: [PATCH] sql: fix record-returning udfs when used as data source When record-returning UDFs (both implicit and `RECORD` return types) are used as a data source in a query, the result should be treated as a row with separate columns instead of a tuple, which is how UDF output is normally treated. This PR closes this gap between CRDB and Postgres. For example: ``` CREATE FUNCTION f() RETURNS RECORD AS $$ SELECT 1, 2, 3; $$ LANGUAGE SQL SELECT f() f -------- (1,2,3) SELECT * FROM f() as foo(a int, b int, c int); a | b | c ---+---+--- 1 | 2 | 3 ``` The behavior is the same for implicit record return types. Epic: CRDB-19496 Fixes: #97059 Release note (bug fix): Fixes the behavior of UDFs to return its results as a row instead of a tuple when UDFs are called in a query as a data source. This is now compatible with postgres behavior. --- pkg/sql/logictest/testdata/logic_test/enums | 2 +- .../logictest/testdata/logic_test/udf_record | 254 +++++++++++++++++- .../logictest/testdata/logic_test/udf_setof | 35 ++- pkg/sql/opt/exec/execbuilder/scalar.go | 8 + pkg/sql/opt/norm/inline_funcs.go | 8 +- pkg/sql/opt/norm/testdata/rules/inline | 24 +- pkg/sql/opt/norm/testdata/rules/udf | 31 +-- pkg/sql/opt/ops/scalar.opt | 5 + pkg/sql/opt/optbuilder/builder.go | 3 + pkg/sql/opt/optbuilder/scalar.go | 99 +++++-- pkg/sql/opt/optbuilder/select.go | 12 +- pkg/sql/opt/optbuilder/srfs.go | 4 +- pkg/sql/opt/optbuilder/testdata/udf | 67 +++++ pkg/sql/opt/optbuilder/util.go | 2 +- pkg/sql/routine.go | 13 +- pkg/sql/rowexec/project_set.go | 8 + pkg/sql/sem/builtins/generator_builtins.go | 14 + pkg/sql/sem/eval/generators.go | 2 +- pkg/sql/sem/tree/expr.go | 4 + pkg/sql/sem/tree/routine.go | 10 + pkg/sql/sem/tree/type_check.go | 7 +- 21 files changed, 535 insertions(+), 77 deletions(-) diff --git a/pkg/sql/logictest/testdata/logic_test/enums b/pkg/sql/logictest/testdata/logic_test/enums index bcfe55675a0f..446aa2c02acf 100644 --- a/pkg/sql/logictest/testdata/logic_test/enums +++ b/pkg/sql/logictest/testdata/logic_test/enums @@ -1768,7 +1768,7 @@ $$; query TT SELECT "🙏"('😊'), "🙏"(NULL:::"Emoji 😉") ---- -NULL NULL +NULL mixed statement ok CREATE DATABASE "DB➕➕"; diff --git a/pkg/sql/logictest/testdata/logic_test/udf_record b/pkg/sql/logictest/testdata/logic_test/udf_record index 74c7b7efa966..2bc96e48f670 100644 --- a/pkg/sql/logictest/testdata/logic_test/udf_record +++ b/pkg/sql/logictest/testdata/logic_test/udf_record @@ -13,15 +13,13 @@ SELECT f_one(); ---- (1) -# TODO(#97059): The following query should require a column definition list. -statement ok +statement error pq: a column definition list is required for functions returning \"record\" SELECT * FROM f_one(); -# TODO(#97059): The following query should produce a row, not a tuple. -query T +query I SELECT * FROM f_one() AS foo (a INT); ---- -(1) +1 statement ok CREATE FUNCTION f_const() RETURNS RECORD AS @@ -35,7 +33,7 @@ SELECT f_const(); (1,2.0,"welcome roacher","2021-07-12 17:02:10+00") statement ok -CREATE FUNCTION f_arr() RETURNS RECORD AS +CREATE FUNCTION f_arr() RETURNS RECORD STABLE AS $$ SELECT ARRAY[1, 2, 3]; $$ LANGUAGE SQL; @@ -92,7 +90,7 @@ SELECT f_table(); (1,5) statement ok -CREATE FUNCTION f_multitable() RETURNS RECORD AS +CREATE FUNCTION f_multitable() RETURNS RECORD STABLE AS $$ SELECT t1.*, t2.* FROM t as t1 JOIN t as t2 on t1.a = t2.a ORDER BY t1.a LIMIT 1; $$ LANGUAGE SQL; @@ -130,3 +128,245 @@ query T SELECT f_table(); ---- (1,5) + +subtest datasource + +statement ok +CREATE FUNCTION f_tup() RETURNS RECORD AS +$$ + SELECT ROW(1, 2, 3); +$$ LANGUAGE SQL; + +query T +SELECT f_tup(); +---- +(1,2,3) + +statement error pq: a column definition list is required for functions returning \"record\" +SELECT * FROM f_tup(); + +query III colnames +SELECT * FROM f_tup() as foo(a int, b int, c int); +---- +a b c +1 2 3 + +statement ok +CREATE FUNCTION f_col() RETURNS RECORD AS +$$ + SELECT 1, 2, 3; +$$ LANGUAGE SQL; + +query T +SELECT f_col(); +---- +(1,2,3) + +query III colnames +SELECT * FROM f_col() as foo(a int, b int, c int); +---- +a b c +1 2 3 + +query T +SELECT * FROM (VALUES (f_col())) as foo; +---- +(1,2,3) + +statement ok +CREATE TABLE t_imp (a INT PRIMARY KEY, b INT); +INSERT INTO t_imp VALUES (1, 10), (2, 4), (3, 32); + +statement ok +CREATE FUNCTION f_imp() RETURNS t_imp AS +$$ + SELECT * FROM t_imp ORDER BY a LIMIT 1; +$$ LANGUAGE SQL; + +query II colnames +SELECT * FROM f_imp(); +---- +a b +1 10 + +statement ok +CREATE TYPE udt AS ENUM ('a', 'b', 'c'); + +statement ok +CREATE FUNCTION f_udt() RETURNS udt AS +$$ + SELECT 'a'::udt; +$$ LANGUAGE SQL; + +query T +SELECT * FROM f_udt(); +---- +a + +statement ok +CREATE FUNCTION f_udt_record() RETURNS RECORD AS +$$ + SELECT 'a'::udt; +$$ LANGUAGE SQL; + +query T +SELECT * FROM f_udt() AS foo(u udt); +---- +a + +query II rowsort +SELECT * FROM f_setof() AS foo(a INT, b INT); +---- +1 5 +2 6 +3 7 + +statement ok +CREATE FUNCTION f_setof_imp() RETURNS SETOF t_imp STABLE AS +$$ + SELECT * FROM t_imp; +$$ LANGUAGE SQL; + +query II rowsort +SELECT * FROM f_setof_imp() +---- +1 10 +2 4 +3 32 + +statement ok +CREATE FUNCTION f_strict() RETURNS RECORD STRICT AS +$$ + SELECT 1, 2, 3; +$$ LANGUAGE SQL; + +query III +SELECT * FROM f_strict() AS foo(a INT, b INT, c INT); +---- +1 2 3 + +statement ok +CREATE FUNCTION f_setof_strict() RETURNS SETOF RECORD STRICT STABLE AS +$$ + SELECT * FROM t_imp; +$$ LANGUAGE SQL; + +query II rowsort +SELECT * FROM f_setof_strict() AS foo(a INT, b INT); +---- +1 10 +2 4 +3 32 + +statement ok +CREATE FUNCTION f_strict_arg(IN a INT, IN b INT) RETURNS RECORD STRICT STABLE AS +$$ + SELECT a, b; +$$ LANGUAGE SQL; + +query II colnames +SELECT * FROM f_strict_arg(1,2) AS foo(a INT, b INT); +---- +a b +1 2 + +query II colnames +SELECT * FROM f_strict_arg(NULL, 2) AS foo(a INT, b INT); +---- +a b +NULL NULL + +query T +SELECT * FROM (SELECT f_strict_arg(1, 2)); +---- +(1,2) + +statement ok +CREATE FUNCTION f_strict_arg_setof(IN a INT, IN b INT) RETURNS SETOF RECORD STRICT AS +$$ + SELECT a, b FROM generate_series(1,3); +$$ LANGUAGE SQL; + +query II colnames +SELECT * FROM f_strict_arg_setof(1,2) AS foo(a INT, b INT); +---- +a b +1 2 +1 2 +1 2 + +# Strict SETOF UDF with NULL input returns 0 rows. +query II colnames +SELECT * FROM f_strict_arg_setof(NULL,2) AS foo(a INT, b INT); +---- +a b + +statement ok +CREATE TABLE n (a INT PRIMARY KEY, b INT); +INSERT INTO n VALUES (1, 5), (2, NULL); + +query III colnames +WITH narg AS (SELECT b AS input FROM n WHERE a = 2) SELECT * FROM narg, f_strict_arg(narg.input, 2) AS foo(a INT, b INT); +---- +input a b +NULL NULL NULL + +query III colnames +WITH narg AS (SELECT b AS input FROM n WHERE a = 2) SELECT * FROM narg, f_strict_arg_SETOF(narg.input, 2) AS foo(a INT, b INT); +---- +input a b + +statement ok +CREATE FUNCTION f_arg(IN a INT8, IN b INT8) RETURNS RECORD AS +$$ + SELECT a, b; +$$ LANGUAGE SQL; + +query II +SELECT * FROM f_arg(1,2) AS foo(a INT, b INT); +---- +1 2 + +# Test ambiguous function signatures with records +subtest ambiguity + +# setof +statement ok +CREATE FUNCTION f_amb_setof(a INT8, b INT8) RETURNS SETOF RECORD STRICT AS +$$ + SELECT a, b; +$$ LANGUAGE SQL; + +statement ok +CREATE FUNCTION f_amb_setof(a INT8, b STRING) RETURNS SETOF RECORD STRICT AS +$$ + SELECT a, b; +$$ LANGUAGE SQL; + +# TODO(harding): In postgres, calls to f_amb_setof should succeed and return 0 rows. +statement error pq: ambiguous call: f_amb_setof\(int, unknown\), candidates are +SELECT f_amb_setof(1, NULL); + +statement error pq: ambiguous call: f_amb_setof\(int, unknown\), candidates are +SELECT * FROM f_amb_setof(1, NULL) as foo(a int, b int); + +statement ok +CREATE FUNCTION f_amb(a INT, b INT) RETURNS RECORD STRICT AS +$$ + SELECT a, b; +$$ LANGUAGE SQL; + +statement ok +CREATE FUNCTION f_amb(a INT, b STRING) RETURNS RECORD STRICT AS +$$ + SELECT a, b; +$$ LANGUAGE SQL; + +# TODO(harding): In postgres, calls to f_amb should succeed and return NULL. +statement error pq: ambiguous call: f_amb\(int, unknown\), candidates are +SELECT f_amb(1, NULL); + +# TODO(harding): In postgres, calls to f_amb as a data source should succeed +# and return NULL NULL. +statement error pq: ambiguous call: f_amb\(int, unknown\), candidates are +SELECT * FROM f_amb(1, NULL) as foo(a int, b int); diff --git a/pkg/sql/logictest/testdata/logic_test/udf_setof b/pkg/sql/logictest/testdata/logic_test/udf_setof index 85a67877da44..5140b407f7ff 100644 --- a/pkg/sql/logictest/testdata/logic_test/udf_setof +++ b/pkg/sql/logictest/testdata/logic_test/udf_setof @@ -166,25 +166,36 @@ CREATE FUNCTION all_ab() RETURNS SETOF ab LANGUAGE SQL AS $$ SELECT a, b FROM ab $$ -# TODO(mgartner): This should return separate columns, not a tuple. See #97059. -query T rowsort +query II rowsort SELECT * FROM all_ab() ---- -(1,10) -(2,20) -(3,30) -(4,40) +1 10 +2 20 +3 30 +4 40 statement ok CREATE FUNCTION all_ab_tuple() RETURNS SETOF ab LANGUAGE SQL AS $$ SELECT (a, b) FROM ab $$ -# TODO(mgartner): This should return separate columns, not a tuple. See #97059. -query T rowsort +query II rowsort +SELECT * FROM all_ab_tuple() +---- +1 10 +2 20 +3 30 +4 40 + +statement ok +CREATE FUNCTION all_ab_record() RETURNS SETOF RECORD LANGUAGE SQL AS $$ + SELECT a, b FROM ab +$$ + +query II rowsort SELECT * FROM all_ab_tuple() ---- -(1,10) -(2,20) -(3,30) -(4,40) +1 10 +2 20 +3 30 +4 40 diff --git a/pkg/sql/opt/exec/execbuilder/scalar.go b/pkg/sql/opt/exec/execbuilder/scalar.go index 4847a882867e..2de5e2a82f7b 100644 --- a/pkg/sql/opt/exec/execbuilder/scalar.go +++ b/pkg/sql/opt/exec/execbuilder/scalar.go @@ -695,6 +695,8 @@ func (b *Builder) buildExistsSubquery( types.Bool, false, /* enableStepping */ true, /* calledOnNullInput */ + false, /* multiColOutput */ + false, /* generator */ ), tree.DBoolFalse, }, types.Bool), nil @@ -803,6 +805,8 @@ func (b *Builder) buildSubquery( subquery.Typ, false, /* enableStepping */ true, /* calledOnNullInput */ + false, /* multiColOutput */ + false, /* generator */ ), nil } @@ -855,6 +859,8 @@ func (b *Builder) buildSubquery( subquery.Typ, false, /* enableStepping */ true, /* calledOnNullInput */ + false, /* multiColOutput */ + false, /* generator */ ), nil } @@ -937,6 +943,8 @@ func (b *Builder) buildUDF(ctx *buildScalarCtx, scalar opt.ScalarExpr) (tree.Typ udf.Typ, enableStepping, udf.CalledOnNullInput, + udf.MultiColDataSource, + udf.SetReturning, ), nil } diff --git a/pkg/sql/opt/norm/inline_funcs.go b/pkg/sql/opt/norm/inline_funcs.go index 8cac41f2d4f0..1c83b24ec723 100644 --- a/pkg/sql/opt/norm/inline_funcs.go +++ b/pkg/sql/opt/norm/inline_funcs.go @@ -411,6 +411,7 @@ func (c *CustomFuncs) InlineConstVar(f memo.FiltersExpr) memo.FiltersExpr { // 2. It has a single statement. // 3. It is not a set-returning function. // 4. Its arguments are only Variable or Const expressions. +// 5. It is not a record-returning function. // // UDFs with mutations (INSERT, UPDATE, UPSERT, DELETE) cannot be inlined, but // we do not need an explicit check for this because immutable UDFs cannot @@ -434,8 +435,13 @@ func (c *CustomFuncs) InlineConstVar(f memo.FiltersExpr) memo.FiltersExpr { // referenced in the UDF body more than once. Any argument that contains a // subquery and is referenced multiple times cannot be inlined, unless new // columns IDs for the entire subquery are generated (see #100915). +// +// TODO(harding): We could potentially loosen (5), since only record-returning +// UDFs used as data sources return multiple columns. Other UDFs returning a +// single column can be inlined since subqueries can only return a single +// column. func (c *CustomFuncs) IsInlinableUDF(args memo.ScalarListExpr, udfp *memo.UDFPrivate) bool { - if udfp.Volatility == volatility.Volatile || len(udfp.Body) != 1 || udfp.SetReturning { + if udfp.Volatility == volatility.Volatile || len(udfp.Body) != 1 || udfp.SetReturning || udfp.MultiColDataSource { return false } if !args.IsConstantsAndPlaceholdersAndVariables() { diff --git a/pkg/sql/opt/norm/testdata/rules/inline b/pkg/sql/opt/norm/testdata/rules/inline index 5f3c698afa1f..9d0b51692b9f 100644 --- a/pkg/sql/opt/norm/testdata/rules/inline +++ b/pkg/sql/opt/norm/testdata/rules/inline @@ -1327,7 +1327,7 @@ project └── plus [type=int] ├── variable: x:17 [type=int] └── variable: y:18 [type=int] - ] [17 18] int false immutable true} [as=add:20, outer=(2), immutable, correlated-subquery, udf] + ] [17 18] int false immutable true false} [as=add:20, outer=(2), immutable, correlated-subquery, udf] ├── 10 └── subquery └── values @@ -1579,7 +1579,7 @@ project └── eq [type=bool] ├── variable: y:10 [type=int] └── const: 2 [type=int] - ] [9 10] bool false immutable true} [as=x_0_1_y_2:12, outer=(2), immutable, correlated-subquery, udf] + ] [9 10] bool false immutable true false} [as=x_0_1_y_2:12, outer=(2), immutable, correlated-subquery, udf] ├── subquery │ └── max1-row │ ├── columns: column1:8!null @@ -1600,3 +1600,23 @@ project │ └── filters │ └── column1:8 > i:2 [outer=(2,8), constraints=(/2: (/NULL - ]; /8: (/NULL - ])] └── 10 + +exec-ddl +CREATE FUNCTION x_y(x INT, y INT) RETURNS RECORD IMMUTABLE LANGUAGE SQL AS $$ + SELECT x, y; +$$ +---- + +# Do not inline a UDF that outputs multiple columns. +norm expect-not=InlineUDF +SELECT * FROM x_y(1, 2) AS f(x INT, y INT); +---- +project-set + ├── columns: x:5 y:6 + ├── immutable + ├── values + │ ├── cardinality: [1 - 1] + │ ├── key: () + │ └── () + └── zip + └── x_y(1, 2) [immutable, udf] diff --git a/pkg/sql/opt/norm/testdata/rules/udf b/pkg/sql/opt/norm/testdata/rules/udf index 56e432053bfe..cc729d7952b5 100644 --- a/pkg/sql/opt/norm/testdata/rules/udf +++ b/pkg/sql/opt/norm/testdata/rules/udf @@ -24,7 +24,7 @@ values └── const: 1 exec-ddl -CREATE FUNCTION strict_fn(i INT, t TEXT, b BOOl) RETURNS INT STRICT LANGUAGE SQL AS 'SELECT i' +CREATE FUNCTION strict_fn(i INT, t TEXT, b BOOl) RETURNS INT STRICT IMMUTABLE LANGUAGE SQL AS 'SELECT i' ---- # Strict UDFs should be folded to NULL in the presence of a constant NULL @@ -33,10 +33,10 @@ norm format=show-scalars SELECT strict_fn(1, 'foo', NULL) ---- values - ├── columns: strict_fn:1 + ├── columns: strict_fn:5 ├── cardinality: [1 - 1] ├── key: () - ├── fd: ()-->(1) + ├── fd: ()-->(5) └── tuple └── null @@ -48,23 +48,14 @@ SELECT strict_fn(1, 'foo', true) values ├── columns: strict_fn:5 ├── cardinality: [1 - 1] - ├── volatile ├── key: () ├── fd: ()-->(5) └── tuple - └── udf: strict_fn - ├── strict - ├── params: i:1 t:2 b:3 - ├── args - │ ├── const: 1 - │ ├── const: 'foo' - │ └── true - └── body - └── values - ├── columns: i:4 - ├── outer: (1) - ├── cardinality: [1 - 1] - ├── key: () - ├── fd: ()-->(4) - └── tuple - └── variable: i:1 + └── subquery + └── values + ├── columns: i:4!null + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(4) + └── tuple + └── const: 1 diff --git a/pkg/sql/opt/ops/scalar.opt b/pkg/sql/opt/ops/scalar.opt index eb39882cee14..827a68aec1bf 100644 --- a/pkg/sql/opt/ops/scalar.opt +++ b/pkg/sql/opt/ops/scalar.opt @@ -1282,6 +1282,11 @@ define UDFPrivate { # operators. Non-scalar UDFs are always in a project-set operator, while # scalar UDFs can be if used as a data source (e.g. SELECT * FROM udf()). CalledOnNullInput bool + + # MultiColDataSource is true if the function may return multiple columns. This + # is only the case if the UDF returns a RECORD type and is used as a data + # source. + MultiColDataSource bool } # KVOptions is a set of KVOptionItems that specify arbitrary keys and values diff --git a/pkg/sql/opt/optbuilder/builder.go b/pkg/sql/opt/optbuilder/builder.go index 61f5ec53978f..835085027b8e 100644 --- a/pkg/sql/opt/optbuilder/builder.go +++ b/pkg/sql/opt/optbuilder/builder.go @@ -137,6 +137,9 @@ type Builder struct { // within. insideUDF bool + // insideDataSource is true when we are processing a data source. + insideDataSource bool + // If set, we are collecting view dependencies in schemaDeps. This can only // happen inside view/function definitions. // diff --git a/pkg/sql/opt/optbuilder/scalar.go b/pkg/sql/opt/optbuilder/scalar.go index 87a1fbe9fe20..665bd81fc96f 100644 --- a/pkg/sql/opt/optbuilder/scalar.go +++ b/pkg/sql/opt/optbuilder/scalar.go @@ -695,6 +695,7 @@ func (b *Builder) buildUDF( // We'll need to track the depth of the UDFs we are building expressions // within. b.insideUDF = true + isMultiColDataSource := false for i := range stmts { stmtScope := b.buildStmt(stmts[i].AST, nil /* desiredTypes */, bodyScope) expr := stmtScope.expr @@ -715,34 +716,69 @@ func (b *Builder) buildUDF( physProps.Ordering = props.OrderingChoice{} } - // Replace the tuple contents of RECORD return types from Any to the - // result columns of the last statement. If the result column is a tuple, - // then use its tuple contents for the return instead. + // If returning a RECORD type, the function return type needs to be + // modified because when we first parse the CREATE FUNCTION, the RECORD + // is represented as a tuple with any types and execution requires the + // types to be concrete in order to decode them correctly. We can + // determine the types from the result columns or tuple of the last + // statement. isSingleTupleResult := len(stmtScope.cols) == 1 && stmtScope.cols[0].typ.Family() == types.TupleFamily - if types.IsRecordType(f.ResolvedType()) { + if types.IsRecordType(rtyp) { if isSingleTupleResult { - f.ResolvedType().InternalType.TupleContents = stmtScope.cols[0].typ.TupleContents() + // When the final statement returns a single tuple, we can use the + // tuple's types as the function return type. + rtyp = stmtScope.cols[0].typ } else { + // Get the types from the individual columns of the last statement. tc := make([]*types.T, len(stmtScope.cols)) + tl := make([]string, len(stmtScope.cols)) for i, col := range stmtScope.cols { tc[i] = col.typ + tl[i] = col.name.MetadataName() } - f.ResolvedType().InternalType.TupleContents = tc + rtyp = types.MakeLabeledTuple(tc, tl) } + f.SetTypeAnnotation(rtyp) } - // If there are multiple output columns or the output type is a record and - // the output column is not a tuple, we must combine them into a tuple - - // only a single column can be returned from a UDF. + // Only a single column can be returned from a UDF, unless it is used as a + // data source. Data sources may output multiple columns, and if the + // statement body produces a tuple it needs to be expanded into columns. + // When not used as a data source, combine statements producing multiple + // columns into a tuple. If the last statement is already returning a + // tuple and the function has a record return type, then we do not need to + // wrap the output in another tuple. cols := physProps.Presentation - if len(cols) > 1 || (types.IsRecordType(f.ResolvedType()) && !isSingleTupleResult) { + if b.insideDataSource && rtyp.Family() == types.TupleFamily { + // When the UDF is used as a data source and expects to output a tuple + // type, its output needs to be a row of columns instead of the usual + // tuple. If the last statement output a tuple, we need to expand the + // tuple into individual columns. + isMultiColDataSource = true + if isSingleTupleResult { + stmtScope = bodyScope.push() + elems := make([]scopeColumn, len(rtyp.TupleContents())) + for i := range rtyp.TupleContents() { + e := b.factory.ConstructColumnAccess(b.factory.ConstructVariable(cols[0].ID), memo.TupleOrdinal(i)) + col := b.synthesizeColumn(stmtScope, scopeColName(""), rtyp.TupleContents()[i], nil, e) + elems[i] = *col + } + expr = b.constructProject(expr, elems) + physProps = stmtScope.makePhysicalProps() + } + } else if len(cols) > 1 || (types.IsRecordType(rtyp) && !isSingleTupleResult) { + // Only a single column can be returned from a UDF, unless it is used as a + // data source (see comment above). If there are multiple columns, combine + // them into a tuple. If the last statement is already returning a tuple + // and the function has a record return type, then do not wrap the + // output in another tuple. elems := make(memo.ScalarListExpr, len(cols)) for i := range cols { elems[i] = b.factory.ConstructVariable(cols[i].ID) } - tup := b.factory.ConstructTuple(elems, f.ResolvedType()) + tup := b.factory.ConstructTuple(elems, rtyp) stmtScope = bodyScope.push() - col := b.synthesizeColumn(stmtScope, scopeColName(""), f.ResolvedType(), nil /* expr */, tup) + col := b.synthesizeColumn(stmtScope, scopeColName(""), rtyp, nil /* expr */, tup) expr = b.constructProject(expr, []scopeColumn{*col}) physProps = stmtScope.makePhysicalProps() } @@ -755,17 +791,17 @@ func (b *Builder) buildUDF( // column is already a tuple. returnCol := physProps.Presentation[0].ID returnColMeta := b.factory.Metadata().ColumnMeta(returnCol) - if !types.IsRecordType(f.ResolvedType()) && !returnColMeta.Type.Identical(f.ResolvedType()) { - if !cast.ValidCast(returnColMeta.Type, f.ResolvedType(), cast.ContextAssignment) { + if !types.IsRecordType(rtyp) && !isMultiColDataSource && !returnColMeta.Type.Identical(rtyp) { + if !cast.ValidCast(returnColMeta.Type, rtyp, cast.ContextAssignment) { panic(sqlerrors.NewInvalidAssignmentCastError( - returnColMeta.Type, f.ResolvedType(), returnColMeta.Alias)) + returnColMeta.Type, rtyp, returnColMeta.Alias)) } cast := b.factory.ConstructAssignmentCast( b.factory.ConstructVariable(physProps.Presentation[0].ID), - f.ResolvedType(), + rtyp, ) stmtScope = bodyScope.push() - col := b.synthesizeColumn(stmtScope, scopeColName(""), f.ResolvedType(), nil /* expr */, cast) + col := b.synthesizeColumn(stmtScope, scopeColName(""), rtyp, nil /* expr */, cast) expr = b.constructProject(expr, []scopeColumn{*col}) physProps = stmtScope.makePhysicalProps() } @@ -781,20 +817,29 @@ func (b *Builder) buildUDF( out = b.factory.ConstructUDF( args, &memo.UDFPrivate{ - Name: def.Name, - Params: params, - Body: rels, - Typ: f.ResolvedType(), - SetReturning: isSetReturning, - Volatility: o.Volatility, - CalledOnNullInput: o.CalledOnNullInput, + Name: def.Name, + Params: params, + Body: rels, + Typ: f.ResolvedType(), + SetReturning: isSetReturning, + Volatility: o.Volatility, + CalledOnNullInput: o.CalledOnNullInput, + MultiColDataSource: isMultiColDataSource, }, ) - // Synthesize an output column for set-returning UDFs. - if isSetReturning && outCol == nil { - outCol = b.synthesizeColumn(outScope, scopeColName(""), f.ResolvedType(), nil /* expr */, out) + // Synthesize an output columns if necessary. + if outCol == nil { + if isMultiColDataSource { + // TODO(harding): Add the returns record property during create function. + f.ResolvedOverload().ReturnsRecordType = types.IsRecordType(rtyp) + return b.finishBuildGeneratorFunction(f, f.ResolvedOverload(), out, inScope, outScope, outCol) + } + if outScope != nil { + outCol = b.synthesizeColumn(outScope, scopeColName(""), f.ResolvedType(), nil /* expr */, out) + } } + return b.finishBuildScalar(f, out, inScope, outScope, outCol) } diff --git a/pkg/sql/opt/optbuilder/select.go b/pkg/sql/opt/optbuilder/select.go index 1a3806d6fcc7..124454b34396 100644 --- a/pkg/sql/opt/optbuilder/select.go +++ b/pkg/sql/opt/optbuilder/select.go @@ -50,10 +50,12 @@ import ( func (b *Builder) buildDataSource( texpr tree.TableExpr, indexFlags *tree.IndexFlags, locking lockingSpec, inScope *scope, ) (outScope *scope) { - defer func(prevAtRoot bool) { + defer func(prevAtRoot bool, prevInsideDataSource bool) { inScope.atRoot = prevAtRoot - }(inScope.atRoot) + b.insideDataSource = prevInsideDataSource + }(inScope.atRoot, b.insideDataSource) inScope.atRoot = false + b.insideDataSource = true // NB: The case statements are sorted lexicographically. switch source := (texpr).(type) { case *tree.AliasedTableExpr: @@ -943,6 +945,12 @@ func (b *Builder) buildWithOrdinality(inScope *scope) (outScope *scope) { func (b *Builder) buildSelectStmt( stmt tree.SelectStatement, locking lockingSpec, desiredTypes []*types.T, inScope *scope, ) (outScope *scope) { + // The top level in a select statement is not considered a data source. + oldInsideDataSource := b.insideDataSource + defer func() { + b.insideDataSource = oldInsideDataSource + }() + b.insideDataSource = false // NB: The case statements are sorted lexicographically. switch stmt := stmt.(type) { case *tree.LiteralValuesClause: diff --git a/pkg/sql/opt/optbuilder/srfs.go b/pkg/sql/opt/optbuilder/srfs.go index a93cdc802304..039e79ab5959 100644 --- a/pkg/sql/opt/optbuilder/srfs.go +++ b/pkg/sql/opt/optbuilder/srfs.go @@ -111,7 +111,9 @@ func (b *Builder) buildZip(exprs tree.Exprs, inScope *scope) (outScope *scope) { var outCol *scopeColumn startCols := len(outScope.cols) - if def == nil || funcExpr.ResolvedOverload().Class != tree.GeneratorClass || b.shouldCreateDefaultColumn(texpr) { + isRecordReturningUDF := def != nil && funcExpr.ResolvedOverload().IsUDF && texpr.ResolvedType().Family() == types.TupleFamily && b.insideDataSource + + if def == nil || (funcExpr.ResolvedOverload().Class != tree.GeneratorClass && !isRecordReturningUDF) || (b.shouldCreateDefaultColumn(texpr) && !isRecordReturningUDF) { if def != nil && len(funcExpr.ResolvedOverload().ReturnLabels) > 0 { // Override the computed alias with the one defined in the ReturnLabels. This // satisfies a Postgres quirk where some json functions use different labels diff --git a/pkg/sql/opt/optbuilder/testdata/udf b/pkg/sql/opt/optbuilder/testdata/udf index c2121596d274..6e8ca2d3a858 100644 --- a/pkg/sql/opt/optbuilder/testdata/udf +++ b/pkg/sql/opt/optbuilder/testdata/udf @@ -1166,6 +1166,73 @@ project └── tuple └── const: 1 +exec-ddl +CREATE FUNCTION strict_fn_record(i INT, t TEXT, b BOOl) RETURNS RECORD STRICT LANGUAGE SQL AS 'SELECT i, t, b' +---- + +build format=show-scalars +SELECT * FROM strict_fn_record(1, 'foo', false) as bar(i INT, t TEXT, b BOOl) +---- +project-set + ├── columns: i:7 t:8 b:9 + ├── values + │ └── tuple + └── zip + └── udf: strict_fn_record + ├── strict + ├── params: i:1 t:2 b:3 + ├── args + │ ├── const: 1 + │ ├── const: 'foo' + │ └── false + └── body + └── limit + ├── columns: i:4 t:5 b:6 + ├── project + │ ├── columns: i:4 t:5 b:6 + │ ├── values + │ │ └── tuple + │ └── projections + │ ├── variable: i:1 [as=i:4] + │ ├── variable: t:2 [as=t:5] + │ └── variable: b:3 [as=b:6] + └── const: 1 + +build format=show-scalars +SELECT * FROM (SELECT strict_fn_record(1, 'foo', false)) as bar; +---- +project + ├── columns: strict_fn_record:8 + ├── values + │ └── tuple + └── projections + └── udf: strict_fn_record [as=strict_fn_record:8] + ├── strict + ├── params: i:1 t:2 b:3 + ├── args + │ ├── const: 1 + │ ├── const: 'foo' + │ └── false + └── body + └── project + ├── columns: column7:7 + ├── limit + │ ├── columns: i:4 t:5 b:6 + │ ├── project + │ │ ├── columns: i:4 t:5 b:6 + │ │ ├── values + │ │ │ └── tuple + │ │ └── projections + │ │ ├── variable: i:1 [as=i:4] + │ │ ├── variable: t:2 [as=t:5] + │ │ └── variable: b:3 [as=b:6] + │ └── const: 1 + └── projections + └── tuple [as=column7:7] + ├── variable: i:4 + ├── variable: t:5 + └── variable: b:6 + # -------------------------------------------------- # UDFs with * expressions. diff --git a/pkg/sql/opt/optbuilder/util.go b/pkg/sql/opt/optbuilder/util.go index b9755b7a66e9..4862e650ba6c 100644 --- a/pkg/sql/opt/optbuilder/util.go +++ b/pkg/sql/opt/optbuilder/util.go @@ -186,7 +186,7 @@ func (b *Builder) expandStarAndResolveType( // example, the query `SELECT (x + 1) AS "x_incr" FROM t` has a projection with // a synthesized column "x_incr". // -// scope The scope is passed in so it can can be updated with the newly bound +// scope The scope is passed in so it can be updated with the newly bound // // variable. // diff --git a/pkg/sql/routine.go b/pkg/sql/routine.go index e8aad96d34a1..11a93dd5deb9 100644 --- a/pkg/sql/routine.go +++ b/pkg/sql/routine.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/types" "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/errors" ) // EvalRoutineExpr returns the result of evaluating the routine. It calls the @@ -121,7 +122,17 @@ func (g *routineGenerator) ResolvedType() *types.T { // is cache-able (i.e., there are no arguments to the routine and stepping is // disabled). func (g *routineGenerator) Start(ctx context.Context, txn *kv.Txn) (err error) { - retTypes := []*types.T{g.expr.ResolvedType()} + rt := g.expr.ResolvedType() + var retTypes []*types.T + if g.expr.MultiColOutput { + // A routine with multiple output column should have its types in a tuple. + if rt.Family() != types.TupleFamily { + return errors.AssertionFailedf("routine expected to return multiple columns") + } + retTypes = rt.TupleContents() + } else { + retTypes = []*types.T{g.expr.ResolvedType()} + } g.rch.Init(ctx, retTypes, g.p.ExtendedEvalContext(), "routine" /* opName */) // Configure stepping for volatile routines so that mutations made by the diff --git a/pkg/sql/rowexec/project_set.go b/pkg/sql/rowexec/project_set.go index 2ab9619a342c..969f1b0ce6ce 100644 --- a/pkg/sql/rowexec/project_set.go +++ b/pkg/sql/rowexec/project_set.go @@ -181,6 +181,14 @@ func (ps *projectSetProcessor) nextInputRow() ( gen, err = eval.GetFuncGenerator(ps.Ctx(), ps.EvalCtx, t) case *tree.RoutineExpr: gen, err = eval.GetRoutineGenerator(ps.Ctx(), ps.EvalCtx, t) + if err == nil && gen == nil && t.MultiColOutput && !t.Generator { + // If the routine will return multiple output columns, we expect the + // routine to return nulls for each column type instead of no rows, so + // we can't use the empty generator. Set-returning routines + // (i.e., Generators), have different behavior and are handled + // separately. + gen, err = builtins.NullGenerator(t.ResolvedType()) + } default: return nil, nil, errors.AssertionFailedf("unexpected expression in project-set: %T", fn) } diff --git a/pkg/sql/sem/builtins/generator_builtins.go b/pkg/sql/sem/builtins/generator_builtins.go index 9b52997b698a..89a47b3fceac 100644 --- a/pkg/sql/sem/builtins/generator_builtins.go +++ b/pkg/sql/sem/builtins/generator_builtins.go @@ -1254,6 +1254,20 @@ func EmptyGenerator() eval.ValueGenerator { return &arrayValueGenerator{array: tree.NewDArray(types.Any)} } +// NullGenerator returns a new generator that returns a single row of nulls +// corresponding to the types stored in the tuple typ. +func NullGenerator(typ *types.T) (eval.ValueGenerator, error) { + if typ.Family() != types.TupleFamily { + return nil, errors.AssertionFailedf("generator expected to return multiple columns") + } + arrs := make([]*tree.DArray, len(typ.TupleContents())) + for i := range typ.TupleContents() { + arrs[i] = &tree.DArray{} + arrs[i].Array = tree.Datums{tree.DNull} + } + return &multipleArrayValueGenerator{arrays: arrs}, nil +} + // unaryValueGenerator supports the execution of crdb_internal.unary_table(). type unaryValueGenerator struct { done bool diff --git a/pkg/sql/sem/eval/generators.go b/pkg/sql/sem/eval/generators.go index 7a03a5812d08..8d12ce397a55 100644 --- a/pkg/sql/sem/eval/generators.go +++ b/pkg/sql/sem/eval/generators.go @@ -53,7 +53,7 @@ func GetRoutineGenerator( if args[i] == tree.DNull { // Strict routines (CalledOnNullInput=false) should not be // invoked if any of their arguments are NULL. Return nil so - // that the EmptyGenerator is used. + // that the EmptyGenerator or NullGenerator is used. return nil, nil } } diff --git a/pkg/sql/sem/tree/expr.go b/pkg/sql/sem/tree/expr.go index c8d56c0cfec6..5c17e3d4b58c 100644 --- a/pkg/sql/sem/tree/expr.go +++ b/pkg/sql/sem/tree/expr.go @@ -1327,6 +1327,10 @@ func (node *FuncExpr) IsVectorizeStreaming() bool { return node.fnProps != nil && node.fnProps.VectorizeStreaming } +func (node *FuncExpr) SetTypeAnnotation(t *types.T) { + node.typ = t +} + type funcType int // FuncExpr.Type diff --git a/pkg/sql/sem/tree/routine.go b/pkg/sql/sem/tree/routine.go index ec44f45fa4ba..384883fb510c 100644 --- a/pkg/sql/sem/tree/routine.go +++ b/pkg/sql/sem/tree/routine.go @@ -105,6 +105,12 @@ type RoutineExpr struct { // Strict non-set-returning routines are not invoked when their arguments // are NULL because optbuilder wraps them in a CASE expressions. CalledOnNullInput bool + + // MultiColOutput is true if the function may return multiple columns. + MultiColOutput bool + + // Generator is true if the function may output a set of rows. + Generator bool } // NewTypedRoutineExpr returns a new RoutineExpr that is well-typed. @@ -115,6 +121,8 @@ func NewTypedRoutineExpr( typ *types.T, enableStepping bool, calledOnNullInput bool, + multiColOutput bool, + generator bool, ) *RoutineExpr { return &RoutineExpr{ Args: args, @@ -123,6 +131,8 @@ func NewTypedRoutineExpr( EnableStepping: enableStepping, Name: name, CalledOnNullInput: calledOnNullInput, + MultiColOutput: multiColOutput, + Generator: generator, } } diff --git a/pkg/sql/sem/tree/type_check.go b/pkg/sql/sem/tree/type_check.go index 85f4482a8051..fe99f42905b2 100644 --- a/pkg/sql/sem/tree/type_check.go +++ b/pkg/sql/sem/tree/type_check.go @@ -1082,6 +1082,7 @@ func (expr *FuncExpr) TypeCheck( return nil, pgerror.Wrapf(err, pgcode.InvalidParameterValue, "%s()", def.Name) } + var hasUDFOverload bool var calledOnNullInputFns, notCalledOnNullInputFns intsets.Fast for _, idx := range s.overloadIdxs { if def.Overloads[idx].CalledOnNullInput { @@ -1089,6 +1090,10 @@ func (expr *FuncExpr) TypeCheck( } else { notCalledOnNullInputFns.Add(int(idx)) } + // TODO(harding): Check if this is a record-returning UDF instead. + if def.Overloads[idx].IsUDF { + hasUDFOverload = true + } } // If the function is an aggregate that does not accept null arguments and we @@ -1135,7 +1140,7 @@ func (expr *FuncExpr) TypeCheck( // NULL arguments, the function isn't a generator or aggregate builtin, and // NULL is given as an argument. if len(s.overloadIdxs) > 0 && calledOnNullInputFns.Len() == 0 && funcCls != GeneratorClass && - funcCls != AggregateClass { + funcCls != AggregateClass && !hasUDFOverload { for _, expr := range s.typedExprs { if expr.ResolvedType().Family() == types.UnknownFamily { return DNull, nil