From a8a3b75ba509a63192e7fdd9e65621f210baf326 Mon Sep 17 00:00:00 2001 From: Marcus Gartner Date: Fri, 17 Mar 2023 14:59:04 -0400 Subject: [PATCH] sql: support array-flatten subqueries within UDFs Array-flatten subqueries (e.g., `ARRAY(SELECT a FROM t)`) are now supported within UDFs. They are now converted to a normal subquery with a ScalarGroupBy if they exist within a UDF, even if they are uncorrelated. This allows them to be executed without any changes to the execbuilder or the evaluation logic of `tree.Routine`. Fixes #98738 Release note: None --- pkg/sql/logictest/testdata/logic_test/udf | 28 ++++++++++-- pkg/sql/opt/exec/execbuilder/scalar.go | 14 ++++-- .../opt/exec/execbuilder/testdata/subquery | 7 +++ pkg/sql/opt/exec/execbuilder/testdata/udf | 24 ++++++++++ pkg/sql/opt/norm/rules/scalar.opt | 18 ++++---- pkg/sql/opt/norm/scalar_funcs.go | 6 +++ pkg/sql/opt/norm/testdata/rules/scalar | 44 +++++++++++++++++++ pkg/sql/opt/ops/scalar.opt | 4 ++ pkg/sql/opt/optbuilder/scalar.go | 8 +--- pkg/sql/opt/xform/testdata/rules/cycle | 2 +- 10 files changed, 129 insertions(+), 26 deletions(-) diff --git a/pkg/sql/logictest/testdata/logic_test/udf b/pkg/sql/logictest/testdata/logic_test/udf index 449c1479f660..0fb6cf4ff742 100644 --- a/pkg/sql/logictest/testdata/logic_test/udf +++ b/pkg/sql/logictest/testdata/logic_test/udf @@ -2945,6 +2945,26 @@ SELECT all_fn(1), all_fn(2), all_fn(NULL::INT) NULL false NULL +subtest array_flatten + +statement ok +CREATE FUNCTION arr(x INT) RETURNS INT[] LANGUAGE SQL AS $$ + SELECT ARRAY(VALUES (1), (2), (x)); +$$ + +query T +SELECT arr(10) +---- +{1,2,10} + +query T +SELECT arr(i) FROM generate_series(1, 3) g(i) +---- +{1,2,1} +{1,2,2} +{1,2,3} + + subtest variadic # Variadic UDFS are not currently supported. 
@@ -3054,10 +3074,10 @@ SELECT oid, proname, pronamespace, proowner, prolang, proleakproof, proisstrict, FROM pg_catalog.pg_proc WHERE proname IN ('f_93314', 'f_93314_alias', 'f_93314_comp', 'f_93314_comp_t') ORDER BY oid; ---- -100271 f_93314 105 1546506610 14 false false false v 0 100270 · {} NULL SELECT i, e FROM test.public.t_93314 ORDER BY i LIMIT 1; -100273 f_93314_alias 105 1546506610 14 false false false v 0 100272 · {} NULL SELECT i, e FROM test.public.t_93314_alias ORDER BY i LIMIT 1; -100277 f_93314_comp 105 1546506610 14 false false false v 0 100274 · {} NULL SELECT (1, 2); -100278 f_93314_comp_t 105 1546506610 14 false false false v 0 100276 · {} NULL SELECT a, c FROM test.public.t_93314_comp LIMIT 1; +100272 f_93314 105 1546506610 14 false false false v 0 100271 · {} NULL SELECT i, e FROM test.public.t_93314 ORDER BY i LIMIT 1; +100274 f_93314_alias 105 1546506610 14 false false false v 0 100273 · {} NULL SELECT i, e FROM test.public.t_93314_alias ORDER BY i LIMIT 1; +100278 f_93314_comp 105 1546506610 14 false false false v 0 100275 · {} NULL SELECT (1, 2); +100279 f_93314_comp_t 105 1546506610 14 false false false v 0 100277 · {} NULL SELECT a, c FROM test.public.t_93314_comp LIMIT 1; # Regression test for #95240. Strict UDFs that are inlined should result in NULL # when presented with NULL arguments. diff --git a/pkg/sql/opt/exec/execbuilder/scalar.go b/pkg/sql/opt/exec/execbuilder/scalar.go index dab23683ba2e..e7a04d612575 100644 --- a/pkg/sql/opt/exec/execbuilder/scalar.go +++ b/pkg/sql/opt/exec/execbuilder/scalar.go @@ -495,6 +495,16 @@ func (b *Builder) buildArrayFlatten( panic(errors.AssertionFailedf("input to ArrayFlatten should be uncorrelated")) } + if b.planLazySubqueries { + // The NormalizeArrayFlattenToAgg rule should have converted an + // ArrayFlatten within a UDF into an aggregation. + // We don't yet convert an ArrayFlatten within a correlated subquery + // into an aggregation, so we return a decorrelation error. 
+ // TODO(mgartner): Build an ArrayFlatten within a correlated subquery as + // a Routine, or apply NormalizeArrayFlattenToAgg to all ArrayFlattens. + return nil, b.decorrelationError() + } + root, err := b.buildRelational(af.Input) if err != nil { return nil, err @@ -762,10 +772,6 @@ func (b *Builder) buildSubquery( // because we don't need to optimize the subquery input any further. // It's already been fully optimized because it is uncorrelated and has // no outer columns. - // - // TODO(mgartner): Uncorrelated subqueries only need to be evaluated - // once. We should cache their result to avoid all this overhead for - // every invocation. inputRowCount := int64(input.Relational().Statistics().RowCountIfAvailable()) withExprs := make([]builtWithExpr, len(b.withExprs)) copy(withExprs, b.withExprs) diff --git a/pkg/sql/opt/exec/execbuilder/testdata/subquery b/pkg/sql/opt/exec/execbuilder/testdata/subquery index 9673591b71cb..6acffc7c9805 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/subquery +++ b/pkg/sql/opt/exec/execbuilder/testdata/subquery @@ -562,3 +562,10 @@ vectorized: true estimated row count: 1,000 (missing stats) table: corr@corr_pkey spans: FULL SCAN + +# Case where a correlated subquery contains an uncorrelated array-flatten +# subquery. +statement error could not decorrelate subquery +SELECT + CASE WHEN k < 5 THEN (SELECT array(SELECT 1) FROM corr tmp WHERE k*10 = corr.k) END +FROM corr diff --git a/pkg/sql/opt/exec/execbuilder/testdata/udf b/pkg/sql/opt/exec/execbuilder/testdata/udf index e33456450b96..71ac848d8b33 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/udf +++ b/pkg/sql/opt/exec/execbuilder/testdata/udf @@ -235,6 +235,30 @@ SELECT sub_fn4() FROM generate_series(1, 3) 1 1 +statement ok +CREATE FUNCTION arr() RETURNS INT[] LANGUAGE SQL AS $$ + SELECT ARRAY(VALUES (1), (2)); +$$ + +# A query with an uncorrelated array-flatten within a UDF. 
+query T +EXPLAIN (VERBOSE) SELECT arr() FROM generate_series(1, 3) +---- +distribution: local +vectorized: true +· +• render +│ columns: (arr) +│ render arr: arr() +│ +└── • project set + │ columns: (generate_series) + │ estimated row count: 10 + │ render 0: generate_series(1, 3) + │ + └── • emptyrow + columns: () + subtest regressions diff --git a/pkg/sql/opt/norm/rules/scalar.opt b/pkg/sql/opt/norm/rules/scalar.opt index 6e9866c3d0ed..3cec5a05cc0c 100644 --- a/pkg/sql/opt/norm/rules/scalar.opt +++ b/pkg/sql/opt/norm/rules/scalar.opt @@ -383,15 +383,13 @@ $input # an aggregation). So it's desirable to perform this conversion in the # interest of decorrelation. # -# So the outcome is that we can perform uncorrelated ARRAY(...)s over any datatype, -# and correlated ones only over the types that array_agg supports. -# -# Note that optbuilder should have already verified that if the input is -# correlated, then we can array_agg over the input type. Also note that the -# Max1Row operator we introduce is guaranteed to be eliminated as -# MakeArrayAggForFlatten will return a ScalarGroupBy. +# So the outcome is that we can perform uncorrelated ARRAY(...)s over any +# datatype, and correlated ones only over the types that array_agg supports. 
[NormalizeArrayFlattenToAgg, Normalize] -(ArrayFlatten $input:(HasOuterCols $input) $subquery:*) +(ArrayFlatten + $input:* + $private:* & (CanNormalizeArrayFlatten $input $private) +) => (Coalesce [ @@ -403,7 +401,7 @@ $input (ArrayAgg (Variable $requestedCol:(SubqueryRequestedCol - $subquery + $private ) ) ) @@ -414,7 +412,7 @@ $input ] (MakeGrouping (MakeEmptyColSet) - (SubqueryOrdering $subquery) + (SubqueryOrdering $private) ) ) (MakeUnorderedSubquery) diff --git a/pkg/sql/opt/norm/scalar_funcs.go b/pkg/sql/opt/norm/scalar_funcs.go index 2fc808f78586..cd7125b659c3 100644 --- a/pkg/sql/opt/norm/scalar_funcs.go +++ b/pkg/sql/opt/norm/scalar_funcs.go @@ -387,3 +387,9 @@ func (c *CustomFuncs) SplitTupleEq(lhs, rhs *memo.TupleExpr) memo.FiltersExpr { } return res } + +// CanNormalizeArrayFlatten returns true if the input is correlated or if the +// ArrayFlatten exists within a UDF. +func (c *CustomFuncs) CanNormalizeArrayFlatten(input memo.RelExpr, p *memo.SubqueryPrivate) bool { + return c.HasOuterCols(input) || p.WithinUDF +} diff --git a/pkg/sql/opt/norm/testdata/rules/scalar b/pkg/sql/opt/norm/testdata/rules/scalar index 0bb77892c381..8d710fe008a8 100644 --- a/pkg/sql/opt/norm/testdata/rules/scalar +++ b/pkg/sql/opt/norm/testdata/rules/scalar @@ -1985,6 +1985,50 @@ project ├── columns: k:8!null └── key: (8) +exec-ddl +CREATE FUNCTION arr() RETURNS INT[] LANGUAGE SQL AS $$ + SELECT ARRAY(VALUES (1), (2)); +$$ +---- + +# Should trigger for uncorrelated ArrayFlatten subqueries within a UDF +norm expect=NormalizeArrayFlattenToAgg format=show-scalars +SELECT arr() +---- +values + ├── columns: arr:4 + ├── cardinality: [1 - 1] + ├── volatile + ├── key: () + ├── fd: ()-->(4) + └── tuple + └── udf: arr + └── body + └── values + ├── columns: array:3 + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(3) + └── tuple + └── coalesce + ├── subquery + │ └── scalar-group-by + │ ├── columns: array_agg:2 + │ ├── cardinality: [1 - 1] + │ ├── key: () + │ ├── fd: ()-->(2) + │ 
├── values + │ │ ├── columns: column1:1!null + │ │ ├── cardinality: [2 - 2] + │ │ ├── tuple + │ │ │ └── const: 1 + │ │ └── tuple + │ │ └── const: 2 + │ └── aggregations + │ └── array-agg [as=array_agg:2, outer=(1)] + │ └── variable: column1:1 + └── const: ARRAY[] + exec-ddl CREATE TABLE pg_class ( oid OID NULL, diff --git a/pkg/sql/opt/ops/scalar.opt b/pkg/sql/opt/ops/scalar.opt index c35cf2e3b3e6..e5e38e67a6bf 100644 --- a/pkg/sql/opt/ops/scalar.opt +++ b/pkg/sql/opt/ops/scalar.opt @@ -43,6 +43,10 @@ define SubqueryPrivate { # will eventually be output. It is only used for ArrayFlatten expressions. RequestedCol ColumnID + # WithinUDF is set to true if the subquery exists inside a UDFExpr. It is + # only used for ArrayFlatten expressions. + WithinUDF bool + # Cmp is only used for AnyOp. Cmp Operator diff --git a/pkg/sql/opt/optbuilder/scalar.go b/pkg/sql/opt/optbuilder/scalar.go index 96ddb5162894..a6b51cada050 100644 --- a/pkg/sql/opt/optbuilder/scalar.go +++ b/pkg/sql/opt/optbuilder/scalar.go @@ -149,14 +149,7 @@ func (b *Builder) buildScalar( inCol := s.cols[0].id - // This looks kind of arbitrary and strange, because it is: - // We cannot array_agg over some types, but we can only decorrelate via array_agg. - // Thus, we reject a query that is correlated and over a type that we can't array_agg. typ := b.factory.Metadata().ColumnMeta(inCol).Type - if !s.outerCols.Empty() && !memo.AggregateOverloadExists(opt.ArrayAggOp, typ) { - panic(unimplementedWithIssueDetailf(35710, "", "can't execute a correlated ARRAY(...) 
over %s", typ)) - } - if err := types.CheckArrayElementType(typ); err != nil { panic(err) } @@ -169,6 +162,7 @@ func (b *Builder) buildScalar( OriginalExpr: s.Subquery, Ordering: s.ordering, RequestedCol: inCol, + WithinUDF: b.insideUDF, } out = b.factory.ConstructArrayFlatten(s.node, &subqueryPrivate) diff --git a/pkg/sql/opt/xform/testdata/rules/cycle b/pkg/sql/opt/xform/testdata/rules/cycle index b6ad54a309af..52441712f243 100644 --- a/pkg/sql/opt/xform/testdata/rules/cycle +++ b/pkg/sql/opt/xform/testdata/rules/cycle @@ -106,7 +106,7 @@ memo (not optimized, ~6KB, required=[], cycle=[G1->G4->G6->G9->G10->G12->G13->G1 ├── G7: (variable v) ├── G8: (const 1) ├── G9: (scalar-list G10 G11) - ├── G10: (subquery G12 &{ 0 unknown false}) + ├── G10: (subquery G12 &{ 0 false unknown false}) ├── G11: (false) ├── G12: (project G13 G14) ├── G13: (limit G1 G8)