sql: support array-flatten subqueries within UDFs

Array-flatten subqueries (e.g., `ARRAY(SELECT a FROM t)`) are now supported within UDFs. They are now converted to a normal subquery with a ScalarGroupBy if they exist within a UDF, even if they are uncorrelated. This allows them to be executed without any changes to the execbuilder or the evaluation logic of `tree.Routine`. Fixes #98738 Release note: None
cockroachdb · Mar 21, 2023 · 99169e8 · 99169e8
1 parent 5f5cf26
commit 99169e8
Show file tree

Hide file tree

Showing 10 changed files with 130 additions and 17 deletions.
diff --git a/pkg/sql/logictest/testdata/logic_test/udf b/pkg/sql/logictest/testdata/logic_test/udf
@@ -2945,6 +2945,26 @@ SELECT all_fn(1), all_fn(2), all_fn(NULL::INT)
 NULL  false  NULL
 
 
+subtest array_flatten
+
+statement ok
+CREATE FUNCTION arr(x INT) RETURNS INT[] LANGUAGE SQL AS $$
+  SELECT ARRAY(VALUES (1), (2), (x));
+$$
+
+query T
+SELECT arr(10)
+----
+{1,2,10}
+
+query T
+SELECT arr(i) FROM generate_series(1, 3) g(i)
+----
+{1,2,1}
+{1,2,2}
+{1,2,3}
+
+
 subtest variadic
 
 # Variadic UDFS are not currently supported.
@@ -3054,10 +3074,10 @@ SELECT oid, proname, pronamespace, proowner, prolang, proleakproof, proisstrict,
 FROM pg_catalog.pg_proc WHERE proname IN ('f_93314', 'f_93314_alias', 'f_93314_comp', 'f_93314_comp_t')
 ORDER BY oid;
 ----
-100271  f_93314         105  1546506610  14  false  false  false  v  0  100270  ·  {}  NULL  SELECT i, e FROM test.public.t_93314 ORDER BY i LIMIT 1;
-100273  f_93314_alias   105  1546506610  14  false  false  false  v  0  100272  ·  {}  NULL  SELECT i, e FROM test.public.t_93314_alias ORDER BY i LIMIT 1;
-100277  f_93314_comp    105  1546506610  14  false  false  false  v  0  100274  ·  {}  NULL  SELECT (1, 2);
-100278  f_93314_comp_t  105  1546506610  14  false  false  false  v  0  100276  ·  {}  NULL  SELECT a, c FROM test.public.t_93314_comp LIMIT 1;
+100272  f_93314         105  1546506610  14  false  false  false  v  0  100271  ·  {}  NULL  SELECT i, e FROM test.public.t_93314 ORDER BY i LIMIT 1;
+100274  f_93314_alias   105  1546506610  14  false  false  false  v  0  100273  ·  {}  NULL  SELECT i, e FROM test.public.t_93314_alias ORDER BY i LIMIT 1;
+100278  f_93314_comp    105  1546506610  14  false  false  false  v  0  100275  ·  {}  NULL  SELECT (1, 2);
+100279  f_93314_comp_t  105  1546506610  14  false  false  false  v  0  100277  ·  {}  NULL  SELECT a, c FROM test.public.t_93314_comp LIMIT 1;
 
 # Regression test for #95240. Strict UDFs that are inlined should result in NULL
 # when presented with NULL arguments.

diff --git a/pkg/sql/opt/exec/execbuilder/scalar.go b/pkg/sql/opt/exec/execbuilder/scalar.go
@@ -495,6 +495,16 @@ func (b *Builder) buildArrayFlatten(
 		panic(errors.AssertionFailedf("input to ArrayFlatten should be uncorrelated"))
 	}
 
+	if b.planLazySubqueries {
+		// The NormalizeArrayFlattenToAgg rule should have converted an
+		// ArrayFlatten within a UDF into an aggregation.
+		// We don't yet convert an ArrayFlatten within a correlated subquery
+		// into an aggregation, so we return a decorrelation error.
+		// TODO(mgartner): Build an ArrayFlatten within a correlated subquery as
+		// a Routine, or apply NormalizeArrayFlattenToAgg to all ArrayFlattens.
+		return nil, b.decorrelationError()
+	}
+
 	root, err := b.buildRelational(af.Input)
 	if err != nil {
 		return nil, err
@@ -762,10 +772,6 @@ func (b *Builder) buildSubquery(
 		// because we don't need to optimize the subquery input any further.
 		// It's already been fully optimized because it is uncorrelated and has
 		// no outer columns.
-		//
-		// TODO(mgartner): Uncorrelated subqueries only need to be evaluated
-		// once. We should cache their result to avoid all this overhead for
-		// every invocation.
 		inputRowCount := int64(input.Relational().Statistics().RowCountIfAvailable())
 		withExprs := make([]builtWithExpr, len(b.withExprs))
 		copy(withExprs, b.withExprs)

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/subquery b/pkg/sql/opt/exec/execbuilder/testdata/subquery
@@ -562,3 +562,10 @@ vectorized: true
       estimated row count: 1,000 (missing stats)
       table: corr@corr_pkey
       spans: FULL SCAN
+
+# Case where a correlated subquery contains an uncorrelated array-flatten
+# subquery.
+statement error could not decorrelate subquery
+SELECT
+  CASE WHEN k < 5 THEN (SELECT array(SELECT 1) FROM corr tmp WHERE k*10 = corr.k) END
+FROM corr
diff --git a/pkg/sql/opt/exec/execbuilder/testdata/udf b/pkg/sql/opt/exec/execbuilder/testdata/udf
@@ -235,6 +235,30 @@ SELECT sub_fn4() FROM generate_series(1, 3)
 1
 1
 
+statement ok
+CREATE FUNCTION arr() RETURNS INT[] LANGUAGE SQL AS $$
+  SELECT ARRAY(VALUES (1), (2));
+$$
+
+# A query with a uncorrelated array-flatten within a UDF.
+query T
+EXPLAIN (VERBOSE) SELECT arr() FROM generate_series(1, 3)
+----
+distribution: local
+vectorized: true
+·
+• render
+│ columns: (arr)
+│ render arr: arr()
+│
+└── • project set
+    │ columns: (generate_series)
+    │ estimated row count: 10
+    │ render 0: generate_series(1, 3)
+    │
+    └── • emptyrow
+          columns: ()
+
 
 subtest regressions
 

diff --git a/pkg/sql/opt/norm/rules/scalar.opt b/pkg/sql/opt/norm/rules/scalar.opt
@@ -383,15 +383,16 @@ $input
 #        an aggregation). So it's desirable to perform this conversion in the
 #        interest of decorrelation.
 #
-# So the outcome is that we can perform uncorrelated ARRAY(...)s over any datatype,
-# and correlated ones only over the types that array_agg supports.
+# So the outcome is that we can perform uncorrelated ARRAY(...)s over any
+# datatype, and correlated ones only over the types that array_agg supports.
 #
 # Note that optbuilder should have already verified that if the input is
-# correlated, then we can array_agg over the input type.  Also note that the
-# Max1Row operator we introduce is guaranteed to be eliminated as
-# MakeArrayAggForFlatten will return a ScalarGroupBy.
+# correlated, then we can array_agg over the input type.
 [NormalizeArrayFlattenToAgg, Normalize]
-(ArrayFlatten $input:(HasOuterCols $input) $subquery:*)
+(ArrayFlatten
+    $input:*
+    $private:* & (CanNormalizeArrayFlatten $input $private)
+)
 =>
 (Coalesce
     [
@@ -403,7 +404,7 @@ $input
                         (ArrayAgg
                             (Variable
                                 $requestedCol:(SubqueryRequestedCol
-                                    $subquery
+                                    $private
                                 )
                             )
                         )
@@ -414,7 +415,7 @@ $input
                 ]
                 (MakeGrouping
                     (MakeEmptyColSet)
-                    (SubqueryOrdering $subquery)
+                    (SubqueryOrdering $private)
                 )
             )
             (MakeUnorderedSubquery)

diff --git a/pkg/sql/opt/norm/scalar_funcs.go b/pkg/sql/opt/norm/scalar_funcs.go
@@ -387,3 +387,9 @@ func (c *CustomFuncs) SplitTupleEq(lhs, rhs *memo.TupleExpr) memo.FiltersExpr {
 	}
 	return res
 }
+
+// CanNormalizeArrayFlatten returns true if the input is correlated or if the
+// ArrayFlatten exists within a UDF.
+func (c *CustomFuncs) CanNormalizeArrayFlatten(input memo.RelExpr, p *memo.SubqueryPrivate) bool {
+	return c.HasOuterCols(input) || p.WithinUDF
+}
diff --git a/pkg/sql/opt/norm/testdata/rules/scalar b/pkg/sql/opt/norm/testdata/rules/scalar
@@ -1985,6 +1985,50 @@ project
                 ├── columns: k:8!null
                 └── key: (8)
 
+exec-ddl
+CREATE FUNCTION arr() RETURNS INT[] LANGUAGE SQL AS $$
+  SELECT ARRAY(VALUES (1), (2));
+$$
+----
+
+# Should trigger for uncorrelated ArrayFlatten subqueries within a UDF
+norm expect=NormalizeArrayFlattenToAgg format=show-scalars
+SELECT arr()
+----
+values
+ ├── columns: arr:4
+ ├── cardinality: [1 - 1]
+ ├── volatile
+ ├── key: ()
+ ├── fd: ()-->(4)
+ └── tuple
+      └── udf: arr
+           └── body
+                └── values
+                     ├── columns: array:3
+                     ├── cardinality: [1 - 1]
+                     ├── key: ()
+                     ├── fd: ()-->(3)
+                     └── tuple
+                          └── coalesce
+                               ├── subquery
+                               │    └── scalar-group-by
+                               │         ├── columns: array_agg:2
+                               │         ├── cardinality: [1 - 1]
+                               │         ├── key: ()
+                               │         ├── fd: ()-->(2)
+                               │         ├── values
+                               │         │    ├── columns: column1:1!null
+                               │         │    ├── cardinality: [2 - 2]
+                               │         │    ├── tuple
+                               │         │    │    └── const: 1
+                               │         │    └── tuple
+                               │         │         └── const: 2
+                               │         └── aggregations
+                               │              └── array-agg [as=array_agg:2, outer=(1)]
+                               │                   └── variable: column1:1
+                               └── const: ARRAY[]
+
 exec-ddl
 CREATE TABLE pg_class (
      oid OID NULL,

diff --git a/pkg/sql/opt/ops/scalar.opt b/pkg/sql/opt/ops/scalar.opt
@@ -43,6 +43,10 @@ define SubqueryPrivate {
     # will eventually be output. It is only used for ArrayFlatten expressions.
     RequestedCol ColumnID
 
+    # WithinUDF is set to true if the subquery exists inside a UDFExpr. It is
+    # only used for ArrayFlatten expressions.
+    WithinUDF bool
+
     # Cmp is only used for AnyOp.
     Cmp Operator
 

diff --git a/pkg/sql/opt/optbuilder/scalar.go b/pkg/sql/opt/optbuilder/scalar.go
@@ -169,6 +169,7 @@ func (b *Builder) buildScalar(
 			OriginalExpr: s.Subquery,
 			Ordering:     s.ordering,
 			RequestedCol: inCol,
+			WithinUDF:    b.insideUDF,
 		}
 		out = b.factory.ConstructArrayFlatten(s.node, &subqueryPrivate)
 

diff --git a/pkg/sql/opt/xform/testdata/rules/cycle b/pkg/sql/opt/xform/testdata/rules/cycle
@@ -106,7 +106,7 @@ memo (not optimized, ~6KB, required=[], cycle=[G1->G4->G6->G9->G10->G12->G13->G1
  ├── G7: (variable v)
  ├── G8: (const 1)
  ├── G9: (scalar-list G10 G11)
- ├── G10: (subquery G12 &{<nil>  0 unknown false})
+ ├── G10: (subquery G12 &{<nil>  0 false unknown false})
  ├── G11: (false)
  ├── G12: (project G13 G14)
  ├── G13: (limit G1 G8)