opt: fix error with DISTINCT ON and ORDER BY ASC NULLS LAST

This commit fixes an error that could occur in the optbuilder when planning a query with DISTINCT ON and a non-standard nulls ordering. The optimizer supports queries with a non-standard nulls ordering by projecting a column with the expression (col IS NULL) and adding it to the ordering. Since we require that DISTINCT ON columns must be the prefix of any ordering columns, we must account for the new ordering column when building DISTINCT ON. A previous bug fix for cockroachdb#90763 caused the new column to be simply ignored when building DISTINCT ON, but this was insufficient. We need to actually include the new column among the DISTINCT ON columns. This commit makes that change. Fixes cockroachdb#107839 Release note (bug fix): Fixed a spurious error "no data source matches prefix" that could occur during planning for a query with DISTINCT ON and ORDER BY ASC NULLS LAST or ORDER BY DESC NULLS FIRST.
rytaft · Aug 7, 2023 · 9dcdbcd · 9dcdbcd
1 parent 898c592
commit 9dcdbcd
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 31 deletions.
diff --git a/pkg/sql/logictest/testdata/logic_test/distinct_on b/pkg/sql/logictest/testdata/logic_test/distinct_on
@@ -405,3 +405,32 @@ Action
 Biography
 Crime
 NULL
+
+# Regression test for #107839. More fixes for NULLS FIRST/LAST with DISTINCT ON.
+statement ok
+CREATE TABLE t1 (id int, str text);
+CREATE TABLE t2 (id int, num int);
+INSERT INTO t1 VALUES (1, 'hello'), (2, NULL);
+INSERT INTO t2 VALUES (1, 1), (2, 2), (NULL, NULL)
+
+query II
+SELECT
+DISTINCT ON (t2.id)
+t2.*
+FROM t1, t2
+ORDER BY t2.id DESC NULLS FIRST
+----
+NULL  NULL
+2     2
+1     1
+
+query II
+SELECT
+DISTINCT ON (t2.id)
+t2.*
+FROM t1, t2
+ORDER BY t2.id ASC NULLS LAST
+----
+1     1
+2     2
+NULL  NULL
diff --git a/pkg/sql/opt/optbuilder/distinct.go b/pkg/sql/opt/optbuilder/distinct.go
@@ -89,23 +89,29 @@ func (b *Builder) buildDistinctOn(
 	var seen opt.ColSet
 	for _, col := range inScope.ordering {
 		if !distinctOnCols.Contains(col.ID()) {
+			colIsValid := false
 			scopeCol := inScope.getColumn(col.ID())
 			if scopeCol != nil {
 				if isExpr, ok := scopeCol.scalar.(*memo.IsExpr); ok {
 					if _, ok := isExpr.Right.(*memo.NullExpr); ok {
 						if v, ok := isExpr.Left.(*memo.VariableExpr); ok {
 							if distinctOnCols.Contains(v.Col) {
 								// We have a col IS NULL expression (case 3 above).
-								continue
+								// Add the new column to distinctOnCols, since it doesn't change
+								// the semantics of the DISTINCT ON.
+								distinctOnCols.Add(col.ID())
+								colIsValid = true
 							}
 						}
 					}
 				}
 			}
-			panic(pgerror.Newf(
-				pgcode.InvalidColumnReference,
-				"SELECT DISTINCT ON expressions must match initial ORDER BY expressions",
-			))
+			if !colIsValid {
+				panic(pgerror.Newf(
+					pgcode.InvalidColumnReference,
+					"SELECT DISTINCT ON expressions must match initial ORDER BY expressions",
+				))
+			}
 		}
 		seen.Add(col.ID())
 		if seen.Equals(distinctOnCols) {

diff --git a/pkg/sql/opt/optbuilder/testdata/distinct_on b/pkg/sql/opt/optbuilder/testdata/distinct_on
@@ -507,30 +507,87 @@ ORDER BY
   "genre" ASC NULLS LAST,
   "id" ASC NULLS LAST
 ----
+distinct-on
+ ├── columns: id:1!null name:2 genre:3  [hidden: nulls_ordering_genre:6!null]
+ ├── grouping columns: genre:3 nulls_ordering_genre:6!null
+ ├── internal-ordering: +7,+1 opt(3,6)
+ ├── ordering: +6,+3
+ ├── sort
+ │    ├── columns: id:1!null name:2 genre:3 nulls_ordering_genre:6!null nulls_ordering_id:7!null
+ │    ├── ordering: +6,+3,+7,+1
+ │    └── project
+ │         ├── columns: nulls_ordering_genre:6!null nulls_ordering_id:7!null id:1!null name:2 genre:3
+ │         ├── scan author
+ │         │    └── columns: id:1!null name:2 genre:3 crdb_internal_mvcc_timestamp:4 tableoid:5
+ │         └── projections
+ │              ├── genre:3 IS NULL [as=nulls_ordering_genre:6]
+ │              └── id:1 IS NULL [as=nulls_ordering_id:7]
+ └── aggregations
+      ├── first-agg [as=id:1]
+      │    └── id:1
+      └── first-agg [as=name:2]
+           └── name:2
+
+# Regression test for #107839. More fixes for NULLS FIRST/LAST with DISTINCT ON.
+
+exec-ddl
+CREATE TABLE t1 (id int, str text)
+----
+
+exec-ddl
+CREATE TABLE t2 (id int, num int)
+----
+
+build
+SELECT
+DISTINCT ON (t2.id)
+t1.id,
+t1.str
+FROM t1 JOIN t2 ON t1.id = t2.id
+ORDER BY t2.id DESC NULLS FIRST
+----
 sort
- ├── columns: id:1!null name:2 genre:3  [hidden: nulls_ordering_genre:8!null nulls_ordering_id:9!null]
- ├── ordering: +8,+3,+9,+1
- └── project
-      ├── columns: nulls_ordering_genre:8!null nulls_ordering_id:9!null id:1!null name:2 genre:3
-      ├── distinct-on
-      │    ├── columns: id:1!null name:2 genre:3
-      │    ├── grouping columns: genre:3
-      │    ├── internal-ordering: +6,+7,+1 opt(3)
-      │    ├── sort
-      │    │    ├── columns: id:1!null name:2 genre:3 nulls_ordering_genre:6!null nulls_ordering_id:7!null
-      │    │    ├── ordering: +7,+1 opt(3,6)
-      │    │    └── project
-      │    │         ├── columns: nulls_ordering_genre:6!null nulls_ordering_id:7!null id:1!null name:2 genre:3
-      │    │         ├── scan author
-      │    │         │    └── columns: id:1!null name:2 genre:3 crdb_internal_mvcc_timestamp:4 tableoid:5
-      │    │         └── projections
-      │    │              ├── genre:3 IS NULL [as=nulls_ordering_genre:6]
-      │    │              └── id:1 IS NULL [as=nulls_ordering_id:7]
-      │    └── aggregations
-      │         ├── first-agg [as=id:1]
-      │         │    └── id:1
-      │         └── first-agg [as=name:2]
-      │              └── name:2
-      └── projections
-           ├── genre:3 IS NULL [as=nulls_ordering_genre:8]
-           └── id:1 IS NULL [as=nulls_ordering_id:9]
+ ├── columns: id:1!null str:2  [hidden: t2.id:6!null nulls_ordering_id:11!null]
+ ├── ordering: -11,-6
+ └── distinct-on
+      ├── columns: t1.id:1!null str:2 t2.id:6!null nulls_ordering_id:11!null
+      ├── grouping columns: t2.id:6!null nulls_ordering_id:11!null
+      ├── project
+      │    ├── columns: nulls_ordering_id:11!null t1.id:1!null str:2 t2.id:6!null
+      │    ├── inner-join (hash)
+      │    │    ├── columns: t1.id:1!null str:2 t1.rowid:3!null t1.crdb_internal_mvcc_timestamp:4 t1.tableoid:5 t2.id:6!null num:7 t2.rowid:8!null t2.crdb_internal_mvcc_timestamp:9 t2.tableoid:10
+      │    │    ├── scan t1
+      │    │    │    └── columns: t1.id:1 str:2 t1.rowid:3!null t1.crdb_internal_mvcc_timestamp:4 t1.tableoid:5
+      │    │    ├── scan t2
+      │    │    │    └── columns: t2.id:6 num:7 t2.rowid:8!null t2.crdb_internal_mvcc_timestamp:9 t2.tableoid:10
+      │    │    └── filters
+      │    │         └── t1.id:1 = t2.id:6
+      │    └── projections
+      │         └── t2.id:6 IS NULL [as=nulls_ordering_id:11]
+      └── aggregations
+           ├── first-agg [as=t1.id:1]
+           │    └── t1.id:1
+           └── first-agg [as=str:2]
+                └── str:2
+
+# Because of the hack we use to allow ORDER BY NULLS FIRST / LAST with DISTINCT
+# ON, we also support this query. Postgres doesn't support it, but it doesn't
+# seem like a problem that we do.
+build
+SELECT DISTINCT ON (id) * FROM t1 ORDER BY id IS NULL, id
+----
+sort
+ ├── columns: id:1 str:2  [hidden: column6:6!null]
+ ├── ordering: +6,+1
+ └── distinct-on
+      ├── columns: id:1 str:2 column6:6!null
+      ├── grouping columns: id:1 column6:6!null
+      ├── project
+      │    ├── columns: column6:6!null id:1 str:2
+      │    ├── scan t1
+      │    │    └── columns: id:1 str:2 rowid:3!null crdb_internal_mvcc_timestamp:4 tableoid:5
+      │    └── projections
+      │         └── id:1 IS NULL [as=column6:6]
+      └── aggregations
+           └── first-agg [as=str:2]
+                └── str:2