From 9ece68131d035e23d9ed59cccdcdfda52ef54372 Mon Sep 17 00:00:00 2001
From: Radu Berinde <radu.berinde@gmail.com>
Date: Mon, 21 Mar 2016 14:45:10 -0400
Subject: [PATCH] sql: prefer order-matching index if there is a limit

In #4925, we observed ineffective planning for a query in the photos app. We
prefer to use the primary index and sort rather than use a non-covering index
which makes sense in general (non-covering indices require an expensive
indexJoin) but in this case we also had a limit. In such a case using the index
would require looking only at the first rows instead of getting all matching
rows and sorting.

In this change we tweak the index selection: if we have a reasonable limit, we
give a "boost" to all indices that match the ordering exactly. The boost exactly
offsets the non-covering index penalty.

In addition to the new tests, I also verified the photo app query in #4925 now
uses the index.

Fixes #5246.
---
 sql/backfill.go                        |  2 +-
 sql/limit.go                           |  6 +++-
 sql/select.go                          | 41 ++++++++++++++++++++------
 sql/testdata/explain_debug             | 10 +++----
 sql/testdata/select_non_covering_index | 39 ++++++++++++++++++++++++
 5 files changed, 81 insertions(+), 17 deletions(-)

diff --git a/sql/backfill.go b/sql/backfill.go
index 838d1636d6e6..bc948d3ea5f2 100644
--- a/sql/backfill.go
+++ b/sql/backfill.go
@@ -166,7 +166,7 @@ func (p *planner) backfillBatch(b *client.Batch, tableDesc *TableDescriptor) *ro
 			desc:    *tableDesc,
 		}
 		scan.initDescDefaults()
-		rows := p.selectIndex(&selectNode{}, scan, nil, false)
+		rows := p.selectIndex(&selectNode{}, scan, nil, false, false)
 
 		// Construct a map from column ID to the index the value appears at within a
 		// row.
diff --git a/sql/limit.go b/sql/limit.go
index 18ca117dcf5a..4e6281a5c6b2 100644
--- a/sql/limit.go
+++ b/sql/limit.go
@@ -65,7 +65,11 @@ func (p *planner) evalLimit(limit *parser.Limit) (count, offset int64, err error
 			}
 
 			if dstDInt, ok := dstDatum.(parser.DInt); ok {
-				*datum.dst = int64(dstDInt)
+				val := int64(dstDInt)
+				if val < 0 {
+					return 0, 0, fmt.Errorf("negative value for %s", datum.name)
+				}
+				*datum.dst = val
 				continue
 			}
 
diff --git a/sql/select.go b/sql/select.go
index 9742257448ad..2d2e03c16530 100644
--- a/sql/select.go
+++ b/sql/select.go
@@ -269,6 +269,11 @@ func (p *planner) initSelect(
 		ordering = sort.Ordering().ordering
 	}
 
+	limitCount, limitOffset, err := p.evalLimit(limit)
+	if err != nil {
+		return nil, roachpb.NewError(err)
+	}
+
 	if scan, ok := s.table.node.(*scanNode); ok {
 		// Find the set of columns that we actually need values for. This is an
 		// optimization to avoid unmarshaling unnecessary values and is also
@@ -302,7 +307,14 @@ func (p *planner) initSelect(
 			}
 		}
 
-		plan := p.selectIndex(s, scan, ordering, grouping)
+		// If we have a reasonable limit, prefer an order matching index even if
+		// it is not covering - unless we are grouping, in which case the limit
+		// applies to the grouping results and not to the rows we scan.
+		var preferOrderMatchingIndex bool
+		if !grouping && len(ordering) > 0 && limitCount <= 1000-limitOffset {
+			preferOrderMatchingIndex = true
+		}
+		plan := p.selectIndex(s, scan, ordering, grouping, preferOrderMatchingIndex)
 
 		// Update s.table with the new plan.
 		s.table.node = plan
@@ -311,10 +323,6 @@ func (p *planner) initSelect(
 	s.ordering = s.computeOrdering(s.table.node.Ordering())
 
 	// Wrap this node as necessary.
-	limitCount, limitOffset, err := p.evalLimit(limit)
-	if err != nil {
-		return nil, roachpb.NewError(err)
-	}
 	return p.limit(limitCount, limitOffset, p.distinct(parsed, sort.wrap(group.wrap(s)))), nil
 }
 
@@ -645,6 +653,8 @@ func (s *selectNode) computeOrdering(fromOrder orderingInfo) orderingInfo {
 	return ordering
 }
 
+const nonCoveringIndexPenalty = 10
+
 // selectIndex analyzes the scanNode to determine if there is an index
 // available that can fulfill the query with a more restrictive scan.
 //
@@ -656,7 +666,11 @@ func (s *selectNode) computeOrdering(fromOrder orderingInfo) orderingInfo {
 // transformed into a set of spans to scan within the index.
 //
 // If grouping is true, the ordering is the desired ordering for grouping.
-func (p *planner) selectIndex(sel *selectNode, s *scanNode, ordering columnOrdering, grouping bool) planNode {
+//
+// If preferOrderMatching is true, we prefer an index that matches the desired
+// ordering completely, even if it is not a covering index.
+func (p *planner) selectIndex(sel *selectNode, s *scanNode, ordering columnOrdering, grouping,
+	preferOrderMatching bool) planNode {
 	if s.desc.isEmpty() || (s.filter == nil && ordering == nil) {
 		// No table or no where-clause and no ordering.
 		s.initOrdering(0)
@@ -737,7 +751,7 @@ func (p *planner) selectIndex(sel *selectNode, s *scanNode, ordering columnOrder
 
 	if ordering != nil {
 		for _, c := range candidates {
-			c.analyzeOrdering(sel, s, ordering)
+			c.analyzeOrdering(sel, s, ordering, preferOrderMatching)
 		}
 	}
 
@@ -864,7 +878,7 @@ func (v *indexInfo) init(s *scanNode) {
 			v.cost += float64(1 + len(v.desc.Columns) - len(v.desc.PrimaryIndex.ColumnIDs))
 			// Non-covering indexes are significantly more expensive than covering
 			// indexes.
-			v.cost *= 10
+			v.cost *= nonCoveringIndexPenalty
 		}
 	}
 }
@@ -891,7 +905,11 @@ func (v *indexInfo) analyzeExprs(exprs []parser.Exprs) {
 // analyzeOrdering analyzes the ordering provided by the index and determines
 // if it matches the ordering requested by the query. Non-matching orderings
 // increase the cost of using the index.
-func (v *indexInfo) analyzeOrdering(sel *selectNode, scan *scanNode, ordering columnOrdering) {
+//
+// If preferOrderMatching is true, we prefer an index that matches the desired
+// ordering completely, even if it is not a covering index.
+func (v *indexInfo) analyzeOrdering(sel *selectNode, scan *scanNode, ordering columnOrdering,
+	preferOrderMatching bool) {
 	// Compute the prefix of the index for which we have exact constraints. This
 	// prefix is inconsequential for ordering because the values are identical.
 	v.exactPrefix = exactPrefix(v.constraints)
@@ -918,6 +936,11 @@ func (v *indexInfo) analyzeOrdering(sel *selectNode, scan *scanNode, ordering co
 	weight := float64(len(ordering)+1) / float64(match+1)
 	v.cost *= weight
 
+	if match == len(ordering) && preferOrderMatching {
+		// Offset the non-covering index cost penalty.
+		v.cost *= (1.0 / nonCoveringIndexPenalty)
+	}
+
 	if log.V(2) {
 		log.Infof("%s: analyzeOrdering: weight=%0.2f reverse=%v index=%d requested=%d",
 			v.index.Name, weight, v.reverse, indexOrdering, ordering)
diff --git a/sql/testdata/explain_debug b/sql/testdata/explain_debug
index db817be3d1e5..da2876f102c9 100644
--- a/sql/testdata/explain_debug
+++ b/sql/testdata/explain_debug
@@ -68,12 +68,10 @@ EXPLAIN (DEBUG) SELECT * FROM abc ORDER BY b DESC
 query ITTT
 EXPLAIN (DEBUG) SELECT * FROM abc ORDER BY b DESC LIMIT 1 OFFSET 1
 ----
-0  /abc/primary/1/'one'    NULL               PARTIAL
-0  /abc/primary/1/'one'/c  1.1                BUFFERED
-1  /abc/primary/2/'two'    NULL               BUFFERED
-2  /abc/primary/3/'three'  NULL               BUFFERED
-0  0                       (2, 'two', NULL)   FILTERED
-1  1                       (3, 'three', NULL) ROW
+0  /abc/foo/'two'          /2    PARTIAL   
+0  /abc/primary/2/'two'    NULL  FILTERED  
+1  /abc/foo/'three'        /3    PARTIAL   
+1  /abc/primary/3/'three'  NULL  ROW       
 
 query ITTT
 EXPLAIN (DEBUG) SELECT * FROM abc WHERE a = 2
diff --git a/sql/testdata/select_non_covering_index b/sql/testdata/select_non_covering_index
index 86dbb577199b..2d2c1569b227 100644
--- a/sql/testdata/select_non_covering_index
+++ b/sql/testdata/select_non_covering_index
@@ -84,3 +84,42 @@ EXPLAIN SELECT * FROM t WHERE c > 0 AND d = 8
 0 index-join
 1 scan       t@c /1-
 1 scan       t@primary
+
+# The following testcases verify that when we have a small limit, we prefer an
+# order-matching index.
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c
+----
+0 sort       +c
+1 scan       t@primary -
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c LIMIT 5
+----
+0 limit      count: 5, offset: 0
+1 index-join
+2 scan       t@c -
+2 scan       t@primary
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c OFFSET 5
+----
+0 limit      count: ALL, offset: 5
+1 sort       +c
+2 scan       t@primary -
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c LIMIT 5 OFFSET 5
+----
+0 limit      count: 5, offset: 5
+1 index-join
+2 scan       t@c -
+2 scan       t@primary
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c LIMIT 1000000
+----
+0 limit      count: 1000000, offset: 0
+1 sort       +c (top 1000000)
+2 scan       t@primary -