sql: prefer order-matching index if there is a limit

In cockroachdb#4925, we observed ineffective planning for a query in the photos app. We prefer to use the primary index and sort rather than use a non-covering index which makes sense in general (non-covering indices require an expensive indexJoin) but in this case we also had a limit. In such a case using the index would require looking only at the first rows instead of getting all matching rows and sorting. In this change we tweak the index selection: if we have a reasonable limit, we give a "boost" to all indices that match the ordering exactly. The boost exactly offsets the non-covering index penalty. In addition to the new tests, I also verified the photo app query in cockroachdb#4925 now uses the index. Fixes cockroachdb#5246.
RaduBerinde · Mar 22, 2016 · aa372ba · aa372ba
1 parent 16c80a2
commit aa372ba
Show file tree

Hide file tree

Showing 6 changed files with 87 additions and 17 deletions.
diff --git a/sql/backfill.go b/sql/backfill.go
@@ -166,7 +166,7 @@ func (p *planner) backfillBatch(b *client.Batch, tableDesc *TableDescriptor) *ro
 			desc:    *tableDesc,
 		}
 		scan.initDescDefaults()
-		rows := p.selectIndex(&selectNode{}, scan, nil, false)
+		rows := p.selectIndex(&selectNode{}, scan, nil, false, false)
 
 		// Construct a map from column ID to the index the value appears at within a
 		// row.

diff --git a/sql/limit.go b/sql/limit.go
@@ -65,7 +65,11 @@ func (p *planner) evalLimit(limit *parser.Limit) (count, offset int64, err error
 			}
 
 			if dstDInt, ok := dstDatum.(parser.DInt); ok {
-				*datum.dst = int64(dstDInt)
+				val := int64(dstDInt)
+				if val < 0 {
+					return 0, 0, fmt.Errorf("negative value for %s", datum.name)
+				}
+				*datum.dst = val
 				continue
 			}
 

diff --git a/sql/select.go b/sql/select.go
@@ -269,6 +269,11 @@ func (p *planner) initSelect(
 		ordering = sort.Ordering().ordering
 	}
 
+	limitCount, limitOffset, err := p.evalLimit(limit)
+	if err != nil {
+		return nil, roachpb.NewError(err)
+	}
+
 	if scan, ok := s.table.node.(*scanNode); ok {
 		// Find the set of columns that we actually need values for. This is an
 		// optimization to avoid unmarshaling unnecessary values and is also
@@ -302,7 +307,14 @@ func (p *planner) initSelect(
 			}
 		}
 
-		plan := p.selectIndex(s, scan, ordering, grouping)
+		// If we have a reasonable limit, prefer an order matching index even if
+		// it is not covering - unless we are grouping, in which case the limit
+		// applies to the grouping results and not to the rows we scan.
+		var preferOrderMatchingIndex bool
+		if !grouping && len(ordering) > 0 && limitCount <= 1000-limitOffset {
+			preferOrderMatchingIndex = true
+		}
+		plan := p.selectIndex(s, scan, ordering, grouping, preferOrderMatchingIndex)
 
 		// Update s.table with the new plan.
 		s.table.node = plan
@@ -311,10 +323,6 @@ func (p *planner) initSelect(
 	s.ordering = s.computeOrdering(s.table.node.Ordering())
 
 	// Wrap this node as necessary.
-	limitCount, limitOffset, err := p.evalLimit(limit)
-	if err != nil {
-		return nil, roachpb.NewError(err)
-	}
 	return p.limit(limitCount, limitOffset, p.distinct(parsed, sort.wrap(group.wrap(s)))), nil
 }
 
@@ -645,6 +653,8 @@ func (s *selectNode) computeOrdering(fromOrder orderingInfo) orderingInfo {
 	return ordering
 }
 
+const nonCoveringIndexPenalty = 10
+
 // selectIndex analyzes the scanNode to determine if there is an index
 // available that can fulfill the query with a more restrictive scan.
 //
@@ -656,7 +666,11 @@ func (s *selectNode) computeOrdering(fromOrder orderingInfo) orderingInfo {
 // transformed into a set of spans to scan within the index.
 //
 // If grouping is true, the ordering is the desired ordering for grouping.
-func (p *planner) selectIndex(sel *selectNode, s *scanNode, ordering columnOrdering, grouping bool) planNode {
+//
+// If preferOrderMatching is true, we prefer an index that matches the desired
+// ordering completely, even if it is not a covering index.
+func (p *planner) selectIndex(sel *selectNode, s *scanNode, ordering columnOrdering, grouping,
+	preferOrderMatching bool) planNode {
 	if s.desc.isEmpty() || (s.filter == nil && ordering == nil) {
 		// No table or no where-clause and no ordering.
 		s.initOrdering(0)
@@ -737,7 +751,7 @@ func (p *planner) selectIndex(sel *selectNode, s *scanNode, ordering columnOrder
 
 	if ordering != nil {
 		for _, c := range candidates {
-			c.analyzeOrdering(sel, s, ordering)
+			c.analyzeOrdering(sel, s, ordering, preferOrderMatching)
 		}
 	}
 
@@ -864,7 +878,7 @@ func (v *indexInfo) init(s *scanNode) {
 			v.cost += float64(1 + len(v.desc.Columns) - len(v.desc.PrimaryIndex.ColumnIDs))
 			// Non-covering indexes are significantly more expensive than covering
 			// indexes.
-			v.cost *= 10
+			v.cost *= nonCoveringIndexPenalty
 		}
 	}
 }
@@ -891,7 +905,11 @@ func (v *indexInfo) analyzeExprs(exprs []parser.Exprs) {
 // analyzeOrdering analyzes the ordering provided by the index and determines
 // if it matches the ordering requested by the query. Non-matching orderings
 // increase the cost of using the index.
-func (v *indexInfo) analyzeOrdering(sel *selectNode, scan *scanNode, ordering columnOrdering) {
+//
+// If preferOrderMatching is true, we prefer an index that matches the desired
+// ordering completely, even if it is not a covering index.
+func (v *indexInfo) analyzeOrdering(sel *selectNode, scan *scanNode, ordering columnOrdering,
+	preferOrderMatching bool) {
 	// Compute the prefix of the index for which we have exact constraints. This
 	// prefix is inconsequential for ordering because the values are identical.
 	v.exactPrefix = exactPrefix(v.constraints)
@@ -918,6 +936,11 @@ func (v *indexInfo) analyzeOrdering(sel *selectNode, scan *scanNode, ordering co
 	weight := float64(len(ordering)+1) / float64(match+1)
 	v.cost *= weight
 
+	if match == len(ordering) && preferOrderMatching {
+		// Offset the non-covering index cost penalty.
+		v.cost *= (1.0 / nonCoveringIndexPenalty)
+	}
+
 	if log.V(2) {
 		log.Infof("%s: analyzeOrdering: weight=%0.2f reverse=%v index=%d requested=%d",
 			v.index.Name, weight, v.reverse, indexOrdering, ordering)

diff --git a/sql/testdata/explain_debug b/sql/testdata/explain_debug
@@ -68,12 +68,10 @@ EXPLAIN (DEBUG) SELECT * FROM abc ORDER BY b DESC
 query ITTT
 EXPLAIN (DEBUG) SELECT * FROM abc ORDER BY b DESC LIMIT 1 OFFSET 1
 ----
-0  /abc/primary/1/'one'    NULL               PARTIAL
-0  /abc/primary/1/'one'/c  1.1                BUFFERED
-1  /abc/primary/2/'two'    NULL               BUFFERED
-2  /abc/primary/3/'three'  NULL               BUFFERED
-0  0                       (2, 'two', NULL)   FILTERED
-1  1                       (3, 'three', NULL) ROW
+0  /abc/foo/'two'          /2    PARTIAL   
+0  /abc/primary/2/'two'    NULL  FILTERED  
+1  /abc/foo/'three'        /3    PARTIAL   
+1  /abc/primary/3/'three'  NULL  ROW       
 
 query ITTT
 EXPLAIN (DEBUG) SELECT * FROM abc WHERE a = 2

diff --git a/sql/testdata/select b/sql/testdata/select
@@ -168,6 +168,12 @@ SELECT * FROM xyzw LIMIT a
 query error argument of OFFSET must not contain variables
 SELECT * FROM xyzw OFFSET a
 
+query error negative value for LIMIT
+SELECT * FROM xyzw LIMIT -100
+
+query error negative value for OFFSET
+SELECT * FROM xyzw OFFSET -100
+
 query error unsupported binary operator: <int> \+ <float>
 SELECT * FROM xyzw OFFSET 1 + 0.0
 

diff --git a/sql/testdata/select_non_covering_index b/sql/testdata/select_non_covering_index
@@ -84,3 +84,42 @@ EXPLAIN SELECT * FROM t WHERE c > 0 AND d = 8
 0 index-join
 1 scan       t@c /1-
 1 scan       t@primary
+
+# The following testcases verify that when we have a small limit, we prefer an
+# order-matching index.
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c
+----
+0 sort       +c
+1 scan       t@primary -
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c LIMIT 5
+----
+0 limit      count: 5, offset: 0
+1 index-join
+2 scan       t@c -
+2 scan       t@primary
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c OFFSET 5
+----
+0 limit      count: ALL, offset: 5
+1 sort       +c
+2 scan       t@primary -
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c LIMIT 5 OFFSET 5
+----
+0 limit      count: 5, offset: 5
+1 index-join
+2 scan       t@c -
+2 scan       t@primary
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c LIMIT 1000000
+----
+0 limit      count: 1000000, offset: 0
+1 sort       +c (top 1000000)
+2 scan       t@primary -