sql: prefer order-matching index if there is a limit #5446

Merged 2 commits on Mar 22, 2016
2 changes: 1 addition & 1 deletion sql/backfill.go
@@ -166,7 +166,7 @@ func (p *planner) backfillBatch(b *client.Batch, tableDesc *TableDescriptor) *ro
 		desc: *tableDesc,
 	}
 	scan.initDescDefaults()
-	rows := p.selectIndex(&selectNode{}, scan, nil, false)
+	rows := p.selectIndex(&selectNode{}, scan, nil, false, false)
 
 	// Construct a map from column ID to the index the value appears at within a
 	// row.
54 changes: 33 additions & 21 deletions sql/limit.go
@@ -25,60 +25,72 @@ import (
 	"github.com/cockroachdb/cockroach/util"
 )
 
-// limit constructs a limitNode based on the LIMIT and OFFSET clauses.
-func (p *planner) limit(limit *parser.Limit, plan planNode) (planNode, error) {
+// evalLimit evaluates the Count and Offset fields. If Count is missing, the
+// value is MaxInt64. If Offset is missing, the value is 0.
+func (p *planner) evalLimit(limit *parser.Limit) (count, offset int64, err error) {
+	count = math.MaxInt64
+	offset = 0
+
 	if limit == nil {
-		return plan, nil
+		return count, offset, nil
 	}
 
-	var count, offset int64
-
 	data := []struct {
-		name       string
-		src        parser.Expr
-		dst        *int64
-		defaultVal int64
+		name string
+		src  parser.Expr
+		dst  *int64
 	}{
-		{"LIMIT", limit.Count, &count, math.MaxInt64},
-		{"OFFSET", limit.Offset, &offset, 0},
+		{"LIMIT", limit.Count, &count},
+		{"OFFSET", limit.Offset, &offset},
 	}
 
 	for _, datum := range data {
-		if datum.src == nil {
-			*datum.dst = datum.defaultVal
-		} else {
+		if datum.src != nil {
 			if parser.ContainsVars(datum.src) {
-				return nil, util.Errorf("argument of %s must not contain variables", datum.name)
+				return 0, 0, util.Errorf("argument of %s must not contain variables", datum.name)
 			}
 
 			normalized, err := p.parser.NormalizeExpr(p.evalCtx, datum.src)
 			if err != nil {
-				return nil, err
+				return 0, 0, err
 			}
 			dstDatum, err := normalized.Eval(p.evalCtx)
 			if err != nil {
-				return nil, err
+				return 0, 0, err
 			}
 
 			if dstDatum == parser.DNull {
-				*datum.dst = datum.defaultVal
+				// Use the default value.
+				continue
 			}
 
 			if dstDInt, ok := dstDatum.(parser.DInt); ok {
-				*datum.dst = int64(dstDInt)
+				val := int64(dstDInt)
+				if val < 0 {
+					return 0, 0, fmt.Errorf("negative value for %s", datum.name)
+				}
+				*datum.dst = val
+				continue
 			}
 
-			return nil, fmt.Errorf("argument of %s must be type %s, not type %s", datum.name, parser.DummyInt.Type(), dstDatum.Type())
+			return 0, 0, fmt.Errorf("argument of %s must be type %s, not type %s",
+				datum.name, parser.DummyInt.Type(), dstDatum.Type())
 		}
 	}
+	return count, offset, nil
+}
+
+// limit constructs a limitNode based on the LIMIT and OFFSET clauses.
+func (p *planner) limit(count, offset int64, plan planNode) planNode {
+	if count == math.MaxInt64 && offset == 0 {
+		return plan
+	}
+
+	if count != math.MaxInt64 {
+		plan.SetLimitHint(offset+count, false /* hard */)
+	}
+
-	return &limitNode{planNode: plan, count: count, offset: offset}, nil
+	return &limitNode{planNode: plan, count: count, offset: offset}
 }
 
 type limitNode struct {
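
Note: the refactor splits the old limit into two steps; evalLimit computes the effective count and offset once, and limit consumes the precomputed values. Below is a minimal standalone sketch of the evaluation rules (a missing LIMIT defaults to math.MaxInt64, a missing OFFSET to 0, negative values are rejected). The planner's real method also normalizes and evaluates constant SQL expressions, so this helper and its signature are illustrative only.

package main

import (
	"fmt"
	"math"
)

// evalLimit mirrors the defaulting and validation rules of the patch,
// but takes pre-evaluated values instead of SQL expressions (hypothetical
// helper, not the planner's code).
func evalLimit(limit, offset *int64) (count, off int64, err error) {
	count, off = math.MaxInt64, 0
	for _, c := range []struct {
		name string
		src  *int64
		dst  *int64
	}{
		{"LIMIT", limit, &count},
		{"OFFSET", offset, &off},
	} {
		if c.src == nil {
			continue // keep the default value
		}
		if *c.src < 0 {
			return 0, 0, fmt.Errorf("negative value for %s", c.name)
		}
		*c.dst = *c.src
	}
	return count, off, nil
}

func main() {
	five := int64(5)
	if count, offset, err := evalLimit(&five, nil); err == nil {
		fmt.Println(count, offset) // 5 0
	}
	neg := int64(-100)
	if _, _, err := evalLimit(nil, &neg); err != nil {
		fmt.Println(err) // negative value for OFFSET
	}
}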
65 changes: 44 additions & 21 deletions sql/select.go
@@ -199,12 +199,11 @@ func (p *planner) Select(n *parser.Select, autoCommit bool) (planNode, *roachpb.
 		if pberr != nil {
 			return nil, pberr
 		}
-		var err error
-		plan, err = p.limit(limit, sort.wrap(plan))
+		count, offset, err := p.evalLimit(limit)
 		if err != nil {
 			return nil, roachpb.NewError(err)
 		}
-		return plan, nil
+		return p.limit(count, offset, sort.wrap(plan)), nil
 	}
 }
 
@@ -270,39 +269,52 @@ func (p *planner) initSelect(
 		ordering = sort.Ordering().ordering
 	}
 
+	limitCount, limitOffset, err := p.evalLimit(limit)
+	if err != nil {
+		return nil, roachpb.NewError(err)
+	}
+
 	if scan, ok := s.table.node.(*scanNode); ok {
-		// Find the set of columns that we actually need values for. This is an optimization to avoid
-		// unmarshaling unnecessary values and is also used for index selection.
+		// Find the set of columns that we actually need values for. This is an
+		// optimization to avoid unmarshaling unnecessary values and is also
+		// used for index selection.
 		neededCols := make([]bool, len(s.table.columns))
 		for i := range neededCols {
 			_, ok := s.qvals[columnRef{&s.table, i}]
 			neededCols[i] = ok
 		}
 		scan.setNeededColumns(neededCols)
 
-		// If we are only preparing, the filter expression can contain unexpanded subqueries which
-		// are not supported by splitFilter.
+		// If we are only preparing, the filter expression can contain
+		// unexpanded subqueries which are not supported by splitFilter.
 		if !p.evalCtx.PrepareOnly {
 			// Compute a filter expression for the scan node.
 			convFunc := func(expr parser.VariableExpr) (bool, parser.VariableExpr) {
 				qval := expr.(*qvalue)
 				if qval.colRef.table != &s.table {
-					// TODO(radu): when we will support multiple tables, this will be a valid
-					// case.
+					// TODO(radu): when we will support multiple tables, this
+					// will be a valid case.
					panic("scan qvalue refers to unknown table")
 				}
 				return true, scan.getQValue(qval.colRef.colIdx)
 			}
 
 			scan.filter, s.filter = splitFilter(s.filter, convFunc)
 			if s.filter != nil {
-				// Right now we support only one table, so the entire expression should be
-				// converted.
+				// Right now we support only one table, so the entire expression
+				// should be converted.
 				panic(fmt.Sprintf("residual filter `%s` (scan filter `%s`)", s.filter, scan.filter))
 			}
 		}
 
-		plan := p.selectIndex(s, scan, ordering, grouping)
+		// If we have a reasonable limit, prefer an order matching index even if
+		// it is not covering - unless we are grouping, in which case the limit
+		// applies to the grouping results and not to the rows we scan.
+		var preferOrderMatchingIndex bool
+		if !grouping && len(ordering) > 0 && limitCount <= 1000-limitOffset {
+			preferOrderMatchingIndex = true
+		}
+		plan := p.selectIndex(s, scan, ordering, grouping, preferOrderMatchingIndex)
 
 		// Update s.table with the new plan.
 		s.table.node = plan
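
Note on the heuristic above: since the limit is applied after the offset, the scan has to produce up to offset+count rows, so the preference only kicks in when that total is at most 1000. A hypothetical standalone helper (preferOrderMatching below is not the patch's code) that mirrors the check:

package main

import (
	"fmt"
	"math"
)

// preferOrderMatching returns true when the query is ordered, not grouped,
// and the scan would stop after at most offset+count rows, with 1000 as the
// cutoff used in the patch.
func preferOrderMatching(grouping bool, orderingLen int, count, offset int64) bool {
	// The bound is written count <= 1000-offset rather than
	// count+offset <= 1000 because count is math.MaxInt64 when the query
	// has no LIMIT, and the addition would overflow.
	return !grouping && orderingLen > 0 && count <= 1000-offset
}

func main() {
	fmt.Println(preferOrderMatching(false, 1, 5, 5))             // true: LIMIT 5 OFFSET 5
	fmt.Println(preferOrderMatching(false, 1, 1000000, 0))       // false: limit too large
	fmt.Println(preferOrderMatching(false, 1, math.MaxInt64, 5)) // false: no LIMIT given
	fmt.Println(preferOrderMatching(true, 1, 5, 0))              // false: limit applies to groups
}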
@@ -311,11 +323,7 @@ func (p *planner) initSelect(
 	s.ordering = s.computeOrdering(s.table.node.Ordering())
 
 	// Wrap this node as necessary.
-	limitNode, err := p.limit(limit, p.distinct(parsed, sort.wrap(group.wrap(s))))
-	if err != nil {
-		return nil, roachpb.NewError(err)
-	}
-	return limitNode, nil
+	return p.limit(limitCount, limitOffset, p.distinct(parsed, sort.wrap(group.wrap(s)))), nil
 }
 
 // Initializes the table node, given the parsed select expression
@@ -645,6 +653,8 @@ func (s *selectNode) computeOrdering(fromOrder orderingInfo) orderingInfo {
 	return ordering
 }
 
+const nonCoveringIndexPenalty = 10
+
 // selectIndex analyzes the scanNode to determine if there is an index
 // available that can fulfill the query with a more restrictive scan.
 //
@@ -656,7 +666,11 @@ func (s *selectNode) computeOrdering(fromOrder orderingInfo) orderingInfo {
 // transformed into a set of spans to scan within the index.
 //
 // If grouping is true, the ordering is the desired ordering for grouping.
-func (p *planner) selectIndex(sel *selectNode, s *scanNode, ordering columnOrdering, grouping bool) planNode {
+//
+// If preferOrderMatching is true, we prefer an index that matches the desired
+// ordering completely, even if it is not a covering index.
+func (p *planner) selectIndex(sel *selectNode, s *scanNode, ordering columnOrdering, grouping,
+	preferOrderMatching bool) planNode {
 	if s.desc.isEmpty() || (s.filter == nil && ordering == nil) {
 		// No table or no where-clause and no ordering.
 		s.initOrdering(0)
@@ -737,7 +751,7 @@ func (p *planner) selectIndex(sel *selectNode, s *scanNode, ordering columnOrder
 
 	if ordering != nil {
 		for _, c := range candidates {
-			c.analyzeOrdering(sel, s, ordering)
+			c.analyzeOrdering(sel, s, ordering, preferOrderMatching)
 		}
 	}
 
@@ -864,7 +878,7 @@ func (v *indexInfo) init(s *scanNode) {
 			v.cost += float64(1 + len(v.desc.Columns) - len(v.desc.PrimaryIndex.ColumnIDs))
 			// Non-covering indexes are significantly more expensive than covering
 			// indexes.
-			v.cost *= 10
+			v.cost *= nonCoveringIndexPenalty
 		}
 	}
 }
@@ -891,7 +905,11 @@ func (v *indexInfo) analyzeExprs(exprs []parser.Exprs) {
 // analyzeOrdering analyzes the ordering provided by the index and determines
 // if it matches the ordering requested by the query. Non-matching orderings
 // increase the cost of using the index.
-func (v *indexInfo) analyzeOrdering(sel *selectNode, scan *scanNode, ordering columnOrdering) {
+//
+// If preferOrderMatching is true, we prefer an index that matches the desired
+// ordering completely, even if it is not a covering index.
+func (v *indexInfo) analyzeOrdering(sel *selectNode, scan *scanNode, ordering columnOrdering,
+	preferOrderMatching bool) {
 	// Compute the prefix of the index for which we have exact constraints. This
 	// prefix is inconsequential for ordering because the values are identical.
 	v.exactPrefix = exactPrefix(v.constraints)
@@ -918,6 +936,11 @@ func (v *indexInfo) analyzeOrdering(sel *selectNode, scan *scanNode, ordering co
 	weight := float64(len(ordering)+1) / float64(match+1)
 	v.cost *= weight
 
+	if match == len(ordering) && preferOrderMatching {
+		// Offset the non-covering index cost penalty.
+		v.cost *= (1.0 / nonCoveringIndexPenalty)
+	}
+
 	if log.V(2) {
 		log.Infof("%s: analyzeOrdering: weight=%0.2f reverse=%v index=%d requested=%d",
 			v.index.Name, weight, v.reverse, indexOrdering, ordering)
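
Note on the cost arithmetic: a non-covering index pays a 10x penalty in indexInfo.init; an index that fully matches the requested ordering gets weight (len(ordering)+1)/(match+1) = 1, and with preferOrderMatching set, the 1/10 factor above cancels the penalty exactly. A small illustrative sketch (indexCost and the base cost of 1 are hypothetical simplifications, not the planner's real cost model):

package main

import "fmt"

const nonCoveringIndexPenalty = 10

// indexCost shows how the two multipliers combine (illustrative numbers
// only; the real cost also reflects key counts and constraint spans).
func indexCost(base float64, covering bool, orderingLen, match int, preferOrderMatching bool) float64 {
	cost := base
	if !covering {
		cost *= nonCoveringIndexPenalty // penalty from indexInfo.init
	}
	cost *= float64(orderingLen+1) / float64(match+1) // ordering weight
	if match == orderingLen && preferOrderMatching {
		cost *= 1.0 / nonCoveringIndexPenalty // cancel the penalty
	}
	return cost
}

func main() {
	// ORDER BY c LIMIT 5: the non-covering index t@c matches the ordering
	// fully (match=1); the covering primary index does not match (match=0).
	fmt.Println(indexCost(1, false, 1, 1, true)) // t@c: 1 * 10 * 1 * 0.1 = 1
	fmt.Println(indexCost(1, true, 1, 0, true))  // t@primary: 1 * 2 = 2
}

This is why the EXPLAIN tests below pick t@c with an index-join for LIMIT 5, but fall back to sorting t@primary for LIMIT 1000000, where the preference flag stays off.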
10 changes: 4 additions & 6 deletions sql/testdata/explain_debug
@@ -68,12 +68,10 @@ EXPLAIN (DEBUG) SELECT * FROM abc ORDER BY b DESC
 query ITTT
 EXPLAIN (DEBUG) SELECT * FROM abc ORDER BY b DESC LIMIT 1 OFFSET 1
 ----
-0 /abc/primary/1/'one'   NULL PARTIAL
-0 /abc/primary/1/'one'/c 1.1  BUFFERED
-1 /abc/primary/2/'two'   NULL BUFFERED
-2 /abc/primary/3/'three' NULL BUFFERED
-0 0 (2, 'two', NULL)   FILTERED
-1 1 (3, 'three', NULL) ROW
+0 /abc/foo/'two'         /2   PARTIAL
+0 /abc/primary/2/'two'   NULL FILTERED
+1 /abc/foo/'three'       /3   PARTIAL
+1 /abc/primary/3/'three' NULL ROW
 
 query ITTT
 EXPLAIN (DEBUG) SELECT * FROM abc WHERE a = 2
6 changes: 6 additions & 0 deletions sql/testdata/select
@@ -168,6 +168,12 @@ SELECT * FROM xyzw LIMIT a
 query error argument of OFFSET must not contain variables
 SELECT * FROM xyzw OFFSET a
 
+query error negative value for LIMIT
+SELECT * FROM xyzw LIMIT -100
+
+query error negative value for OFFSET
+SELECT * FROM xyzw OFFSET -100
+
 query error unsupported binary operator: <int> \+ <float>
 SELECT * FROM xyzw OFFSET 1 + 0.0
 
39 changes: 39 additions & 0 deletions sql/testdata/select_non_covering_index
@@ -84,3 +84,42 @@ EXPLAIN SELECT * FROM t WHERE c > 0 AND d = 8
 0 index-join
 1 scan t@c /1-
 1 scan t@primary
+
+# The following testcases verify that when we have a small limit, we prefer an
+# order-matching index.
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c
+----
+0 sort +c
+1 scan t@primary -
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c LIMIT 5
+----
+0 limit count: 5, offset: 0
+1 index-join
+2 scan t@c -
+2 scan t@primary
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c OFFSET 5
+----
+0 limit count: ALL, offset: 5
+1 sort +c
+2 scan t@primary -
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c LIMIT 5 OFFSET 5
+----
+0 limit count: 5, offset: 5
+1 index-join
+2 scan t@c -
+2 scan t@primary
+
+query ITT
+EXPLAIN SELECT * FROM t ORDER BY c LIMIT 1000000
+----
+0 limit count: 1000000, offset: 0
+1 sort +c (top 1000000)
+2 scan t@primary -