storage: avoid blocking the command queue for scans with limits

SQL has a tendancy to create scans which cover a range's entire key span, though looking only to return a finite number of results. These requests end up blocking writes in the command queue, which block other reads, causing requsets to effectively serialize. In reality, when there is a scan with a limit, the actual affected key span ends up being a small subset of the range's key span. This change creates a new "pre-evaluation" execution path for read- only requests. When pre-evaluating, the batch still interacts with the command queue, but it neither waits for pre-requisites nor causes successor, dependent write batches from waiting on it. Instead, any prereqs or dependents are tracked while the read-only batch is immediately evaluated without blocking. On successful evaluation, the actual span(s) covered and recorded in the read-only batch's `BatchResponse` are compared to the pre-requisite and dependent commands. If there is no overlap, the timestamp cache is updated and the read-only batch response returned to the caller. If there is overlap, then the read-only batch is re-executed, but with the pre-evaluation option disabled, so that it proceeds to interact with the command queue in the traditional fashion, waiting on prereqs and making dependents wait on it. Release note: improved performance on workloads which mix OLAP queries with updates.
cockroachdb · Oct 25, 2018 · c2d3a00 · c2d3a00
1 parent 3ee3e10
commit c2d3a00
Show file tree

Hide file tree

Showing 4 changed files with 157 additions and 48 deletions.
diff --git a/pkg/storage/command_queue.go b/pkg/storage/command_queue.go
@@ -81,11 +81,12 @@ type CommandQueue struct {
 }
 
 type cmd struct {
-	id        int64
-	key       interval.Range
-	readOnly  bool
-	timestamp hlc.Timestamp
-	debugInfo summaryWriter
+	id          int64
+	key         interval.Range
+	readOnly    bool
+	preEvaluate bool // collects pre-requisites and dependents, instead of waiting
+	timestamp   hlc.Timestamp
+	debugInfo   summaryWriter
 
 	buffered bool // is this cmd buffered in readsBuffer
 	expanded bool // have the children been added
@@ -101,7 +102,10 @@ type cmd struct {
 	// in the prereq slice to avoid duplicates.
 	prereqIDs map[int64]struct{}
 
-	pending chan struct{} // closed when complete
+	// If pre-evaluating, collect commands which are prereqs or dependents.
+	prereqsAndDeps []*cmd
+
+	pending chan struct{} // closed when complete, or if pre-evaulating
 }
 
 // A summaryWriter is capable of writing a summary about itself. It is typically
@@ -277,6 +281,16 @@ func (c *cmd) ResolvePendingPrereq() {
 		}
 	}
 
+	// If the prereq is pre-evaluating, let it know that this command is
+	// a dependent.
+	if pre.preEvaluate {
+		pre.prereqsAndDeps = append(pre.prereqsAndDeps, c)
+	}
+	// If the command is pre-evaluating, keep track of prereqs.
+	if c.preEvaluate {
+		c.prereqsAndDeps = append(c.prereqsAndDeps, pre)
+	}
+
 	// Truncate the command's prerequisite list so that it no longer includes
 	// the first prerequisite. Before doing so, nil out prefix of slice to allow
 	// GC of the first command. Without this, large chunks of the dependency
@@ -289,7 +303,7 @@ func (c *cmd) ResolvePendingPrereq() {
 	delete(c.prereqIDs, pre.id)
 }
 
-// OptimisticallyResolvePrereqs removes all prerequisite in the cmd's prereq
+// OptimisticallyResolvePrereqs removes all prerequisites in the cmd's prereq
 // slice that have already finished without blocking on pending commands.
 // Prerequisite commands that are still pending or that were canceled are left
 // in the prereq slice.
@@ -313,6 +327,20 @@ func (c *cmd) OptimisticallyResolvePrereqs() {
 	(*c.prereqs) = (*c.prereqs)[:j]
 }
 
+// Overlaps returns whether the supplied range overlaps with any span
+// in the command.
+func (c *cmd) Overlaps(o interval.Range) bool {
+	if len(c.children) == 0 {
+		return interval.ExclusiveOverlapper.Overlap(o, c.key)
+	}
+	for _, child := range c.children {
+		if interval.ExclusiveOverlapper.Overlap(o, child.key) {
+			return true
+		}
+	}
+	return false
+}
+
 // NewCommandQueue returns a new command queue. The boolean specifies whether
 // to enable the covering span optimization. With this optimization, whenever
 // a command consisting of multiple spans is added, a covering span is computed
@@ -724,7 +752,7 @@ func (o *overlapHeap) PopOverlap() *cmd {
 //
 // Returns a nil `cmd` when no spans are given.
 func (cq *CommandQueue) add(
-	readOnly bool, timestamp hlc.Timestamp, prereqs []*cmd, spans []roachpb.Span,
+	readOnly, preEvaluate bool, timestamp hlc.Timestamp, prereqs []*cmd, spans []roachpb.Span,
 ) *cmd {
 	if len(spans) == 0 {
 		return nil
@@ -766,6 +794,7 @@ func (cq *CommandQueue) add(
 	cmd.id = cq.nextID()
 	cmd.key = coveringSpan.AsRange()
 	cmd.readOnly = readOnly
+	cmd.preEvaluate = preEvaluate
 	cmd.timestamp = timestamp
 	cmd.prereqsBuf = prereqs
 	cmd.prereqs = &cmd.prereqsBuf

diff --git a/pkg/storage/command_queue_test.go b/pkg/storage/command_queue_test.go
@@ -34,7 +34,7 @@ func getPrereqs(cq *CommandQueue, from, to roachpb.Key, readOnly bool) []*cmd {
 }
 
 func add(cq *CommandQueue, from, to roachpb.Key, readOnly bool, prereqs []*cmd) *cmd {
-	return cq.add(readOnly, zeroTS, prereqs, []roachpb.Span{{Key: from, EndKey: to}})
+	return cq.add(readOnly, false /* preEvaluate */, zeroTS, prereqs, []roachpb.Span{{Key: from, EndKey: to}})
 }
 
 func getPrereqsAndAdd(cq *CommandQueue, from, to roachpb.Key, readOnly bool) ([]*cmd, *cmd) {
@@ -292,7 +292,7 @@ func TestCommandQueueWithoutCoveringOptimization(t *testing.T) {
 	c := roachpb.Span{Key: roachpb.Key("c")}
 
 	{
-		cmd := cq.add(false, zeroTS, nil, []roachpb.Span{a, b})
+		cmd := cq.add(false, false, zeroTS, nil, []roachpb.Span{a, b})
 		if !cmd.expanded {
 			t.Errorf("expected non-expanded command, not %+v", cmd)
 		}
@@ -306,7 +306,7 @@ func TestCommandQueueWithoutCoveringOptimization(t *testing.T) {
 	}
 
 	{
-		cmd := cq.add(false, zeroTS, nil, []roachpb.Span{c})
+		cmd := cq.add(false, false, zeroTS, nil, []roachpb.Span{c})
 		if cmd.expanded {
 			t.Errorf("expected unexpanded command, not %+v", cmd)
 		}
@@ -360,16 +360,16 @@ func TestCommandQueueIssue6495(t *testing.T) {
 	}
 
 	cq.getPrereqs(false, zeroTS, spans1998)
-	cmd1998 := cq.add(false, zeroTS, nil, spans1998)
+	cmd1998 := cq.add(false, false, zeroTS, nil, spans1998)
 
 	cq.getPrereqs(true, zeroTS, spans1999)
-	cmd1999 := cq.add(true, zeroTS, nil, spans1999)
+	cmd1999 := cq.add(true, false, zeroTS, nil, spans1999)
 
 	cq.getPrereqs(true, zeroTS, spans2002)
-	cq.add(true, zeroTS, nil, spans2002)
+	cq.add(true, false, zeroTS, nil, spans2002)
 
 	cq.getPrereqs(false, zeroTS, spans2003)
-	cq.add(false, zeroTS, nil, spans2003)
+	cq.add(false, false, zeroTS, nil, spans2003)
 
 	cq.remove(cmd1998)
 	cq.remove(cmd1999)
@@ -404,27 +404,27 @@ func TestCommandQueueTimestamps(t *testing.T) {
 		mkSpan("e", "g"),
 	}
 
-	cmd1 := cq.add(true, makeTS(1, 0), nil, spans1)
+	cmd1 := cq.add(true, false, makeTS(1, 0), nil, spans1)
 
 	pre2 := cq.getPrereqs(true, makeTS(2, 0), spans2)
 	if pre2 != nil {
 		t.Errorf("expected nil prereq slice; got %+v", pre2)
 	}
-	cmd2 := cq.add(false, makeTS(2, 0), pre2, spans2)
+	cmd2 := cq.add(false, false, makeTS(2, 0), pre2, spans2)
 
 	pre3 := cq.getPrereqs(true, makeTS(3, 0), spans3)
 	if pre3 != nil {
 		t.Errorf("expected nil prereq slice; got %+v", pre3)
 	}
-	cmd3 := cq.add(false, makeTS(3, 0), pre3, spans3)
+	cmd3 := cq.add(false, false, makeTS(3, 0), pre3, spans3)
 
 	// spans4 should wait on spans3.children[1].
 	pre4 := cq.getPrereqs(true, makeTS(4, 0), spans4)
 	expPre := []*cmd{&cmd3.children[1]}
 	if !reflect.DeepEqual(expPre, pre4) {
 		t.Errorf("expected prereq commands %+v; got %+v", expPre, pre4)
 	}
-	cmd4 := cq.add(true, makeTS(4, 0), pre4, spans4)
+	cmd4 := cq.add(true, false, makeTS(4, 0), pre4, spans4)
 
 	// Verify that an earlier writer for whole span waits on all commands.
 	pre5 := cq.getPrereqs(false, makeTS(0, 1), []roachpb.Span{mkSpan("a", "g")})
@@ -500,14 +500,14 @@ func TestCommandQueueEnclosedRead(t *testing.T) {
 	}
 
 	// Add command 1.
-	cmd1 := cq.add(true, makeTS(2, 0), nil, spans1)
+	cmd1 := cq.add(true, false, makeTS(2, 0), nil, spans1)
 
 	// Add command 2.
 	pre := cq.getPrereqs(false, makeTS(3, 0), spans2)
 	if expPre := []*cmd(nil); !reflect.DeepEqual(expPre, pre) {
 		t.Errorf("expected prereq commands %+v; got %+v", expPre, pre)
 	}
-	cmd2 := cq.add(false, makeTS(3, 0), pre, spans2)
+	cmd2 := cq.add(false, false, makeTS(3, 0), pre, spans2)
 
 	// Add command 3.
 	pre = cq.getPrereqs(false, makeTS(1, 0), spansCandidate)
@@ -540,14 +540,14 @@ func TestCommandQueueEnclosedWrite(t *testing.T) {
 	}
 
 	// Add command 1.
-	cmd1 := cq.add(false, makeTS(3, 0), nil, spans1)
+	cmd1 := cq.add(false, false, makeTS(3, 0), nil, spans1)
 
 	// Add command 2.
 	pre := cq.getPrereqs(true, makeTS(2, 0), spans2)
 	if expPre := []*cmd(nil); !reflect.DeepEqual(expPre, pre) {
 		t.Errorf("expected prereq commands %+v; got %+v", expPre, pre)
 	}
-	cmd2 := cq.add(true, makeTS(2, 0), nil, spans2)
+	cmd2 := cq.add(true, false, makeTS(2, 0), nil, spans2)
 
 	// Add command 3.
 	pre = cq.getPrereqs(false, makeTS(1, 0), spansCandidate)
@@ -576,10 +576,10 @@ func TestCommandQueueTimestampsEmpty(t *testing.T) {
 		mkSpan("h", ""),
 	}
 
-	cmd1 := cq.add(true, zeroTS, nil, spansR)
-	cmd2 := cq.add(false, zeroTS, nil, spansW)
-	cmd3 := cq.add(true, makeTS(1, 0), nil, spansRTS)
-	cmd4 := cq.add(false, makeTS(1, 0), nil, spansWTS)
+	cmd1 := cq.add(true, false, zeroTS, nil, spansR)
+	cmd2 := cq.add(false, false, zeroTS, nil, spansW)
+	cmd3 := cq.add(true, false, makeTS(1, 0), nil, spansRTS)
+	cmd4 := cq.add(false, false, makeTS(1, 0), nil, spansWTS)
 
 	// A writer will depend on both zero-timestamp spans.
 	pre := cq.getPrereqs(false, makeTS(1, 0), []roachpb.Span{mkSpan("a", "f")})
@@ -677,7 +677,7 @@ func TestCommandQueueTransitiveDependencies(t *testing.T) {
 				{
 					cq := NewCommandQueue(true)
 
-					cq.add(ops1.readOnly, ops1.ts, nil, ops1.spans)
+					cq.add(ops1.readOnly, false, ops1.ts, nil, ops1.spans)
 
 					pre3 := cq.getPrereqs(ops3.readOnly, ops3.ts, ops3.spans)
 					if expectDependency := len(pre3) > 0; !expectDependency {
@@ -697,12 +697,12 @@ func TestCommandQueueTransitiveDependencies(t *testing.T) {
 					cq := NewCommandQueue(true)
 
 					// Add command 1.
-					cmd1 := cq.add(ops1.readOnly, ops1.ts, nil, ops1.spans)
+					cmd1 := cq.add(ops1.readOnly, false, ops1.ts, nil, ops1.spans)
 
 					// Add command 2, taking note of whether it depends on command 1.
 					pre2 := cq.getPrereqs(ops2.readOnly, ops2.ts, ops2.spans)
 					dependency2to1 := len(pre2) > 0
-					cmd2 := cq.add(ops2.readOnly, ops2.ts, pre2, ops2.spans)
+					cmd2 := cq.add(ops2.readOnly, false, ops2.ts, pre2, ops2.spans)
 
 					// Add command 3, taking note of whether it depends on command 1
 					// or on command 2.
@@ -755,7 +755,7 @@ func TestCommandQueueGetSnapshotWithChildren(t *testing.T) {
 	cmd2 := add(cq, roachpb.Key("a"), nil, true, []*cmd{cmd1})
 	// the following creates a node with two children because it has two spans
 	// only the children show up in the snapshot.
-	cq.add(true, zeroTS, []*cmd{cmd2}, []roachpb.Span{
+	cq.add(true, false, zeroTS, []*cmd{cmd2}, []roachpb.Span{
 		{Key: roachpb.Key("a"), EndKey: roachpb.Key("b")},
 		{Key: roachpb.Key("d"), EndKey: roachpb.Key("f")},
 	})
@@ -818,7 +818,7 @@ func BenchmarkCommandQueueGetPrereqsAllReadOnly(b *testing.B) {
 				EndKey: roachpb.Key("aaaaaaaaab"),
 			}}
 			for i := 0; i < size; i++ {
-				cq.add(true, zeroTS, nil, spans)
+				cq.add(true, false, zeroTS, nil, spans)
 			}
 
 			b.ResetTimer()
@@ -853,7 +853,7 @@ func BenchmarkCommandQueueReadWriteMix(b *testing.B) {
 					var cmd *cmd
 					readOnly := j%(readsPerWrite+1) != 0
 					prereqs := cq.getPrereqs(readOnly, zeroTS, spans)
-					cmd = cq.add(readOnly, zeroTS, prereqs, spans)
+					cmd = cq.add(readOnly, false, zeroTS, prereqs, spans)
 					if len(liveCmdQueue) == cap(liveCmdQueue) {
 						cq.remove(<-liveCmdQueue)
 					}