From 88b16bed25b0295fdedf19b9fbfcd537d7ad1da8 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Sun, 28 Oct 2018 15:37:06 -0400 Subject: [PATCH 01/11] util/interval: Fix memory leak in removeAt methods google/btree#9 Release note: None --- pkg/util/interval/btree_based_interval.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/util/interval/btree_based_interval.go b/pkg/util/interval/btree_based_interval.go index b47eaaee39c0..9a7be676cd68 100644 --- a/pkg/util/interval/btree_based_interval.go +++ b/pkg/util/interval/btree_based_interval.go @@ -79,8 +79,8 @@ func (s *items) insertAt(index int, e Interface) { // back. func (s *items) removeAt(index int) Interface { e := (*s)[index] - (*s)[index] = nil copy((*s)[index:], (*s)[index+1:]) + (*s)[len(*s)-1] = nil *s = (*s)[:len(*s)-1] return e } @@ -125,8 +125,8 @@ func (s *children) insertAt(index int, n *node) { // back. func (s *children) removeAt(index int) *node { n := (*s)[index] - (*s)[index] = nil copy((*s)[index:], (*s)[index+1:]) + (*s)[len(*s)-1] = nil *s = (*s)[:len(*s)-1] return n } From 862e0925dfb2bc2113d0f4b30d8e5ba4ba490a49 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Sun, 28 Oct 2018 15:44:14 -0400 Subject: [PATCH 02/11] util/interval: Fix memory leak in freeNode and splitNode google/btree#11 Release note: None --- pkg/util/interval/btree_based_interval.go | 29 +++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/pkg/util/interval/btree_based_interval.go b/pkg/util/interval/btree_based_interval.go index 9a7be676cd68..70431163074a 100644 --- a/pkg/util/interval/btree_based_interval.go +++ b/pkg/util/interval/btree_based_interval.go @@ -29,6 +29,11 @@ const ( DefaultBTreeMinimumDegree = 32 ) +var ( + nilItems = make(items, 16) + nilChildren = make(children, 16) +) + var _ = newBTree // newBTree creates a new interval tree with the given overlapper function and @@ -94,6 +99,16 @@ func (s *items) pop() (out Interface) { return 
} +// truncate truncates this instance at index so that it contains only the +// first index items. index must be less than or equal to length. +func (s *items) truncate(index int) { + var toClear items + *s, toClear = (*s)[:index], (*s)[index:] + for len(toClear) > 0 { + toClear = toClear[copy(toClear, nilItems):] + } +} + // find returns the index where the given Interface should be inserted into this // list. 'found' is true if the interface already exists in the list at the // given index. @@ -140,6 +155,16 @@ func (s *children) pop() (out *node) { return } +// truncate truncates this instance at index so that it contains only the +// first index children. index must be less than or equal to length. +func (s *children) truncate(index int) { + var toClear children + *s, toClear = (*s)[:index], (*s)[index:] + for len(toClear) > 0 { + toClear = toClear[copy(toClear, nilChildren):] + } +} + // node is an internal node in a tree. // // It must at all times maintain the invariant that either @@ -182,11 +207,11 @@ func (n *node) split(i int, fast bool) (Interface, *node) { second := n.t.newNode() second.items = make(items, n.t.minItems()) copy(second.items, n.items[i+1:]) - n.items = n.items[:i] + n.items.truncate(i) if len(n.children) > 0 { second.children = make(children, n.t.minItems()+1) copy(second.children, n.children[i+1:]) - n.children = n.children[:i+1] + n.children.truncate(i + 1) } if !fast { // adjust range for the first split part From 7df4672b7e86650a70afc1e7dcfec6f887059452 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Sun, 28 Oct 2018 16:16:25 -0400 Subject: [PATCH 03/11] util/interval: add btree benchmarks These were all taken directly from https://github.com/google/btree Release note: None --- .../interval/btree_based_interval_test.go | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/pkg/util/interval/btree_based_interval_test.go b/pkg/util/interval/btree_based_interval_test.go index 387f85ff58d6..a0dc0ee7c6db 100644 
--- a/pkg/util/interval/btree_based_interval_test.go +++ b/pkg/util/interval/btree_based_interval_test.go @@ -613,3 +613,75 @@ func TestIterator(t *testing.T) { tree.AdjustRanges() checkIterator(t, tree, ivs) } + +const benchmarkTreeSize = 10000 + +func BenchmarkBTreeInsert(b *testing.B) { + b.StopTimer() + insertP := perm(benchmarkTreeSize) + b.StartTimer() + i := 0 + for i < b.N { + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + tr.Insert(item, false) + i++ + if i >= b.N { + return + } + } + } +} + +func BenchmarkBTreeDelete(b *testing.B) { + b.StopTimer() + insertP := perm(benchmarkTreeSize) + removeP := perm(benchmarkTreeSize) + b.StartTimer() + i := 0 + for i < b.N { + b.StopTimer() + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + } + b.StartTimer() + for _, item := range removeP { + tr.Delete(item, false) + i++ + if i >= b.N { + return + } + } + if tr.Len() > 0 { + panic(tr.Len()) + } + } +} + +func BenchmarkBTreeGet(b *testing.B) { + b.StopTimer() + insertP := perm(benchmarkTreeSize) + removeP := perm(benchmarkTreeSize) + b.StartTimer() + i := 0 + for i < b.N { + b.StopTimer() + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + } + b.StartTimer() + for _, item := range removeP { + tr.Get(item.Range()) + i++ + if i >= b.N { + return + } + } + } +} From 8454d342a260fc261680388f7f170492bc7dcc02 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Sun, 28 Oct 2018 16:16:57 -0400 Subject: [PATCH 04/11] util/interval: use freelist of nodes to avoid allocs google/btree#5 Release note: None --- pkg/util/interval/btree_based_interval.go | 49 ++++++++++++++++++++++- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/pkg/util/interval/btree_based_interval.go b/pkg/util/interval/btree_based_interval.go index 70431163074a..4e0f2c45aec5 100644 --- 
a/pkg/util/interval/btree_based_interval.go +++ b/pkg/util/interval/btree_based_interval.go @@ -27,6 +27,8 @@ const ( // DefaultBTreeMinimumDegree is the default B-tree minimum degree. Benchmarks // show that the interval tree performs best with this minimum degree. DefaultBTreeMinimumDegree = 32 + // DefaultBTreeFreeListSize is the default size of a B-tree's freelist. + DefaultBTreeFreeListSize = 32 ) var ( @@ -34,7 +36,36 @@ var ( nilChildren = make(children, 16) ) -var _ = newBTree +// FreeList represents a free list of btree nodes. By default each +// BTree has its own FreeList, but multiple BTrees can share the same +// FreeList. +// Two Btrees using the same freelist are not safe for concurrent write access. +type FreeList struct { + freelist []*node +} + +// NewFreeList creates a new free list. +// size is the maximum size of the returned free list. +func NewFreeList(size int) *FreeList { + return &FreeList{freelist: make([]*node, 0, size)} +} + +func (f *FreeList) newNode() (n *node) { + index := len(f.freelist) - 1 + if index < 0 { + return new(node) + } + n = f.freelist[index] + f.freelist[index] = nil + f.freelist = f.freelist[:index] + return +} + +func (f *FreeList) freeNode(n *node) { + if len(f.freelist) < cap(f.freelist) { + f.freelist = append(f.freelist, n) + } +} // newBTree creates a new interval tree with the given overlapper function and // the default B-tree minimum degree. 
@@ -55,6 +86,7 @@ func newBTreeWithDegree(overlapper Overlapper, minimumDegree int) *btree { return &btree{ MinimumDegree: minimumDegree, Overlapper: overlapper, + freelist: NewFreeList(DefaultBTreeFreeListSize), } } @@ -751,6 +783,7 @@ func (n *node) mergeWithRightChild(i int, fast bool) { child.Range.End = mergeChild.Range.End } } + n.t.freeNode(mergeChild) } var _ Tree = (*btree)(nil) @@ -767,6 +800,7 @@ type btree struct { length int Overlapper Overlapper MinimumDegree int + freelist *FreeList } // adjustRange sets the Range to the maximum extent of the childrens' Range @@ -838,10 +872,19 @@ func (t *btree) minItems() int { } func (t *btree) newNode() (n *node) { - n = &node{t: t} + n = t.freelist.newNode() + n.t = t return } +func (t *btree) freeNode(n *node) { + // clear to allow GC + n.items.truncate(0) + n.children.truncate(0) + n.t = nil // clear to allow GC + t.freelist.freeNode(n) +} + func (t *btree) Insert(e Interface, fast bool) (err error) { // t.metrics("Insert") if err = isValidInterface(e); err != nil { @@ -890,7 +933,9 @@ func (t *btree) Delete(e Interface, fast bool) (err error) { func (t *btree) delete(e Interface, typ toRemove, fast bool) Interface { out, _ := t.root.remove(e, t.minItems(), typ, fast) if len(t.root.items) == 0 && len(t.root.children) > 0 { + oldroot := t.root t.root = t.root.children[0] + t.freeNode(oldroot) } if out != nil { t.length-- From bcb3c920f3386b685b79d220a67adc8ecae734cd Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Sun, 28 Oct 2018 16:20:26 -0400 Subject: [PATCH 05/11] util/interval: unexport btree fields Release note: None --- pkg/util/interval/btree_based_interval.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/util/interval/btree_based_interval.go b/pkg/util/interval/btree_based_interval.go index 4e0f2c45aec5..ae667242a981 100644 --- a/pkg/util/interval/btree_based_interval.go +++ b/pkg/util/interval/btree_based_interval.go @@ -84,8 +84,8 @@ func 
newBTreeWithDegree(overlapper Overlapper, minimumDegree int) *btree { panic("bad minimum degree") } return &btree{ - MinimumDegree: minimumDegree, - Overlapper: overlapper, + minimumDegree: minimumDegree, + overlapper: overlapper, freelist: NewFreeList(DefaultBTreeFreeListSize), } } @@ -332,7 +332,7 @@ func (t *btree) isEmpty() bool { } func (t *btree) Get(r Range) (o []Interface) { - return t.GetWithOverlapper(r, t.Overlapper) + return t.GetWithOverlapper(r, t.overlapper) } func (t *btree) GetWithOverlapper(r Range, overlapper Overlapper) (o []Interface) { @@ -353,11 +353,11 @@ func (t *btree) DoMatching(fn Operation, r Range) bool { if !t.overlappable(r) { return false } - return t.root.doMatch(fn, r, t.Overlapper) + return t.root.doMatch(fn, r, t.overlapper) } func (t *btree) overlappable(r Range) bool { - if t.isEmpty() || !t.Overlapper.Overlap(r, t.root.Range) { + if t.isEmpty() || !t.overlapper.Overlap(r, t.root.Range) { return false } return true @@ -798,8 +798,8 @@ var _ Tree = (*btree)(nil) type btree struct { root *node length int - Overlapper Overlapper - MinimumDegree int + overlapper Overlapper + minimumDegree int freelist *FreeList } @@ -862,13 +862,13 @@ func (n *node) adjustRanges() { // maxItems returns the max number of Interfaces to allow per node. func (t *btree) maxItems() int { - return t.MinimumDegree*2 - 1 + return t.minimumDegree*2 - 1 } // minItems returns the min number of Interfaces to allow per node (ignored // for the root node). 
func (t *btree) minItems() int { - return t.MinimumDegree - 1 + return t.minimumDegree - 1 } func (t *btree) newNode() (n *node) { From 5b1ad6864b61707a7f12f317b12a93d589dbbf4d Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Sun, 28 Oct 2018 17:43:43 -0400 Subject: [PATCH 06/11] util/interval: O(1) Clone of BTree google/btree#16 Release note: None --- pkg/util/interval/btree_based_interval.go | 175 +++++++++++++----- .../interval/btree_based_interval_test.go | 164 +++++++++++++++- pkg/util/interval/interval.go | 6 +- pkg/util/interval/llrb_based_interval.go | 4 + 4 files changed, 302 insertions(+), 47 deletions(-) diff --git a/pkg/util/interval/btree_based_interval.go b/pkg/util/interval/btree_based_interval.go index ae667242a981..57d5a07babab 100644 --- a/pkg/util/interval/btree_based_interval.go +++ b/pkg/util/interval/btree_based_interval.go @@ -21,6 +21,7 @@ import ( "sort" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/syncutil" ) const ( @@ -39,8 +40,9 @@ var ( // FreeList represents a free list of btree nodes. By default each // BTree has its own FreeList, but multiple BTrees can share the same // FreeList. -// Two Btrees using the same freelist are not safe for concurrent write access. +// Two Btrees using the same freelist are safe for concurrent write access. 
type FreeList struct { + mu syncutil.Mutex freelist []*node } @@ -51,20 +53,25 @@ func NewFreeList(size int) *FreeList { } func (f *FreeList) newNode() (n *node) { + f.mu.Lock() index := len(f.freelist) - 1 if index < 0 { + f.mu.Unlock() return new(node) } n = f.freelist[index] f.freelist[index] = nil f.freelist = f.freelist[:index] + f.mu.Unlock() return } func (f *FreeList) freeNode(n *node) { + f.mu.Lock() if len(f.freelist) < cap(f.freelist) { f.freelist = append(f.freelist, n) } + f.mu.Unlock() } // newBTree creates a new interval tree with the given overlapper function and @@ -83,10 +90,11 @@ func newBTreeWithDegree(overlapper Overlapper, minimumDegree int) *btree { if minimumDegree < 2 { panic("bad minimum degree") } + f := NewFreeList(DefaultBTreeFreeListSize) return &btree{ minimumDegree: minimumDegree, overlapper: overlapper, - freelist: NewFreeList(DefaultBTreeFreeListSize), + cow: ©OnWriteContext{freelist: f}, } } @@ -212,7 +220,35 @@ type node struct { Range Range items items children children - t *btree + cow *copyOnWriteContext +} + +func (n *node) mutableFor(cow *copyOnWriteContext) *node { + if n.cow == cow { + return n + } + out := cow.newNode() + out.Range = n.Range + if cap(out.items) >= len(n.items) { + out.items = out.items[:len(n.items)] + } else { + out.items = make(items, len(n.items), cap(n.items)) + } + copy(out.items, n.items) + // Copy children + if cap(out.children) >= len(n.children) { + out.children = out.children[:len(n.children)] + } else { + out.children = make(children, len(n.children), cap(n.children)) + } + copy(out.children, n.children) + return out +} + +func (n *node) mutableChild(i int) *node { + c := n.children[i].mutableFor(n.cow) + n.children[i] = c + return c } // split splits the given node at the given index. 
The current node shrinks, and @@ -236,13 +272,11 @@ type node struct { // func (n *node) split(i int, fast bool) (Interface, *node) { e := n.items[i] - second := n.t.newNode() - second.items = make(items, n.t.minItems()) - copy(second.items, n.items[i+1:]) + second := n.cow.newNode() + second.items = append(second.items, n.items[i+1:]...) n.items.truncate(i) if len(n.children) > 0 { - second.children = make(children, n.t.minItems()+1) - copy(second.children, n.children[i+1:]) + second.children = append(second.children, n.children[i+1:]...) n.children.truncate(i + 1) } if !fast { @@ -250,7 +284,7 @@ func (n *node) split(i int, fast bool) (Interface, *node) { oldRangeEnd := n.Range.End n.Range.End = n.rangeEnd() - // adjust ragne for the second split part + // adjust range for the second split part second.Range.Start = second.rangeStart() if n.Range.End.Equal(oldRangeEnd) || e.Range().End.Equal(oldRangeEnd) { second.Range.End = second.rangeEnd() @@ -263,12 +297,11 @@ func (n *node) split(i int, fast bool) (Interface, *node) { // maybeSplitChild checks if a child should be split, and if so splits it. // Returns whether or not a split occurred. -func (n *node) maybeSplitChild(i int, fast bool) bool { - maxItems := n.t.maxItems() +func (n *node) maybeSplitChild(i, maxItems int, fast bool) bool { if len(n.children[i].items) < maxItems { return false } - first := n.children[i] + first := n.mutableChild(i) e, second := first.split(maxItems/2, fast) n.items.insertAt(i, e) n.children.insertAt(i+1, second) @@ -277,7 +310,7 @@ func (n *node) maybeSplitChild(i int, fast bool) bool { // insert inserts an Interface into the subtree rooted at this node, making sure // no nodes in the subtree exceed maxItems Interfaces. 
-func (n *node) insert(e Interface, fast bool) (out Interface, extended bool) { +func (n *node) insert(e Interface, maxItems int, fast bool) (out Interface, extended bool) { i, found := n.items.find(e) if found { out = n.items[i] @@ -299,7 +332,7 @@ func (n *node) insert(e Interface, fast bool) (out Interface, extended bool) { } return } - if n.maybeSplitChild(i, fast) { + if n.maybeSplitChild(i, maxItems, fast) { inTree := n.items[i] switch Compare(e, inTree) { case -1: @@ -312,7 +345,7 @@ func (n *node) insert(e Interface, fast bool) (out Interface, extended bool) { return } } - out, extended = n.children[i].insert(e, fast) + out, extended = n.mutableChild(i).insert(e, maxItems, fast) if !fast && extended { extended = false if i == 0 && n.children[0].Range.Start.Compare(n.Range.Start) < 0 { @@ -521,11 +554,11 @@ func (n *node) remove( panic("invalid remove type") } // If we get to here, we have children. - child := n.children[i] - if len(child.items) <= minItems { + if len(n.children[i].items) <= minItems { out, shrunk = n.growChildAndRemove(i, e, minItems, typ, fast) return } + child := n.mutableChild(i) // Either we had enough interfaces to begin with, or we've done some // merging/stealing, because we've got enough now and we're ready to return // stuff. 
@@ -660,8 +693,8 @@ func (n *node) growChildAndRemove( // func (n *node) stealFromLeftChild(i int, fast bool) { // steal - stealTo := n.children[i] - stealFrom := n.children[i-1] + stealTo := n.mutableChild(i) + stealFrom := n.mutableChild(i - 1) x := stealFrom.items.pop() y := n.items[i-1] stealTo.items.insertAt(0, y) @@ -717,8 +750,8 @@ func (n *node) stealFromLeftChild(i int, fast bool) { // func (n *node) stealFromRightChild(i int, fast bool) { // steal - stealTo := n.children[i] - stealFrom := n.children[i+1] + stealTo := n.mutableChild(i) + stealFrom := n.mutableChild(i + 1) x := stealFrom.items.removeAt(0) y := n.items[i] stealTo.items = append(stealTo.items, y) @@ -768,22 +801,22 @@ func (n *node) stealFromRightChild(i int, fast bool) { // func (n *node) mergeWithRightChild(i int, fast bool) { // merge - y := n.items.removeAt(i) - child := n.children[i] + child := n.mutableChild(i) + mergeItem := n.items.removeAt(i) mergeChild := n.children.removeAt(i + 1) - child.items = append(child.items, y) + child.items = append(child.items, mergeItem) child.items = append(child.items, mergeChild.items...) child.children = append(child.children, mergeChild.children...) if !fast { - if y.Range().End.Compare(child.Range.End) > 0 { - child.Range.End = y.Range().End + if mergeItem.Range().End.Compare(child.Range.End) > 0 { + child.Range.End = mergeItem.Range().End } if mergeChild.Range.End.Compare(child.Range.End) > 0 { child.Range.End = mergeChild.Range.End } } - n.t.freeNode(mergeChild) + n.cow.freeNode(mergeChild) } var _ Tree = (*btree)(nil) @@ -796,11 +829,58 @@ var _ Tree = (*btree)(nil) // Write operations are not safe for concurrent mutation by multiple // goroutines, but Read operations are. type btree struct { - root *node length int - overlapper Overlapper minimumDegree int - freelist *FreeList + overlapper Overlapper + root *node + cow *copyOnWriteContext +} + +// copyOnWriteContext pointers determine node ownership... 
a tree with a write +context equivalent to a node's write context is allowed to modify that node. +// A tree whose write context does not match a node's is not allowed to modify +// it, and must create a new, writable copy (IE: it's a Clone). +// +// When doing any write operation, we maintain the invariant that the current +// node's context is equal to the context of the tree that requested the write. +// We do this by, before we descend into any node, creating a copy with the +// correct context if the contexts don't match. +// +// Since the node we're currently visiting on any write has the requesting +// tree's context, that node is modifiable in place. Children of that node may +// not share context, but before we descend into them, we'll make a mutable +// copy. +type copyOnWriteContext struct { + freelist *FreeList +} + +// cloneInternal clones the btree, lazily. Clone should not be called concurrently, +// but the original tree (t) and the new tree (t2) can be used concurrently +// once the Clone call completes. +// +// The internal tree structure of b is marked read-only and shared between t and +// t2. Writes to both t and t2 use copy-on-write logic, creating new nodes +// whenever one of b's original nodes would have been modified. Read operations +// should have no performance degradation. Write operations for both t and t2 +// will initially experience minor slow-downs caused by additional allocs and +// copies due to the aforementioned copy-on-write logic, but should converge to +// the original performance characteristics of the original tree. +func (t *btree) cloneInternal() (t2 *btree) { + // Create two entirely new copy-on-write contexts. + // This operation effectively creates three trees: + // the original, shared nodes (old b.cow) + // the new b.cow nodes + // the new out.cow nodes + cow1, cow2 := *t.cow, *t.cow + out := *t + t.cow = &cow1 + out.cow = &cow2 + return &out +} + +// Clone clones the btree, lazily. 
+func (t *btree) Clone() Tree { + return t.cloneInternal() } // adjustRange sets the Range to the maximum extent of the childrens' Range @@ -871,18 +951,20 @@ func (t *btree) minItems() int { return t.minimumDegree - 1 } -func (t *btree) newNode() (n *node) { - n = t.freelist.newNode() - n.t = t +func (c *copyOnWriteContext) newNode() (n *node) { + n = c.freelist.newNode() + n.cow = c return } -func (t *btree) freeNode(n *node) { - // clear to allow GC - n.items.truncate(0) - n.children.truncate(0) - n.t = nil // clear to allow GC - t.freelist.freeNode(n) +func (c *copyOnWriteContext) freeNode(n *node) { + if n.cow == c { + // clear to allow GC + n.items.truncate(0) + n.children.truncate(0) + n.cow = nil // clear to allow GC + c.freelist.freeNode(n) + } } func (t *btree) Insert(e Interface, fast bool) (err error) { @@ -892,7 +974,7 @@ func (t *btree) Insert(e Interface, fast bool) (err error) { } if t.root == nil { - t.root = t.newNode() + t.root = t.cow.newNode() t.root.items = append(t.root.items, e) t.length++ if !fast { @@ -900,9 +982,12 @@ func (t *btree) Insert(e Interface, fast bool) (err error) { t.root.Range.End = e.Range().End } return nil - } else if len(t.root.items) >= t.maxItems() { + } + + t.root = t.root.mutableFor(t.cow) + if len(t.root.items) >= t.maxItems() { oldroot := t.root - t.root = t.newNode() + t.root = t.cow.newNode() if !fast { t.root.Range.Start = oldroot.Range.Start t.root.Range.End = oldroot.Range.End @@ -911,7 +996,8 @@ func (t *btree) Insert(e Interface, fast bool) (err error) { t.root.items = append(t.root.items, e2) t.root.children = append(t.root.children, oldroot, second) } - out, _ := t.root.insert(e, fast) + + out, _ := t.root.insert(e, t.maxItems(), fast) if out == nil { t.length++ } @@ -931,11 +1017,12 @@ func (t *btree) Delete(e Interface, fast bool) (err error) { } func (t *btree) delete(e Interface, typ toRemove, fast bool) Interface { + t.root = t.root.mutableFor(t.cow) out, _ := t.root.remove(e, t.minItems(), typ, fast) 
if len(t.root.items) == 0 && len(t.root.children) > 0 { oldroot := t.root t.root = t.root.children[0] - t.freeNode(oldroot) + t.cow.freeNode(oldroot) } if out != nil { t.length-- diff --git a/pkg/util/interval/btree_based_interval_test.go b/pkg/util/interval/btree_based_interval_test.go index a0dc0ee7c6db..0af8a8a2c8c4 100644 --- a/pkg/util/interval/btree_based_interval_test.go +++ b/pkg/util/interval/btree_based_interval_test.go @@ -25,6 +25,7 @@ import ( "testing" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "golang.org/x/sync/errgroup" ) var btreeMinDegree = flag.Int("btree_min_degree", DefaultBTreeMinimumDegree, "B-Tree minimum degree") @@ -54,6 +55,15 @@ func rang(m, n uint32) (out items) { return } +// all extracts all items from a tree in order as a slice. +func all(t *btree) (out items) { + t.Do(func(a Interface) bool { + out = append(out, a) + return false + }) + return +} + func makeMultiByteInterval(start, end, id uint32) *Interval { return &Interval{Range{toBytes(start), toBytes(end)}, uintptr(id)} } @@ -479,7 +489,7 @@ func TestBTree(t *testing.T) { } if len := tree.Len(); len > 0 { - t.Fatalf("expected 0 item, got %d itemes", len) + t.Fatalf("expected 0 item, got %d items", len) } } } @@ -545,6 +555,7 @@ func TestSmallTree(t *testing.T) { } checkWithLen(t, tree, i+1) } + return checkTraversal(t, tree, ivs) checkIterator(t, tree, ivs) @@ -597,6 +608,89 @@ func TestLargeTree(t *testing.T) { checkFastDelete(t, tree, ivs, 10) } +const cloneTestSize = 10000 + +func cloneTest( + t *testing.T, b *btree, start int, p items, g *errgroup.Group, treeC chan *btree, +) error { + t.Logf("Starting new clone at %v", start) + treeC <- b + for i := start; i < cloneTestSize; i++ { + if err := b.Insert(p[i], false); err != nil { + return err + } + if i%(cloneTestSize/5) == 0 { + i := i + c := b.cloneInternal() + g.Go(func() error { + return cloneTest(t, c, i+1, p, g, treeC) + }) + } + } + return nil +} + +func TestCloneConcurrentOperations(t *testing.T) { + var 
trees []*btree + treeC, treeDone := make(chan *btree), make(chan struct{}) + go func() { + for b := range treeC { + trees = append(trees, b) + } + close(treeDone) + }() + + var g errgroup.Group + b := newBTree(InclusiveOverlapper) + p := perm(cloneTestSize) + g.Go(func() error { + return cloneTest(t, b, 0, p, &g, treeC) + }) + if err := g.Wait(); err != nil { + t.Fatal(err) + } + close(treeC) + <-treeDone + + want := rang(0, cloneTestSize-1) + t.Logf("Starting equality checks on %d trees", len(trees)) + for i, tree := range trees { + if !reflect.DeepEqual(want, all(tree)) { + t.Errorf("tree %v mismatch", i) + } + } + + t.Log("Removing half from first half") + toRemove := want[cloneTestSize/2:] + for i := 0; i < len(trees)/2; i++ { + tree := trees[i] + g.Go(func() error { + for _, item := range toRemove { + if err := tree.Delete(item, false); err != nil { + return err + } + } + return nil + }) + } + if err := g.Wait(); err != nil { + t.Fatal(err) + } + + t.Log("Checking all values again") + for i, tree := range trees { + var wantpart items + if i < len(trees)/2 { + wantpart = want[:cloneTestSize/2] + } else { + wantpart = want + } + if got := all(tree); !reflect.DeepEqual(wantpart, got) { + t.Errorf("tree %v mismatch, want %v got %v", i, len(want), len(got)) + } + } +} + func TestIterator(t *testing.T) { var ivs items const treeSize = 400 @@ -633,6 +727,50 @@ func BenchmarkBTreeInsert(b *testing.B) { } } +func BenchmarkBTreeDeleteInsert(b *testing.B) { + b.StopTimer() + insertP := perm(benchmarkTreeSize) + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + tr.Insert(item, false) + } + b.StartTimer() + for i := 0; i < b.N; i++ { + tr.Delete(insertP[i%benchmarkTreeSize], false) + tr.Insert(insertP[i%benchmarkTreeSize], false) + } +} + +func BenchmarkBTreeDeleteInsertCloneOnce(b *testing.B) { + b.StopTimer() + insertP := perm(benchmarkTreeSize) + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + tr.Insert(item, false) + } + tr = 
tr.cloneInternal() + b.StartTimer() + for i := 0; i < b.N; i++ { + tr.Delete(insertP[i%benchmarkTreeSize], false) + tr.Insert(insertP[i%benchmarkTreeSize], false) + } +} + +func BenchmarkBTreeDeleteInsertCloneEachTime(b *testing.B) { + b.StopTimer() + insertP := perm(benchmarkTreeSize) + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + tr.Insert(item, false) + } + b.StartTimer() + for i := 0; i < b.N; i++ { + tr = tr.cloneInternal() + tr.Delete(insertP[i%benchmarkTreeSize], false) + tr.Insert(insertP[i%benchmarkTreeSize], false) + } +} + func BenchmarkBTreeDelete(b *testing.B) { b.StopTimer() insertP := perm(benchmarkTreeSize) @@ -685,3 +823,27 @@ func BenchmarkBTreeGet(b *testing.B) { } } } + +func BenchmarkBTreeGetCloneEachTime(b *testing.B) { + b.StopTimer() + insertP := perm(benchmarkTreeSize) + removeP := perm(benchmarkTreeSize) + b.StartTimer() + i := 0 + for i < b.N { + b.StopTimer() + tr := newBTree(InclusiveOverlapper) + for _, v := range insertP { + tr.Insert(v, false) + } + b.StartTimer() + for _, item := range removeP { + tr = tr.cloneInternal() + tr.Get(item.Range()) + i++ + if i >= b.N { + return + } + } + } +} diff --git a/pkg/util/interval/interval.go b/pkg/util/interval/interval.go index ceef00be3730..67a6ca06c0c7 100644 --- a/pkg/util/interval/interval.go +++ b/pkg/util/interval/interval.go @@ -154,7 +154,7 @@ func Compare(a, b Interface) int { // former has measurably better performance than the latter. So Equal should be used when only // equality state is needed. func Equal(a, b Interface) bool { - return a.Range().Start.Equal(b.Range().Start) && a.ID() == b.ID() + return a.ID() == b.ID() && a.Range().Start.Equal(b.Range().Start) } // A Comparable is a type that describes the ends of a Range. @@ -224,6 +224,8 @@ type Tree interface { Iterator() TreeIterator // Clear this tree. Clear() + // Clone clones the tree, returning a copy. 
+ Clone() Tree } // TreeIterator iterates over all intervals stored in the interval tree, in-order. @@ -234,7 +236,7 @@ type TreeIterator interface { Next() (Interface, bool) } -var useBTreeImpl = envutil.EnvOrDefaultBool("COCKROACH_INTERVAL_BTREE", false) +var useBTreeImpl = envutil.EnvOrDefaultBool("COCKROACH_INTERVAL_BTREE", true) // NewTree creates a new interval tree with the given overlapper function. It // uses the augmented Left-Leaning Red Black tree implementation. diff --git a/pkg/util/interval/llrb_based_interval.go b/pkg/util/interval/llrb_based_interval.go index cfea42c883ba..e2a176959e70 100644 --- a/pkg/util/interval/llrb_based_interval.go +++ b/pkg/util/interval/llrb_based_interval.go @@ -676,3 +676,7 @@ func (t *llrbTree) Clear() { t.Root = nil t.Count = 0 } + +func (t *llrbTree) Clone() Tree { + panic("unimplemented") +} From fde856f3b4094d8d56cf5531a2eeb62858aa8089 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 29 Oct 2018 23:57:43 -0400 Subject: [PATCH 07/11] util/interval: Add a Clear method that clears all nodes into a freelist google/btree#21 Release note: None --- pkg/util/interval/btree_based_interval.go | 83 ++++++++++++++++++++--- 1 file changed, 73 insertions(+), 10 deletions(-) diff --git a/pkg/util/interval/btree_based_interval.go b/pkg/util/interval/btree_based_interval.go index 57d5a07babab..8b464c0c79ad 100644 --- a/pkg/util/interval/btree_based_interval.go +++ b/pkg/util/interval/btree_based_interval.go @@ -12,7 +12,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the License. // -// This code is based on: https://github.com/google/btree +// This code is based on: https://github.com/google/btree. package interval @@ -66,12 +66,16 @@ func (f *FreeList) newNode() (n *node) { return } -func (f *FreeList) freeNode(n *node) { +// freeNode adds the given node to the list, returning true if it was added +// and false if it was discarded. 
+func (f *FreeList) freeNode(n *node) (out bool) { f.mu.Lock() if len(f.freelist) < cap(f.freelist) { f.freelist = append(f.freelist, n) + out = true } f.mu.Unlock() + return } // newBTree creates a new interval tree with the given overlapper function and @@ -930,12 +934,16 @@ func (t *btree) AdjustRanges() { if t.isEmpty() { return } - t.root.adjustRanges() + t.root.adjustRanges(t.root.cow) } -func (n *node) adjustRanges() { - for _, c := range n.children { - c.adjustRanges() +func (n *node) adjustRanges(c *copyOnWriteContext) { + if n.cow != c { + // Could not have been modified. + return + } + for _, child := range n.children { + child.adjustRanges(c) } n.adjustRange() } @@ -957,14 +965,29 @@ func (c *copyOnWriteContext) newNode() (n *node) { return } -func (c *copyOnWriteContext) freeNode(n *node) { +type freeType int + +const ( + ftFreelistFull freeType = iota // node was freed (available for GC, not stored in freelist) + ftStored // node was stored in the freelist for later use + ftNotOwned // node was ignored by COW, since it's owned by another one +) + +// freeNode frees a node within a given COW context, if it's owned by that +// context. It returns what happened to the node (see freeType const +// documentation). +func (c *copyOnWriteContext) freeNode(n *node) freeType { if n.cow == c { // clear to allow GC n.items.truncate(0) n.children.truncate(0) n.cow = nil // clear to allow GC - c.freelist.freeNode(n) + if c.freelist.freeNode(n) { + return ftStored + } + return ftFreelistFull } + return ftNotOwned } func (t *btree) Insert(e Interface, fast bool) (err error) { @@ -1075,7 +1098,47 @@ func (t *btree) Iterator() TreeIterator { return &ti } +// ClearWithOpt removes all items from the btree. If addNodesToFreelist is +// true, t's nodes are added to its freelist as part of this call, until the +// freelist is full. Otherwise, the root node is simply dereferenced and the +// subtree left to Go's normal GC processes. 
+// +// This can be much faster than calling Delete on all elements, because that +// requires finding/removing each element in the tree and updating the tree +// accordingly. It also is somewhat faster than creating a new tree to replace +// the old one, because nodes from the old tree are reclaimed into the freelist +// for use by the new one, instead of being lost to the garbage collector. +// +// This call takes: +// O(1): when addNodesToFreelist is false, this is a single operation. +// O(1): when the freelist is already full, it breaks out immediately +// O(freelist size): when the freelist is empty and the nodes are all owned +// by this tree, nodes are added to the freelist until full. +// O(tree size): when all nodes are owned by another tree, all nodes are +// iterated over looking for nodes to add to the freelist, and due to +// ownership, none are. +func (t *btree) ClearWithOpt(addNodesToFreelist bool) { + if t.root != nil && addNodesToFreelist { + t.root.reset(t.cow) + } + t.root, t.length = nil, 0 +} + func (t *btree) Clear() { - t.root = nil - t.length = 0 + t.ClearWithOpt(true) +} + +// reset returns a subtree to the freelist. It breaks out immediately if the +// freelist is full, since the only benefit of iterating is to fill that +// freelist up. Returns true if parent reset call should continue. +func (n *node) reset(c *copyOnWriteContext) bool { + if n.cow != c { + return false + } + for _, child := range n.children { + if !child.reset(c) { + return false + } + } + return c.freeNode(n) != ftFreelistFull } From 2ec7b1bc900e65b44fb043a10e44660e8e7e1120 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Thu, 1 Nov 2018 15:02:12 -0400 Subject: [PATCH 08/11] storage/cmdq: fork btree, initial specialization This commit forks https://github.com/petermattis/pebble/blob/master/internal/btree/btree.go and rips out any references to pebble. There are a number of changes we'll need to make to it: 1. Make the structure an augmented interval tree 2. 
Add synchronized node and leafNode freelists 3. Introduce immutability and a copy-on-write policy NOTE: this commit is meant to be as clean of a fork as possible. The next commit will clean the code up significantly. Release note: None --- pkg/storage/cmdq/interval_btree.go | 536 ++++++++++++++++++++++++ pkg/storage/cmdq/interval_btree_test.go | 316 ++++++++++++++ 2 files changed, 852 insertions(+) create mode 100644 pkg/storage/cmdq/interval_btree.go create mode 100644 pkg/storage/cmdq/interval_btree_test.go diff --git a/pkg/storage/cmdq/interval_btree.go b/pkg/storage/cmdq/interval_btree.go new file mode 100644 index 000000000000..c7ce7e47d851 --- /dev/null +++ b/pkg/storage/cmdq/interval_btree.go @@ -0,0 +1,536 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package cmdq + +import ( + "sort" + "unsafe" + + "github.com/cockroachdb/cockroach/pkg/roachpb" +) + +// TODO(nvanbenschoten): +// 1. Make the structure an augmented interval tree +// 2. Add synchronized node and leafNode freelists +// 3. Introduce immutability and a copy-on-write policy: +// 4. Describe pedigree, changes, etc. of this implementation + +const ( + degree = 16 + maxCmds = 2*degree - 1 + minCmds = degree - 1 +) + +// TODO(nvanbenschoten): remove. +type cmd struct { + id int64 + span roachpb.Span +} + +// cmp returns a value indicating the sort order relationship between a and b. 
+// The comparison is performed lexicographically on (a.span.Key, a.id) and +// (b.span.Key, b.id) tuples where span.Key is more significant that id. +// +// Given c = cmp(a, b): +// +// c == -1 if (a.span.Key, a.id) < (b.span.Key, b.id); +// c == 0 if (a.span.Key, a.id) == (b.span.Key, b.id); and +// c == 1 if (a.span.Key, a.id) > (b.span.Key, b.id). +// +func cmp(a, b *cmd) int { + k := a.span.Key.Compare(b.span.Key) + if k != 0 { + return k + } + if a.id < b.id { + return -1 + } else if a.id > b.id { + return 1 + } else { + return 0 + } +} + +type leafNode struct { + parent *node + pos int16 + count int16 + leaf bool + cmds [maxCmds]*cmd +} + +func newLeafNode() *node { + return (*node)(unsafe.Pointer(&leafNode{leaf: true})) +} + +type node struct { + leafNode + children [maxCmds + 1]*node +} + +func (n *node) updatePos(start, end int) { + for i := start; i < end; i++ { + n.children[i].pos = int16(i) + } +} + +func (n *node) insertAt(index int, c *cmd, nd *node) { + if index < int(n.count) { + copy(n.cmds[index+1:n.count+1], n.cmds[index:n.count]) + if !n.leaf { + copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1]) + n.updatePos(index+2, int(n.count+2)) + } + } + n.cmds[index] = c + if !n.leaf { + n.children[index+1] = nd + nd.parent = n + nd.pos = int16(index + 1) + } + n.count++ +} + +func (n *node) pushBack(c *cmd, nd *node) { + n.cmds[n.count] = c + if !n.leaf { + n.children[n.count+1] = nd + nd.parent = n + nd.pos = n.count + 1 + } + n.count++ +} + +func (n *node) pushFront(c *cmd, nd *node) { + if !n.leaf { + copy(n.children[1:n.count+2], n.children[:n.count+1]) + n.updatePos(1, int(n.count+2)) + n.children[0] = nd + nd.parent = n + nd.pos = 0 + } + copy(n.cmds[1:n.count+1], n.cmds[:n.count]) + n.cmds[0] = c + n.count++ +} + +// removeAt removes a value at a given index, pulling all subsequent values +// back. 
+func (n *node) removeAt(index int) (*cmd, *node) { + var child *node + if !n.leaf { + child = n.children[index+1] + copy(n.children[index+1:n.count], n.children[index+2:n.count+1]) + n.updatePos(index+1, int(n.count)) + n.children[n.count] = nil + } + n.count-- + out := n.cmds[index] + copy(n.cmds[index:n.count], n.cmds[index+1:n.count+1]) + n.cmds[n.count] = nil + return out, child +} + +// popBack removes and returns the last element in the list. +func (n *node) popBack() (*cmd, *node) { + n.count-- + out := n.cmds[n.count] + n.cmds[n.count] = nil + if n.leaf { + return out, nil + } + child := n.children[n.count+1] + n.children[n.count+1] = nil + return out, child +} + +// popFront removes and returns the first element in the list. +func (n *node) popFront() (*cmd, *node) { + n.count-- + var child *node + if !n.leaf { + child = n.children[0] + copy(n.children[:n.count+1], n.children[1:n.count+2]) + n.updatePos(0, int(n.count+1)) + n.children[n.count+1] = nil + } + out := n.cmds[0] + copy(n.cmds[:n.count], n.cmds[1:n.count+1]) + n.cmds[n.count] = nil + return out, child +} + +// find returns the index where the given cmd should be inserted into this +// list. 'found' is true if the cmd already exists in the list at the given +// index. +func (n *node) find(c *cmd) (index int, found bool) { + i := sort.Search(int(n.count), func(i int) bool { + return cmp(c, n.cmds[i]) < 0 + }) + if i > 0 && cmp(n.cmds[i-1], c) == 0 { + return i - 1, true + } + return i, false +} + +// split splits the given node at the given index. The current node shrinks, +// and this function returns the cmd that existed at that index and a new node +// containing all cmds/children after it. 
+func (n *node) split(i int) (*cmd, *node) { + out := n.cmds[i] + var next *node + if n.leaf { + next = newLeafNode() + } else { + next = &node{} + } + next.count = n.count - int16(i+1) + copy(next.cmds[:], n.cmds[i+1:n.count]) + for j := int16(i); j < n.count; j++ { + n.cmds[j] = nil + } + if !n.leaf { + copy(next.children[:], n.children[i+1:n.count+1]) + for j := int16(i + 1); j < n.count; j++ { + n.children[j] = nil + } + for j := int16(0); j <= next.count; j++ { + next.children[j].parent = next + next.children[j].pos = j + } + } + n.count = int16(i) + return out, next +} + +// insert inserts a cmd into the subtree rooted at this node, making sure +// no nodes in the subtree exceed maxCmds cmds. Returns true if a cmd was +// inserted and false if an existing cmd was replaced. +func (n *node) insert(c *cmd) bool { + i, found := n.find(c) + if found { + n.cmds[i] = c + return false + } + if n.leaf { + n.insertAt(i, c, nil) + return true + } + if n.children[i].count >= maxCmds { + splitcmd, splitNode := n.children[i].split(maxCmds / 2) + n.insertAt(i, splitcmd, splitNode) + + switch cmp := cmp(c, n.cmds[i]); { + case cmp < 0: + // no change, we want first split node + case cmp > 0: + i++ // we want second split node + default: + n.cmds[i] = c + return false + } + } + return n.children[i].insert(c) +} + +func (n *node) removeMax() *cmd { + if n.leaf { + n.count-- + out := n.cmds[n.count] + n.cmds[n.count] = nil + return out + } + child := n.children[n.count] + if child.count <= minCmds { + n.rebalanceOrMerge(int(n.count)) + return n.removeMax() + } + return child.removeMax() +} + +// remove removes a cmd from the subtree rooted at this node. 
+func (n *node) remove(c *cmd) (*cmd, bool) { + i, found := n.find(c) + if n.leaf { + if found { + cmd, _ := n.removeAt(i) + return cmd, true + } + return nil, false + } + child := n.children[i] + if child.count <= minCmds { + n.rebalanceOrMerge(i) + return n.remove(c) + } + if found { + // Replace the cmd being removed with the max cmd in our left child. + out := n.cmds[i] + n.cmds[i] = child.removeMax() + return out, true + } + return child.remove(c) +} + +func (n *node) rebalanceOrMerge(i int) { + switch { + case i > 0 && n.children[i-1].count > minCmds: + // Rebalance from left sibling. + left := n.children[i-1] + child := n.children[i] + cmd, grandChild := left.popBack() + child.pushFront(n.cmds[i-1], grandChild) + n.cmds[i-1] = cmd + + case i < int(n.count) && n.children[i+1].count > minCmds: + // Rebalance from right sibling. + right := n.children[i+1] + child := n.children[i] + cmd, grandChild := right.popFront() + child.pushBack(n.cmds[i], grandChild) + n.cmds[i] = cmd + + default: + // Merge with either the left or right sibling. + if i >= int(n.count) { + i = int(n.count - 1) + } + child := n.children[i] + mergeCmd, mergeChild := n.removeAt(i) + child.cmds[child.count] = mergeCmd + copy(child.cmds[child.count+1:], mergeChild.cmds[:mergeChild.count]) + if !child.leaf { + copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1]) + for i := int16(0); i <= mergeChild.count; i++ { + mergeChild.children[i].parent = child + } + child.updatePos(int(child.count+1), int(child.count+mergeChild.count+2)) + } + child.count += mergeChild.count + 1 + } +} + +// btree is an implementation of a B-Tree. +// +// btree stores keys in an ordered structure, allowing easy insertion, +// removal, and iteration. +// +// Write operations are not safe for concurrent mutation by multiple +// goroutines, but Read operations are. +type btree struct { + root *node + length int +} + +// Reset removes all cmds from the btree. 
+func (t *btree) Reset() { + t.root = nil + t.length = 0 +} + +// Silent unused warning. +var _ = (*btree).Reset + +// Delete removes a cmd equal to the passed in cmd from the tree. +func (t *btree) Delete(c *cmd) { + if t.root == nil || t.root.count == 0 { + return + } + if _, found := t.root.remove(c); found { + t.length-- + } + if t.root.count == 0 && !t.root.leaf { + t.root = t.root.children[0] + t.root.parent = nil + } +} + +// Set adds the given cmd to the tree. If a cmd in the tree already equals +// the given one, it is replaced with the new cmd. +func (t *btree) Set(c *cmd) { + if t.root == nil { + t.root = newLeafNode() + } else if t.root.count >= maxCmds { + splitcmd, splitNode := t.root.split(maxCmds / 2) + newRoot := &node{} + newRoot.count = 1 + newRoot.cmds[0] = splitcmd + newRoot.children[0] = t.root + newRoot.children[1] = splitNode + t.root.parent = newRoot + t.root.pos = 0 + splitNode.parent = newRoot + splitNode.pos = 1 + t.root = newRoot + } + if t.root.insert(c) { + t.length++ + } +} + +// MakeIter returns a new iterator object. Note that it is safe for an +// iterator to be copied by value. +func (t *btree) MakeIter() iterator { + return iterator{t: t, pos: -1} +} + +// Height returns the height of the tree. +func (t *btree) Height() int { + if t.root == nil { + return 0 + } + h := 1 + n := t.root + for !n.leaf { + n = n.children[0] + h++ + } + return h +} + +// Len returns the number of cmds currently in the tree. +func (t *btree) Len() int { + return t.length +} + +// iterator ... +type iterator struct { + t *btree + n *node + pos int16 +} + +// SeekGE ... +func (i *iterator) SeekGE(c *cmd) { + i.n = i.t.root + if i.n == nil { + return + } + for { + pos, found := i.n.find(c) + i.pos = int16(pos) + if found { + return + } + if i.n.leaf { + if i.pos == i.n.count { + i.Next() + } + return + } + i.n = i.n.children[i.pos] + } +} + +// SeekLT ... 
+func (i *iterator) SeekLT(c *cmd) { + i.n = i.t.root + if i.n == nil { + return + } + for { + pos, found := i.n.find(c) + i.pos = int16(pos) + if found || i.n.leaf { + i.Prev() + return + } + i.n = i.n.children[i.pos] + } +} + +// First ... +func (i *iterator) First() { + i.n = i.t.root + if i.n == nil { + return + } + for !i.n.leaf { + i.n = i.n.children[0] + } + i.pos = 0 +} + +// Last ... +func (i *iterator) Last() { + i.n = i.t.root + if i.n == nil { + return + } + for !i.n.leaf { + i.n = i.n.children[i.n.count] + } + i.pos = i.n.count - 1 +} + +// Next ... +func (i *iterator) Next() { + if i.n == nil { + return + } + + if i.n.leaf { + i.pos++ + if i.pos < i.n.count { + return + } + for i.n.parent != nil && i.pos >= i.n.count { + i.pos = i.n.pos + i.n = i.n.parent + } + return + } + + i.n = i.n.children[i.pos+1] + for !i.n.leaf { + i.n = i.n.children[0] + } + i.pos = 0 +} + +// Prev ... +func (i *iterator) Prev() { + if i.n == nil { + return + } + + if i.n.leaf { + i.pos-- + if i.pos >= 0 { + return + } + for i.n.parent != nil && i.pos < 0 { + i.pos = i.n.pos - 1 + i.n = i.n.parent + } + return + } + + i.n = i.n.children[i.pos] + for !i.n.leaf { + i.n = i.n.children[i.n.count] + } + i.pos = i.n.count - 1 +} + +// Valid ... +func (i *iterator) Valid() bool { + return i.pos >= 0 && i.pos < i.n.count +} + +// Cmd ... +func (i *iterator) Cmd() *cmd { + return i.n.cmds[i.pos] +} diff --git a/pkg/storage/cmdq/interval_btree_test.go b/pkg/storage/cmdq/interval_btree_test.go new file mode 100644 index 000000000000..5e8920834d94 --- /dev/null +++ b/pkg/storage/cmdq/interval_btree_test.go @@ -0,0 +1,316 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package cmdq + +import ( + "bytes" + "encoding/binary" + "fmt" + "math/rand" + "testing" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" +) + +func (n *node) verify() { + if root := n.parent == nil; !root { + if n.count < minCmds || maxCmds < n.count { + panic(fmt.Sprintf("command count %d outside range [%d,%d]", n.count, minCmds, maxCmds)) + } + } + for i := int16(1); i < n.count; i++ { + if cmp(n.cmds[i-1], n.cmds[i]) >= 0 { + panic(fmt.Sprintf("cmds are not sorted @ %d: %v >= %v", + i, n.cmds[i-1], n.cmds[i])) + } + } + if !n.leaf { + for i := int16(0); i < n.count; i++ { + prev := n.children[i] + + if cmp(prev.cmds[prev.count-1], n.cmds[i]) >= 0 { + panic(fmt.Sprintf("cmds are not sorted @ %d: %v >= %v", + i, n.cmds[i], prev.cmds[prev.count-1])) + } + next := n.children[i+1] + if cmp(n.cmds[i], next.cmds[0]) >= 0 { + panic(fmt.Sprintf("cmds are not sorted @ %d: %v >= %v", + i, n.cmds[i], next.cmds[0])) + } + } + for i := int16(0); i <= n.count; i++ { + if n.children[i].pos != i { + panic(fmt.Sprintf("child has incorrect pos: %d != %d/%d", n.children[i].pos, i, n.count)) + } + if n.children[i].parent != n { + panic(fmt.Sprintf("child does not point to parent: %d/%d", i, n.count)) + } + n.children[i].verify() + } + } +} + +// Verify asserts that the tree's structural invariants all hold. 
+func (t *btree) Verify() { + if t.root == nil { + return + } + t.root.verify() +} + +func key(i int) roachpb.Key { + return []byte(fmt.Sprintf("%04d", i)) +} + +func newCmd(k roachpb.Key) *cmd { + return &cmd{span: roachpb.Span{Key: k}} +} + +func checkIter(t *testing.T, it iterator, start, end int) { + t.Helper() + i := start + for it.First(); it.Valid(); it.Next() { + cmd := it.Cmd() + expected := key(i) + if !expected.Equal(cmd.span.Key) { + t.Fatalf("expected %s, but found %s", expected, cmd.span.Key) + } + i++ + } + if i != end { + t.Fatalf("expected %d, but at %d", end, i) + } + + for it.Last(); it.Valid(); it.Prev() { + i-- + cmd := it.Cmd() + expected := key(i) + if !expected.Equal(cmd.span.Key) { + t.Fatalf("expected %s, but found %s", expected, cmd.span.Key) + } + } + if i != start { + t.Fatalf("expected %d, but at %d: %+v parent=%p", start, i, it, it.n.parent) + } +} + +func TestBTree(t *testing.T) { + var tr btree + + // With degree == 16 (max-items/node == 31) we need 513 items in order for + // there to be 3 levels in the tree. The count here is comfortably above + // that. + const count = 768 + // Add keys in sorted order. + for i := 0; i < count; i++ { + tr.Set(newCmd(key(i))) + tr.Verify() + if e := i + 1; e != tr.Len() { + t.Fatalf("expected length %d, but found %d", e, tr.Len()) + } + checkIter(t, tr.MakeIter(), 0, i+1) + } + // Delete keys in sorted order. + for i := 0; i < count; i++ { + tr.Delete(newCmd(key(i))) + tr.Verify() + if e := count - (i + 1); e != tr.Len() { + t.Fatalf("expected length %d, but found %d", e, tr.Len()) + } + checkIter(t, tr.MakeIter(), i+1, count) + } + + // Add keys in reverse sorted order. + for i := 0; i < count; i++ { + tr.Set(newCmd(key(count - i))) + tr.Verify() + if e := i + 1; e != tr.Len() { + t.Fatalf("expected length %d, but found %d", e, tr.Len()) + } + checkIter(t, tr.MakeIter(), count-i, count+1) + } + // Delete keys in reverse sorted order. 
+ for i := 0; i < count; i++ { + tr.Delete(newCmd(key(count - i))) + tr.Verify() + if e := count - (i + 1); e != tr.Len() { + t.Fatalf("expected length %d, but found %d", e, tr.Len()) + } + checkIter(t, tr.MakeIter(), 1, count-i) + } +} + +func TestBTreeSeek(t *testing.T) { + const count = 513 + + var tr btree + for i := 0; i < count; i++ { + tr.Set(newCmd(key(i * 2))) + } + + it := tr.MakeIter() + for i := 0; i < 2*count-1; i++ { + it.SeekGE(newCmd(key(i))) + if !it.Valid() { + t.Fatalf("%d: expected valid iterator", i) + } + cmd := it.Cmd() + expected := key(2 * ((i + 1) / 2)) + if !expected.Equal(cmd.span.Key) { + t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span.Key) + } + } + it.SeekGE(newCmd(key(2*count - 1))) + if it.Valid() { + t.Fatalf("expected invalid iterator") + } + + for i := 1; i < 2*count; i++ { + it.SeekLT(newCmd(key(i))) + if !it.Valid() { + t.Fatalf("%d: expected valid iterator", i) + } + cmd := it.Cmd() + expected := key(2 * ((i - 1) / 2)) + if !expected.Equal(cmd.span.Key) { + t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span.Key) + } + } + it.SeekLT(newCmd(key(0))) + if it.Valid() { + t.Fatalf("expected invalid iterator") + } +} + +func randomKey(rng *rand.Rand, b []byte) []byte { + key := rng.Uint32() + key2 := rng.Uint32() + binary.LittleEndian.PutUint32(b, key) + binary.LittleEndian.PutUint32(b[4:], key2) + return b +} + +func BenchmarkIterSeekGE(b *testing.B) { + for _, count := range []int{16, 128, 1024} { + b.Run(fmt.Sprintf("count=%d", count), func(b *testing.B) { + var keys [][]byte + var tr btree + + for i := 0; i < count; i++ { + key := []byte(fmt.Sprintf("%05d", i)) + keys = append(keys, key) + tr.Set(newCmd(key)) + } + + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + it := tr.MakeIter() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := keys[rng.Intn(len(keys))] + it.SeekGE(newCmd(k)) + if testing.Verbose() { + if !it.Valid() { + b.Fatal("expected to find key") + } + if !bytes.Equal(k, 
it.Cmd().span.Key) { + b.Fatalf("expected %s, but found %s", k, it.Cmd().span.Key) + } + } + } + }) + } +} + +func BenchmarkIterSeekLT(b *testing.B) { + for _, count := range []int{16, 128, 1024} { + b.Run(fmt.Sprintf("count=%d", count), func(b *testing.B) { + var keys [][]byte + var tr btree + + for i := 0; i < count; i++ { + key := []byte(fmt.Sprintf("%05d", i)) + keys = append(keys, key) + tr.Set(newCmd(key)) + } + + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + it := tr.MakeIter() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + j := rng.Intn(len(keys)) + k := keys[j] + it.SeekLT(newCmd(k)) + if testing.Verbose() { + if j == 0 { + if it.Valid() { + b.Fatal("unexpected key") + } + } else { + if !it.Valid() { + b.Fatal("expected to find key") + } + k := keys[j-1] + if !bytes.Equal(k, it.Cmd().span.Key) { + b.Fatalf("expected %s, but found %s", k, it.Cmd().span.Key) + } + } + } + } + }) + } +} + +func BenchmarkIterNext(b *testing.B) { + buf := make([]byte, 64<<10) + var tr btree + + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + for i := 0; i < len(buf); i += 8 { + key := randomKey(rng, buf[i:i+8]) + tr.Set(newCmd(key)) + } + + it := tr.MakeIter() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !it.Valid() { + it.First() + } + it.Next() + } +} + +func BenchmarkIterPrev(b *testing.B) { + buf := make([]byte, 64<<10) + var tr btree + + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + for i := 0; i < len(buf); i += 8 { + key := randomKey(rng, buf[i:i+8]) + tr.Set(newCmd(key)) + } + + it := tr.MakeIter() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !it.Valid() { + it.Last() + } + it.Prev() + } +} From babda45cea190f2f3e0f419ed1c58882133265a8 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 5 Nov 2018 13:57:15 -0500 Subject: [PATCH 09/11] storage/cmdq: make btree an augmented interval tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit modifies the 
btree type added in the previous commit and turns it into an augmented interval tree. The tree represents intervals and permits an interval search operation following the approach laid out in CLRS, Chapter 14. The B-Tree stores cmds in order based on their start key and each B-Tree node maintains the upper-bound end key of all cmds in its subtree. This is close to what `util/interval.btree` does, although the new version doesn't maintain the lower-bound start key of all cmds in each node. The new interval btree is significantly faster than both the old interval btree and the old interval llrb tree because it minimizes key comparisons while scanning for overlaps. This includes avoiding all key comparisons for cmds with start keys that are greater than the search range's start key. See the comment on `overlapScan` for an explanation of how this is possible. The new interval btree is also faster because it has been specialized for the `storage/cmdq` package. This allows it to avoid interfaces and dynamic dispatch throughout its operations, which showed up prominently on profiles of the other two implementations. A third benefit of the rewrite is that it inherits the optimizations made in pebble's btree. This includes inlining the btree items and child pointers in nodes instead of using slices. 
_### Benchmarks: ``` Insert/count=16-4 76.1ns ± 4% Insert/count=128-4 156ns ± 4% Insert/count=1024-4 259ns ± 8% Insert/count=8192-4 386ns ± 1% Insert/count=65536-4 735ns ± 5% Delete/count=16-4 129ns ±16% Delete/count=128-4 189ns ±12% Delete/count=1024-4 338ns ± 7% Delete/count=8192-4 547ns ± 4% Delete/count=65536-4 1.22µs ±12% DeleteInsert/count=16-4 168ns ± 2% DeleteInsert/count=128-4 375ns ± 8% DeleteInsert/count=1024-4 562ns ± 1% DeleteInsert/count=8192-4 786ns ± 3% DeleteInsert/count=65536-4 2.31µs ±26% IterSeekGE/count=16-4 87.2ns ± 3% IterSeekGE/count=128-4 141ns ± 3% IterSeekGE/count=1024-4 227ns ± 4% IterSeekGE/count=8192-4 379ns ± 2% IterSeekGE/count=65536-4 882ns ± 1% IterSeekLT/count=16-4 89.5ns ± 3% IterSeekLT/count=128-4 145ns ± 1% IterSeekLT/count=1024-4 226ns ± 6% IterSeekLT/count=8192-4 379ns ± 1% IterSeekLT/count=65536-4 891ns ± 1% IterFirstOverlap/count=16-4 184ns ± 1% IterFirstOverlap/count=128-4 260ns ± 3% IterFirstOverlap/count=1024-4 685ns ± 7% IterFirstOverlap/count=8192-4 1.23µs ± 2% IterFirstOverlap/count=65536-4 2.14µs ± 1% IterNext-4 3.82ns ± 2% IterPrev-4 14.8ns ± 2% IterNextOverlap-4 8.57ns ± 2% IterOverlapScan-4 25.8µs ± 3% ``` Compared to old llrb interval tree (currently in use): ``` Insert/count=16-4 323ns ± 7% 76ns ± 4% -76.43% (p=0.008 n=5+5) Insert/count=128-4 539ns ± 2% 156ns ± 4% -71.05% (p=0.008 n=5+5) Insert/count=1024-4 797ns ± 1% 259ns ± 8% -67.52% (p=0.008 n=5+5) Insert/count=8192-4 1.30µs ± 5% 0.39µs ± 1% -70.38% (p=0.008 n=5+5) Insert/count=65536-4 2.69µs ±11% 0.74µs ± 5% -72.65% (p=0.008 n=5+5) Delete/count=16-4 438ns ± 7% 129ns ±16% -70.44% (p=0.008 n=5+5) Delete/count=128-4 785ns ± 6% 189ns ±12% -75.89% (p=0.008 n=5+5) Delete/count=1024-4 1.38µs ± 2% 0.34µs ± 7% -75.44% (p=0.008 n=5+5) Delete/count=8192-4 2.36µs ± 2% 0.55µs ± 4% -76.82% (p=0.008 n=5+5) Delete/count=65536-4 4.73µs ±13% 1.22µs ±12% -74.19% (p=0.008 n=5+5) DeleteInsert/count=16-4 920ns ± 2% 168ns ± 2% -81.76% (p=0.008 n=5+5) DeleteInsert/count=128-4 
1.73µs ± 4% 0.37µs ± 8% -78.35% (p=0.008 n=5+5) DeleteInsert/count=1024-4 2.69µs ± 3% 0.56µs ± 1% -79.15% (p=0.016 n=5+4) DeleteInsert/count=8192-4 4.55µs ±25% 0.79µs ± 3% -82.70% (p=0.008 n=5+5) DeleteInsert/count=65536-4 7.53µs ± 6% 2.31µs ±26% -69.32% (p=0.008 n=5+5) IterOverlapScan-4 285µs ± 7% 26µs ± 3% -90.96% (p=0.008 n=5+5) ``` Compared to old btree interval tree (added in a61191e, never enabled): ``` Insert/count=16-4 231ns ± 1% 76ns ± 4% -66.99% (p=0.008 n=5+5) Insert/count=128-4 351ns ± 2% 156ns ± 4% -55.53% (p=0.008 n=5+5) Insert/count=1024-4 515ns ± 5% 259ns ± 8% -49.73% (p=0.008 n=5+5) Insert/count=8192-4 786ns ± 3% 386ns ± 1% -50.85% (p=0.008 n=5+5) Insert/count=65536-4 1.50µs ± 3% 0.74µs ± 5% -50.97% (p=0.008 n=5+5) Delete/count=16-4 363ns ±11% 129ns ±16% -64.33% (p=0.008 n=5+5) Delete/count=128-4 466ns ± 9% 189ns ±12% -59.42% (p=0.008 n=5+5) Delete/count=1024-4 806ns ± 6% 338ns ± 7% -58.01% (p=0.008 n=5+5) Delete/count=8192-4 1.43µs ±13% 0.55µs ± 4% -61.71% (p=0.008 n=5+5) Delete/count=65536-4 2.75µs ± 1% 1.22µs ±12% -55.57% (p=0.008 n=5+5) DeleteInsert/count=16-4 557ns ± 1% 168ns ± 2% -69.87% (p=0.008 n=5+5) DeleteInsert/count=128-4 953ns ± 8% 375ns ± 8% -60.71% (p=0.008 n=5+5) DeleteInsert/count=1024-4 1.19µs ± 4% 0.56µs ± 1% -52.72% (p=0.016 n=5+4) DeleteInsert/count=8192-4 1.84µs ±17% 0.79µs ± 3% -57.22% (p=0.008 n=5+5) DeleteInsert/count=65536-4 3.20µs ± 3% 2.31µs ±26% -27.86% (p=0.008 n=5+5) IterOverlapScan-4 70.1µs ± 2% 25.8µs ± 3% -63.23% (p=0.008 n=5+5) ``` Release note: None --- pkg/storage/cmdq/interval_btree.go | 540 +++++++++++-- pkg/storage/cmdq/interval_btree_test.go | 738 +++++++++++++++--- .../interval/btree_based_interval_test.go | 284 ++++--- 3 files changed, 1276 insertions(+), 286 deletions(-) diff --git a/pkg/storage/cmdq/interval_btree.go b/pkg/storage/cmdq/interval_btree.go index c7ce7e47d851..f1b6cc55d4d2 100644 --- a/pkg/storage/cmdq/interval_btree.go +++ b/pkg/storage/cmdq/interval_btree.go @@ -15,14 +15,15 @@ package 
cmdq import ( + "bytes" "sort" + "strings" "unsafe" "github.com/cockroachdb/cockroach/pkg/roachpb" ) // TODO(nvanbenschoten): -// 1. Make the structure an augmented interval tree // 2. Add synchronized node and leafNode freelists // 3. Introduce immutability and a copy-on-write policy: // 4. Describe pedigree, changes, etc. of this implementation @@ -39,20 +40,27 @@ type cmd struct { span roachpb.Span } -// cmp returns a value indicating the sort order relationship between a and b. -// The comparison is performed lexicographically on (a.span.Key, a.id) and -// (b.span.Key, b.id) tuples where span.Key is more significant that id. +// cmp returns a value indicating the sort order relationship between +// a and b. The comparison is performed lexicographically on +// (a.span.Key, a.span.EndKey, a.id) +// and +// (b.span.Key, b.span.EndKey, b.id) +// tuples. // // Given c = cmp(a, b): // -// c == -1 if (a.span.Key, a.id) < (b.span.Key, b.id); -// c == 0 if (a.span.Key, a.id) == (b.span.Key, b.id); and -// c == 1 if (a.span.Key, a.id) > (b.span.Key, b.id). +// c == -1 if (a.span.Key, a.span.EndKey, a.id) < (b.span.Key, b.span.EndKey, b.id) +// c == 0 if (a.span.Key, a.span.EndKey, a.id) == (b.span.Key, b.span.EndKey, b.id) +// c == 1 if (a.span.Key, a.span.EndKey, a.id) > (b.span.Key, b.span.EndKey, b.id) // func cmp(a, b *cmd) int { - k := a.span.Key.Compare(b.span.Key) - if k != 0 { - return k + c := bytes.Compare(a.span.Key, b.span.Key) + if c != 0 { + return c + } + c = bytes.Compare(a.span.EndKey, b.span.EndKey) + if c != 0 { + return c } if a.id < b.id { return -1 @@ -63,8 +71,44 @@ func cmp(a, b *cmd) int { } } +// keyBound represents the upper-bound of a key range. 
+type keyBound struct { + key roachpb.Key + inc bool +} + +func (b keyBound) compare(o keyBound) int { + c := bytes.Compare(b.key, o.key) + if c != 0 { + return c + } + if b.inc == o.inc { + return 0 + } + if b.inc { + return 1 + } + return -1 +} + +func (b keyBound) contains(a *cmd) bool { + c := bytes.Compare(a.span.Key, b.key) + if c == 0 { + return b.inc + } + return c < 0 +} + +func upperBound(c *cmd) keyBound { + if len(c.span.EndKey) != 0 { + return keyBound{key: c.span.EndKey} + } + return keyBound{key: c.span.Key, inc: true} +} + type leafNode struct { parent *node + max keyBound pos int16 count int16 leaf bool @@ -176,11 +220,20 @@ func (n *node) popFront() (*cmd, *node) { // list. 'found' is true if the cmd already exists in the list at the given // index. func (n *node) find(c *cmd) (index int, found bool) { - i := sort.Search(int(n.count), func(i int) bool { - return cmp(c, n.cmds[i]) < 0 - }) - if i > 0 && cmp(n.cmds[i-1], c) == 0 { - return i - 1, true + // Logic copied from sort.Search. Inlining this gave + // an 11% speedup on BenchmarkBTreeDeleteInsert. + i, j := 0, int(n.count) + for i < j { + h := int(uint(i+j) >> 1) // avoid overflow when computing h + // i ≤ h < j + v := cmp(c, n.cmds[h]) + if v == 0 { + return h, true + } else if v > 0 { + i = h + 1 + } else { + j = h + } } return i, false } @@ -188,6 +241,24 @@ func (n *node) find(c *cmd) (index int, found bool) { // split splits the given node at the given index. The current node shrinks, // and this function returns the cmd that existed at that index and a new node // containing all cmds/children after it. 
+// +// Before: +// +// +-----------+ +// | x y z | +// +--/-/-\-\--+ +// +// After: +// +// +-----------+ +// | y | +// +----/-\----+ +// / \ +// v v +// +-----------+ +-----------+ +// | x | | z | +// +-----------+ +-----------+ +// func (n *node) split(i int) (*cmd, *node) { out := n.cmds[i] var next *node @@ -203,7 +274,7 @@ func (n *node) split(i int) (*cmd, *node) { } if !n.leaf { copy(next.children[:], n.children[i+1:n.count+1]) - for j := int16(i + 1); j < n.count; j++ { + for j := int16(i + 1); j <= n.count; j++ { n.children[j] = nil } for j := int16(0); j <= next.count; j++ { @@ -212,21 +283,30 @@ func (n *node) split(i int) (*cmd, *node) { } } n.count = int16(i) + + next.max = next.findUpperBound() + if n.max.compare(next.max) != 0 && n.max.compare(upperBound(out)) != 0 { + // If upper bound wasn't from new node or cmd + // at index i, it must still be from old node. + } else { + n.max = n.findUpperBound() + } return out, next } -// insert inserts a cmd into the subtree rooted at this node, making sure -// no nodes in the subtree exceed maxCmds cmds. Returns true if a cmd was -// inserted and false if an existing cmd was replaced. -func (n *node) insert(c *cmd) bool { +// insert inserts a cmd into the subtree rooted at this node, making sure no +// nodes in the subtree exceed maxCmds cmds. Returns true if an existing cmd was +// replaced and false if a command was inserted. Also returns whether the node's +// upper bound changes. 
+func (n *node) insert(c *cmd) (replaced, newBound bool) { i, found := n.find(c) if found { n.cmds[i] = c - return false + return true, false } if n.leaf { n.insertAt(i, c, nil) - return true + return false, n.adjustUpperBoundOnInsertion(c, nil) } if n.children[i].count >= maxCmds { splitcmd, splitNode := n.children[i].split(maxCmds / 2) @@ -239,17 +319,24 @@ func (n *node) insert(c *cmd) bool { i++ // we want second split node default: n.cmds[i] = c - return false + return true, false } } - return n.children[i].insert(c) + replaced, newBound = n.children[i].insert(c) + if newBound { + newBound = n.adjustUpperBoundOnInsertion(c, nil) + } + return replaced, newBound } +// removeMax removes and returns the maximum cmd from the subtree rooted at +// this node. func (n *node) removeMax() *cmd { if n.leaf { n.count-- out := n.cmds[n.count] n.cmds[n.count] = nil + n.adjustUpperBoundOnRemoval(out, nil) return out } child := n.children[n.count] @@ -260,50 +347,143 @@ func (n *node) removeMax() *cmd { return child.removeMax() } -// remove removes a cmd from the subtree rooted at this node. -func (n *node) remove(c *cmd) (*cmd, bool) { +// remove removes a cmd from the subtree rooted at this node. Returns +// the cmd that was removed or nil if no matching command was found. +// Also returns whether the node's upper bound changes. +func (n *node) remove(c *cmd) (out *cmd, newBound bool) { i, found := n.find(c) if n.leaf { if found { - cmd, _ := n.removeAt(i) - return cmd, true + out, _ = n.removeAt(i) + return out, n.adjustUpperBoundOnRemoval(out, nil) } return nil, false } child := n.children[i] if child.count <= minCmds { + // Child not large enough to remove from. n.rebalanceOrMerge(i) return n.remove(c) } if found { // Replace the cmd being removed with the max cmd in our left child. 
- out := n.cmds[i] + out = n.cmds[i] n.cmds[i] = child.removeMax() - return out, true + return out, n.adjustUpperBoundOnRemoval(out, nil) + } + // Cmd is not in this node and child is large enough to remove from. + out, newBound = child.remove(c) + if newBound { + newBound = n.adjustUpperBoundOnRemoval(out, nil) } - return child.remove(c) + return out, newBound } +// rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove +// a cmd from it while keeping it at or above minCmds. func (n *node) rebalanceOrMerge(i int) { switch { case i > 0 && n.children[i-1].count > minCmds: // Rebalance from left sibling. + // + // +-----------+ + // | y | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | x | | | + // +----------\+ +-----------+ + // \ + // v + // a + // + // After: + // + // +-----------+ + // | x | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | | | y | + // +-----------+ +/----------+ + // / + // v + // a + // left := n.children[i-1] child := n.children[i] - cmd, grandChild := left.popBack() - child.pushFront(n.cmds[i-1], grandChild) - n.cmds[i-1] = cmd + xCmd, grandChild := left.popBack() + yCmd := n.cmds[i-1] + child.pushFront(yCmd, grandChild) + n.cmds[i-1] = xCmd + + left.adjustUpperBoundOnRemoval(xCmd, grandChild) + child.adjustUpperBoundOnInsertion(yCmd, grandChild) case i < int(n.count) && n.children[i+1].count > minCmds: // Rebalance from right sibling. 
+ // + // +-----------+ + // | y | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | | | x | + // +-----------+ +/----------+ + // / + // v + // a + // + // After: + // + // +-----------+ + // | x | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | y | | | + // +----------\+ +-----------+ + // \ + // v + // a + // right := n.children[i+1] child := n.children[i] - cmd, grandChild := right.popFront() - child.pushBack(n.cmds[i], grandChild) - n.cmds[i] = cmd + xCmd, grandChild := right.popFront() + yCmd := n.cmds[i] + child.pushBack(yCmd, grandChild) + n.cmds[i] = xCmd + + right.adjustUpperBoundOnRemoval(xCmd, grandChild) + child.adjustUpperBoundOnInsertion(yCmd, grandChild) default: // Merge with either the left or right sibling. + // + // +-----------+ + // | u y v | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | x | | z | + // +-----------+ +-----------+ + // + // After: + // + // +-----------+ + // | u v | + // +-----|-----+ + // | + // v + // +-----------+ + // | x y z | + // +-----------+ + // if i >= int(n.count) { i = int(n.count - 1) } @@ -319,13 +499,73 @@ func (n *node) rebalanceOrMerge(i int) { child.updatePos(int(child.count+1), int(child.count+mergeChild.count+2)) } child.count += mergeChild.count + 1 + + child.adjustUpperBoundOnInsertion(mergeCmd, mergeChild) + } +} + +// findUpperBound returns the largest end key node range, assuming that its +// children have correct upper bounds already set. +func (n *node) findUpperBound() keyBound { + var max keyBound + for i := int16(0); i < n.count; i++ { + up := upperBound(n.cmds[i]) + if max.compare(up) < 0 { + max = up + } + } + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + up := n.children[i].max + if max.compare(up) < 0 { + max = up + } + } } + return max } -// btree is an implementation of a B-Tree. 
+// adjustUpperBoundOnInsertion adjusts the upper key bound for this node
+// given a cmd and an optional child node that was inserted. Returns true
+// if the upper bound was changed and false if not.
+func (n *node) adjustUpperBoundOnInsertion(c *cmd, child *node) bool {
+	up := upperBound(c)
+	if child != nil {
+		if up.compare(child.max) < 0 {
+			up = child.max
+		}
+	}
+	if n.max.compare(up) < 0 {
+		n.max = up
+		return true
+	}
+	return false
+}
+
+// adjustUpperBoundOnRemoval adjusts the upper key bound for this node
+// given a cmd and an optional child node that were removed. Returns true
+// if the upper bound was changed and false if not.
+func (n *node) adjustUpperBoundOnRemoval(c *cmd, child *node) bool {
+	up := upperBound(c)
+	if child != nil {
+		if up.compare(child.max) < 0 {
+			up = child.max
+		}
+	}
+	if n.max.compare(up) == 0 {
+		n.max = n.findUpperBound()
+		return true
+	}
+	return false
+}
+
+// btree is an implementation of an augmented interval B-Tree.
 //
-// btree stores keys in an ordered structure, allowing easy insertion,
-// removal, and iteration.
+// btree stores cmds in an ordered structure, allowing easy insertion,
+// removal, and iteration. It represents intervals and permits an interval
+// search operation following the approach laid out in CLRS, Chapter 14.
+// The B-Tree stores cmds in order based on their start key and each B-Tree
+// node maintains the upper-bound end key of all cmds in its subtree.
 //
 // Write operations are not safe for concurrent mutation by multiple
 // goroutines, but Read operations are.
@@ -348,7 +588,7 @@ func (t *btree) Delete(c *cmd) { if t.root == nil || t.root.count == 0 { return } - if _, found := t.root.remove(c); found { + if out, _ := t.root.remove(c); out != nil { t.length-- } if t.root.count == 0 && !t.root.leaf { @@ -369,13 +609,14 @@ func (t *btree) Set(c *cmd) { newRoot.cmds[0] = splitcmd newRoot.children[0] = t.root newRoot.children[1] = splitNode + newRoot.max = newRoot.findUpperBound() t.root.parent = newRoot t.root.pos = 0 splitNode.parent = newRoot splitNode.pos = 1 t.root = newRoot } - if t.root.insert(c) { + if replaced, _ := t.root.insert(c); !replaced { t.length++ } } @@ -405,19 +646,52 @@ func (t *btree) Len() int { return t.length } -// iterator ... +// String returns a string description of the tree. The format is +// similar to the https://en.wikipedia.org/wiki/Newick_format. +func (t *btree) String() string { + if t.length == 0 { + return ";" + } + var b strings.Builder + t.root.writeString(&b) + return b.String() +} + +func (n *node) writeString(b *strings.Builder) { + if n.leaf { + for i := int16(0); i < n.count; i++ { + if i != 0 { + b.WriteString(",") + } + b.WriteString(n.cmds[i].span.String()) + } + return + } + for i := int16(0); i <= n.count; i++ { + b.WriteString("(") + n.children[i].writeString(b) + b.WriteString(")") + if i < n.count { + b.WriteString(n.cmds[i].span.String()) + } + } +} + +// iterator is responsible for search and traversal within a btree. type iterator struct { t *btree n *node pos int16 + o overlapScan } -// SeekGE ... +// SeekGE seeks to the first cmd greater-than or equal to the provided cmd. func (i *iterator) SeekGE(c *cmd) { i.n = i.t.root if i.n == nil { return } + i.o = overlapScan{} for { pos, found := i.n.find(c) i.pos = int16(pos) @@ -434,12 +708,13 @@ func (i *iterator) SeekGE(c *cmd) { } } -// SeekLT ... +// SeekLT seeks to the first cmd less-than the provided cmd. 
func (i *iterator) SeekLT(c *cmd) { i.n = i.t.root if i.n == nil { return } + i.o = overlapScan{} for { pos, found := i.n.find(c) i.pos = int16(pos) @@ -451,7 +726,7 @@ func (i *iterator) SeekLT(c *cmd) { } } -// First ... +// First seeks to the first cmd in the btree. func (i *iterator) First() { i.n = i.t.root if i.n == nil { @@ -460,10 +735,11 @@ func (i *iterator) First() { for !i.n.leaf { i.n = i.n.children[0] } + i.o = overlapScan{} i.pos = 0 } -// Last ... +// Last seeks to the last cmd in the btree. func (i *iterator) Last() { i.n = i.t.root if i.n == nil { @@ -472,10 +748,12 @@ func (i *iterator) Last() { for !i.n.leaf { i.n = i.n.children[i.n.count] } + i.o = overlapScan{} i.pos = i.n.count - 1 } -// Next ... +// Next positions the iterator to the cmd immediately following +// its current position. func (i *iterator) Next() { if i.n == nil { return @@ -500,7 +778,8 @@ func (i *iterator) Next() { i.pos = 0 } -// Prev ... +// Prev positions the iterator to the cmd immediately preceding +// its current position. func (i *iterator) Prev() { if i.n == nil { return @@ -525,12 +804,171 @@ func (i *iterator) Prev() { i.pos = i.n.count - 1 } -// Valid ... +// Valid returns whether the iterator is positioned at a valid position. func (i *iterator) Valid() bool { return i.pos >= 0 && i.pos < i.n.count } -// Cmd ... +// Cmd returns the cmd at the iterator's current position. It is illegal +// to call Cmd if the iterator is not valid. func (i *iterator) Cmd() *cmd { return i.n.cmds[i.pos] } + +// An overlap scan is a scan over all cmds that overlap with the provided cmd +// in order of the overlapping cmds' start keys. The goal of the scan is to +// minimize the number of key comparisons performed in total. The algorithm +// operates based on the following two invariants maintained by augmented +// interval btree: +// 1. all cmds are sorted in the btree based on their start key. +// 2. all btree nodes maintain the upper bound end key of all cmds +// in their subtree. 
+//
+// The scan algorithm starts in "unconstrained minimum" and "unconstrained
+// maximum" states. To enter a "constrained minimum" state, the scan must reach
+// cmds in the tree with start keys above the search range's start key. Because
+// cmds in the tree are sorted by start key, once the scan enters the
+// "constrained minimum" state it will remain there. To enter a "constrained
+// maximum" state, the scan must determine the first child btree node in a given
+// subtree that can have cmds with start keys above the search range's end key.
+// The scan then remains in the "constrained maximum" state until it traverses
+// into this child node, at which point it moves to the "unconstrained maximum"
+// state again.
+//
+// The scan algorithm works like a standard btree forward scan with the
+// following augmentations:
+// 1. before traversing the tree, the scan performs a binary search on the
+//    root node's items to determine a "soft" lower-bound constraint position
+//    and a "hard" upper-bound constraint position in the root's children.
+// 2. when traversing into a child node in the lower or upper bound constraint
+//    position, the constraint is refined by searching the child's items.
+// 3. the initial traversal down the tree follows the left-most children
+//    whose upper bound end keys are equal to or greater than the start key
+//    of the search range. The children followed will be equal to or less
+//    than the soft lower bound constraint.
+// 4. once the initial traversal completes and the scan is in the left-most
+//    btree node whose upper bound overlaps the search range, key comparisons
+//    must be performed with each cmd in the tree. This is necessary because
+//    any of these cmds may have end keys that cause them to overlap with the
+//    search range.
+// 5. 
once the scan reaches the lower bound constraint position (the first cmd
+//    with a start key equal to or greater than the search range's start key),
+//    it can begin scanning without performing key comparisons. This is allowed
+//    because all commands from this point forward will have end keys that are
+//    greater than the search range's start key.
+// 6. once the scan reaches the upper bound constraint position, it terminates.
+//    It does so because the cmd at this position is the first cmd with a start
+//    key larger than the search range's end key.
+type overlapScan struct {
+	c *cmd // search cmd
+
+	// The "soft" lower-bound constraint.
+	constrMinN       *node
+	constrMinPos     int16
+	constrMinReached bool
+
+	// The "hard" upper-bound constraint.
+	constrMaxN   *node
+	constrMaxPos int16
+}
+
+// FirstOverlap seeks to the first cmd in the btree that overlaps with the
+// provided search cmd.
+func (i *iterator) FirstOverlap(c *cmd) {
+	i.n = i.t.root
+	if i.n == nil {
+		return
+	}
+	i.pos = 0
+	i.o = overlapScan{c: c}
+	i.constrainMinSearchBounds()
+	i.constrainMaxSearchBounds()
+	i.findNextOverlap()
+}
+
+// NextOverlap positions the iterator to the cmd immediately following
+// its current position that overlaps with the search cmd.
+func (i *iterator) NextOverlap() {
+	if i.n == nil {
+		return
+	}
+	if i.o.c == nil {
+		// Invalid. Mixed overlap scan with non-overlap scan.
+ i.pos = i.n.count + return + } + i.pos++ + i.findNextOverlap() +} + +func (i *iterator) constrainMinSearchBounds() { + j := sort.Search(int(i.n.count), func(j int) bool { + return bytes.Compare(i.o.c.span.Key, i.n.cmds[j].span.Key) <= 0 + }) + i.o.constrMinN = i.n + i.o.constrMinPos = int16(j) +} + +func (i *iterator) constrainMaxSearchBounds() { + up := upperBound(i.o.c) + j := sort.Search(int(i.n.count), func(j int) bool { + return !up.contains(i.n.cmds[j]) + }) + i.o.constrMaxN = i.n + i.o.constrMaxPos = int16(j) +} + +func (i *iterator) findNextOverlap() { + for { + if i.pos > i.n.count { + // Iterate up tree. + if i.n.parent == nil { + // Should have already hit upper-bound constraint. + panic("unreachable") + } + i.pos = i.n.pos + i.n = i.n.parent + } else if !i.n.leaf { + // Iterate down tree. + if i.o.constrMinReached || i.n.children[i.pos].max.contains(i.o.c) { + i.n = i.n.children[i.pos] + i.pos = 0 + + // Refine the constraint bounds, if necessary. + if i.n.parent == i.o.constrMinN && i.n.pos == i.o.constrMinPos { + i.constrainMinSearchBounds() + } + if i.n.parent == i.o.constrMaxN && i.n.pos == i.o.constrMaxPos { + i.constrainMaxSearchBounds() + } + continue + } + } + + // Check search bounds. + if i.n == i.o.constrMaxN && i.pos == i.o.constrMaxPos { + // Invalid. Past possible overlaps. + i.pos = i.n.count + return + } + if i.n == i.o.constrMinN && i.pos == i.o.constrMinPos { + // The scan reached the soft lower-bound constraint. + i.o.constrMinReached = true + } + + // Iterate across node. + if i.pos < i.n.count { + // Check for overlapping cmd. + if i.o.constrMinReached { + // Fast-path to avoid span comparison. i.o.constrMinReached + // tells us that all cmds have end keys above our search + // span's start key. 
+ return + } + if upperBound(i.n.cmds[i.pos]).contains(i.o.c) { + return + } + } + i.pos++ + } +} diff --git a/pkg/storage/cmdq/interval_btree_test.go b/pkg/storage/cmdq/interval_btree_test.go index 5e8920834d94..8f13543c29d0 100644 --- a/pkg/storage/cmdq/interval_btree_test.go +++ b/pkg/storage/cmdq/interval_btree_test.go @@ -15,78 +15,192 @@ package cmdq import ( - "bytes" - "encoding/binary" "fmt" "math/rand" "testing" + "github.com/stretchr/testify/require" + "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/timeutil" ) -func (n *node) verify() { - if root := n.parent == nil; !root { - if n.count < minCmds || maxCmds < n.count { - panic(fmt.Sprintf("command count %d outside range [%d,%d]", n.count, minCmds, maxCmds)) +////////////////////////////////////////// +// Invariant verification // +////////////////////////////////////////// + +// Verify asserts that the tree's structural invariants all hold. +func (t *btree) Verify(tt *testing.T) { + if t.root == nil { + return + } + t.verifyLeafSameDepth(tt) + t.verifyParentAndPosSet(tt) + t.verifyCountAllowed(tt) + t.isSorted(tt) + t.isUpperBoundCorrect(tt) +} + +func (t *btree) verifyLeafSameDepth(tt *testing.T) { + h := t.Height() + t.root.verifyDepthEqualToHeight(tt, 1, h) +} + +func (n *node) verifyDepthEqualToHeight(t *testing.T, depth, height int) { + if n.leaf { + require.Equal(t, height, depth, "all leaves should have the same depth as the tree height") + } + n.recurse(func(child *node, _ int16) { + child.verifyDepthEqualToHeight(t, depth+1, height) + }) +} + +func (t *btree) verifyParentAndPosSet(tt *testing.T) { + t.root.verifyParentAndPosSet(tt, nil, 0) +} + +func (n *node) verifyParentAndPosSet(t *testing.T, par *node, pos int16) { + require.Equal(t, par, n.parent) + require.Equal(t, pos, n.pos) + n.recurse(func(child *node, pos int16) { + child.verifyParentAndPosSet(t, n, pos) + }) +} + +func (t *btree) verifyCountAllowed(tt *testing.T) { + 
t.root.verifyCountAllowed(tt, true) +} + +func (n *node) verifyCountAllowed(t *testing.T, root bool) { + if !root { + require.True(t, n.count >= minCmds, "cmd count %d must be in range [%d,%d]", n.count, minCmds, maxCmds) + require.True(t, n.count <= maxCmds, "cmd count %d must be in range [%d,%d]", n.count, minCmds, maxCmds) + } + for i, cmd := range n.cmds { + if i < int(n.count) { + require.NotNil(t, cmd, "cmd below count") + } else { + require.Nil(t, cmd, "cmd above count") } } - for i := int16(1); i < n.count; i++ { - if cmp(n.cmds[i-1], n.cmds[i]) >= 0 { - panic(fmt.Sprintf("cmds are not sorted @ %d: %v >= %v", - i, n.cmds[i-1], n.cmds[i])) + if !n.leaf { + for i, child := range n.children { + if i <= int(n.count) { + require.NotNil(t, child, "node below count") + } else { + require.Nil(t, child, "node above count") + } } } + n.recurse(func(child *node, _ int16) { + child.verifyCountAllowed(t, false) + }) +} + +func (t *btree) isSorted(tt *testing.T) { + t.root.isSorted(tt) +} + +func (n *node) isSorted(t *testing.T) { + for i := int16(1); i < n.count; i++ { + require.True(t, cmp(n.cmds[i-1], n.cmds[i]) <= 0) + } if !n.leaf { for i := int16(0); i < n.count; i++ { prev := n.children[i] - - if cmp(prev.cmds[prev.count-1], n.cmds[i]) >= 0 { - panic(fmt.Sprintf("cmds are not sorted @ %d: %v >= %v", - i, n.cmds[i], prev.cmds[prev.count-1])) - } next := n.children[i+1] - if cmp(n.cmds[i], next.cmds[0]) >= 0 { - panic(fmt.Sprintf("cmds are not sorted @ %d: %v >= %v", - i, n.cmds[i], next.cmds[0])) - } + + require.True(t, cmp(prev.cmds[prev.count-1], n.cmds[i]) <= 0) + require.True(t, cmp(n.cmds[i], next.cmds[0]) <= 0) } + } + n.recurse(func(child *node, _ int16) { + child.isSorted(t) + }) +} + +func (t *btree) isUpperBoundCorrect(tt *testing.T) { + t.root.isUpperBoundCorrect(tt) +} + +func (n *node) isUpperBoundCorrect(t *testing.T) { + require.Equal(t, 0, n.findUpperBound().compare(n.max)) + for i := int16(1); i < n.count; i++ { + require.True(t, 
upperBound(n.cmds[i]).compare(n.max) <= 0) + } + if !n.leaf { for i := int16(0); i <= n.count; i++ { - if n.children[i].pos != i { - panic(fmt.Sprintf("child has incorrect pos: %d != %d/%d", n.children[i].pos, i, n.count)) - } - if n.children[i].parent != n { - panic(fmt.Sprintf("child does not point to parent: %d/%d", i, n.count)) - } - n.children[i].verify() + child := n.children[i] + require.True(t, child.max.compare(n.max) <= 0) } } + n.recurse(func(child *node, _ int16) { + child.isUpperBoundCorrect(t) + }) } -// Verify asserts that the tree's structural invariants all hold. -func (t *btree) Verify() { - if t.root == nil { - return +func (n *node) recurse(f func(child *node, pos int16)) { + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + f(n.children[i], i) + } } - t.root.verify() } +////////////////////////////////////////// +// Unit Tests // +////////////////////////////////////////// + func key(i int) roachpb.Key { - return []byte(fmt.Sprintf("%04d", i)) + if i < 0 || i > 99999 { + panic("key out of bounds") + } + return []byte(fmt.Sprintf("%05d", i)) } -func newCmd(k roachpb.Key) *cmd { - return &cmd{span: roachpb.Span{Key: k}} +func span(i int) roachpb.Span { + switch i % 10 { + case 0: + return roachpb.Span{Key: key(i)} + case 1: + return roachpb.Span{Key: key(i), EndKey: key(i).Next()} + case 2: + return roachpb.Span{Key: key(i), EndKey: key(i + 64)} + default: + return roachpb.Span{Key: key(i), EndKey: key(i + 4)} + } +} + +func spanWithEnd(start, end int) roachpb.Span { + if start < end { + return roachpb.Span{Key: key(start), EndKey: key(end)} + } else if start == end { + return roachpb.Span{Key: key(start)} + } else { + panic("illegal span") + } +} + +func randomSpan(rng *rand.Rand, n int) roachpb.Span { + start := rng.Intn(n) + end := rng.Intn(n + 1) + if end < start { + start, end = end, start + } + return spanWithEnd(start, end) +} + +func newCmd(s roachpb.Span) *cmd { + return &cmd{span: s} } func checkIter(t *testing.T, it iterator, 
start, end int) { - t.Helper() i := start for it.First(); it.Valid(); it.Next() { cmd := it.Cmd() - expected := key(i) - if !expected.Equal(cmd.span.Key) { - t.Fatalf("expected %s, but found %s", expected, cmd.span.Key) + expected := span(i) + if !expected.Equal(cmd.span) { + t.Fatalf("expected %s, but found %s", expected, cmd.span) } i++ } @@ -97,14 +211,27 @@ func checkIter(t *testing.T, it iterator, start, end int) { for it.Last(); it.Valid(); it.Prev() { i-- cmd := it.Cmd() - expected := key(i) - if !expected.Equal(cmd.span.Key) { - t.Fatalf("expected %s, but found %s", expected, cmd.span.Key) + expected := span(i) + if !expected.Equal(cmd.span) { + t.Fatalf("expected %s, but found %s", expected, cmd.span) } } if i != start { t.Fatalf("expected %d, but at %d: %+v parent=%p", start, i, it, it.n.parent) } + + all := newCmd(spanWithEnd(start, end)) + for it.FirstOverlap(all); it.Valid(); it.NextOverlap() { + cmd := it.Cmd() + expected := span(i) + if !expected.Equal(cmd.span) { + t.Fatalf("expected %s, but found %s", expected, cmd.span) + } + i++ + } + if i != end { + t.Fatalf("expected %d, but at %d", end, i) + } } func TestBTree(t *testing.T) { @@ -114,19 +241,21 @@ func TestBTree(t *testing.T) { // there to be 3 levels in the tree. The count here is comfortably above // that. const count = 768 + // Add keys in sorted order. for i := 0; i < count; i++ { - tr.Set(newCmd(key(i))) - tr.Verify() + tr.Set(newCmd(span(i))) + tr.Verify(t) if e := i + 1; e != tr.Len() { t.Fatalf("expected length %d, but found %d", e, tr.Len()) } checkIter(t, tr.MakeIter(), 0, i+1) } + // Delete keys in sorted order. for i := 0; i < count; i++ { - tr.Delete(newCmd(key(i))) - tr.Verify() + tr.Delete(newCmd(span(i))) + tr.Verify(t) if e := count - (i + 1); e != tr.Len() { t.Fatalf("expected length %d, but found %d", e, tr.Len()) } @@ -135,17 +264,18 @@ func TestBTree(t *testing.T) { // Add keys in reverse sorted order. 
for i := 0; i < count; i++ { - tr.Set(newCmd(key(count - i))) - tr.Verify() + tr.Set(newCmd(span(count - i))) + tr.Verify(t) if e := i + 1; e != tr.Len() { t.Fatalf("expected length %d, but found %d", e, tr.Len()) } checkIter(t, tr.MakeIter(), count-i, count+1) } + // Delete keys in reverse sorted order. for i := 0; i < count; i++ { - tr.Delete(newCmd(key(count - i))) - tr.Verify() + tr.Delete(newCmd(span(count - i))) + tr.Verify(t) if e := count - (i + 1); e != tr.Len() { t.Fatalf("expected length %d, but found %d", e, tr.Len()) } @@ -158,131 +288,436 @@ func TestBTreeSeek(t *testing.T) { var tr btree for i := 0; i < count; i++ { - tr.Set(newCmd(key(i * 2))) + tr.Set(newCmd(span(i * 2))) } it := tr.MakeIter() for i := 0; i < 2*count-1; i++ { - it.SeekGE(newCmd(key(i))) + it.SeekGE(newCmd(span(i))) if !it.Valid() { t.Fatalf("%d: expected valid iterator", i) } cmd := it.Cmd() - expected := key(2 * ((i + 1) / 2)) - if !expected.Equal(cmd.span.Key) { - t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span.Key) + expected := span(2 * ((i + 1) / 2)) + if !expected.Equal(cmd.span) { + t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span) } } - it.SeekGE(newCmd(key(2*count - 1))) + it.SeekGE(newCmd(span(2*count - 1))) if it.Valid() { t.Fatalf("expected invalid iterator") } for i := 1; i < 2*count; i++ { - it.SeekLT(newCmd(key(i))) + it.SeekLT(newCmd(span(i))) if !it.Valid() { t.Fatalf("%d: expected valid iterator", i) } cmd := it.Cmd() - expected := key(2 * ((i - 1) / 2)) - if !expected.Equal(cmd.span.Key) { - t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span.Key) + expected := span(2 * ((i - 1) / 2)) + if !expected.Equal(cmd.span) { + t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span) } } - it.SeekLT(newCmd(key(0))) + it.SeekLT(newCmd(span(0))) if it.Valid() { t.Fatalf("expected invalid iterator") } } -func randomKey(rng *rand.Rand, b []byte) []byte { - key := rng.Uint32() - key2 := rng.Uint32() - 
binary.LittleEndian.PutUint32(b, key) - binary.LittleEndian.PutUint32(b[4:], key2) - return b +func TestBTreeSeekOverlap(t *testing.T) { + const count = 513 + const size = 2 * maxCmds + + var tr btree + for i := 0; i < count; i++ { + tr.Set(newCmd(spanWithEnd(i, i+size+1))) + } + + // Iterate over overlaps with a point scan. + it := tr.MakeIter() + for i := 0; i < count+size; i++ { + it.FirstOverlap(newCmd(spanWithEnd(i, i))) + for j := 0; j < size+1; j++ { + expStart := i - size + j + if expStart < 0 { + continue + } + if expStart >= count { + continue + } + + if !it.Valid() { + t.Fatalf("%d/%d: expected valid iterator", i, j) + } + cmd := it.Cmd() + expected := spanWithEnd(expStart, expStart+size+1) + if !expected.Equal(cmd.span) { + t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span) + } + + it.NextOverlap() + } + if it.Valid() { + t.Fatalf("%d: expected invalid iterator %v", i, it.Cmd()) + } + } + it.FirstOverlap(newCmd(span(count + size + 1))) + if it.Valid() { + t.Fatalf("expected invalid iterator") + } + + // Iterate over overlaps with a range scan. 
+ it = tr.MakeIter() + for i := 0; i < count+size; i++ { + it.FirstOverlap(newCmd(spanWithEnd(i, i+size+1))) + for j := 0; j < 2*size+1; j++ { + expStart := i - size + j + if expStart < 0 { + continue + } + if expStart >= count { + continue + } + + if !it.Valid() { + t.Fatalf("%d/%d: expected valid iterator", i, j) + } + cmd := it.Cmd() + expected := spanWithEnd(expStart, expStart+size+1) + if !expected.Equal(cmd.span) { + t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span) + } + + it.NextOverlap() + } + if it.Valid() { + t.Fatalf("%d: expected invalid iterator %v", i, it.Cmd()) + } + } + it.FirstOverlap(newCmd(span(count + size + 1))) + if it.Valid() { + t.Fatalf("expected invalid iterator") + } } -func BenchmarkIterSeekGE(b *testing.B) { - for _, count := range []int{16, 128, 1024} { - b.Run(fmt.Sprintf("count=%d", count), func(b *testing.B) { - var keys [][]byte - var tr btree +func TestBTreeSeekOverlapRandom(t *testing.T) { + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) - for i := 0; i < count; i++ { - key := []byte(fmt.Sprintf("%05d", i)) - keys = append(keys, key) - tr.Set(newCmd(key)) + const trials = 10 + for i := 0; i < trials; i++ { + var tr btree + + const count = 1000 + cmds := make([]*cmd, count) + cmdSpans := make([]int, count) + for j := 0; j < count; j++ { + var cmd *cmd + end := rng.Intn(count + 10) + if end <= j { + end = j + cmd = newCmd(spanWithEnd(j, end)) + } else { + cmd = newCmd(spanWithEnd(j, end+1)) } + tr.Set(cmd) + cmds[j] = cmd + cmdSpans[j] = end + } - rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) - it := tr.MakeIter() + const scanTrials = 100 + for j := 0; j < scanTrials; j++ { + var scanCmd *cmd + scanStart := rng.Intn(count) + scanEnd := rng.Intn(count + 10) + if scanEnd <= scanStart { + scanEnd = scanStart + scanCmd = newCmd(spanWithEnd(scanStart, scanEnd)) + } else { + scanCmd = newCmd(spanWithEnd(scanStart, scanEnd+1)) + } - b.ResetTimer() - for i := 0; i < b.N; i++ { - k := 
keys[rng.Intn(len(keys))] - it.SeekGE(newCmd(k)) - if testing.Verbose() { - if !it.Valid() { - b.Fatal("expected to find key") - } - if !bytes.Equal(k, it.Cmd().span.Key) { - b.Fatalf("expected %s, but found %s", k, it.Cmd().span.Key) - } + var exp, found []*cmd + for startKey, endKey := range cmdSpans { + if startKey <= scanEnd && endKey >= scanStart { + exp = append(exp, cmds[startKey]) } } + + it := tr.MakeIter() + it.FirstOverlap(scanCmd) + for it.Valid() { + found = append(found, it.Cmd()) + it.NextOverlap() + } + + require.Equal(t, len(exp), len(found), "search for %v", scanCmd.span) + } + } +} + +func TestBTreeCmp(t *testing.T) { + testCases := []struct { + spanA, spanB roachpb.Span + idA, idB int64 + exp int + }{ + { + spanA: roachpb.Span{Key: roachpb.Key("a")}, + spanB: roachpb.Span{Key: roachpb.Key("a")}, + idA: 1, + idB: 1, + exp: 0, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("a")}, + spanB: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("b")}, + idA: 1, + idB: 1, + exp: -1, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("c")}, + spanB: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("b")}, + idA: 1, + idB: 1, + exp: 1, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("c")}, + spanB: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("c")}, + idA: 1, + idB: 1, + exp: 0, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("a")}, + spanB: roachpb.Span{Key: roachpb.Key("a")}, + idA: 1, + idB: 2, + exp: -1, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("a")}, + spanB: roachpb.Span{Key: roachpb.Key("a")}, + idA: 2, + idB: 1, + exp: 1, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("b")}, + spanB: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("c")}, + idA: 1, + idB: 1, + exp: 1, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("b"), EndKey: roachpb.Key("e")}, + spanB: roachpb.Span{Key: roachpb.Key("c"), EndKey: roachpb.Key("d")}, + idA: 1, + idB: 1, + exp: -1, + }, + } + for 
_, tc := range testCases { + name := fmt.Sprintf("cmp(%s:%d,%s:%d)", tc.spanA, tc.idA, tc.spanB, tc.idB) + t.Run(name, func(t *testing.T) { + cmdA := &cmd{id: tc.idA, span: tc.spanA} + cmdB := &cmd{id: tc.idB, span: tc.spanB} + require.Equal(t, tc.exp, cmp(cmdA, cmdB)) }) } } -func BenchmarkIterSeekLT(b *testing.B) { - for _, count := range []int{16, 128, 1024} { +////////////////////////////////////////// +// Benchmarks // +////////////////////////////////////////// + +// perm returns a random permutation of cmds with spans in the range [0, n). +func perm(n int) (out []*cmd) { + for _, i := range rand.Perm(n) { + out = append(out, newCmd(spanWithEnd(i, i+1))) + } + return out +} + +func forBenchmarkSizes(b *testing.B, f func(b *testing.B, count int)) { + for _, count := range []int{16, 128, 1024, 8192, 65536} { b.Run(fmt.Sprintf("count=%d", count), func(b *testing.B) { - var keys [][]byte + f(b, count) + }) + } +} + +func BenchmarkBTreeInsert(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(count) + b.ResetTimer() + for i := 0; i < b.N; { var tr btree + for _, cmd := range insertP { + tr.Set(cmd) + i++ + if i >= b.N { + return + } + } + } + }) +} - for i := 0; i < count; i++ { - key := []byte(fmt.Sprintf("%05d", i)) - keys = append(keys, key) - tr.Set(newCmd(key)) +func BenchmarkBTreeDelete(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP, removeP := perm(count), perm(count) + b.ResetTimer() + for i := 0; i < b.N; { + b.StopTimer() + var tr btree + for _, cmd := range insertP { + tr.Set(cmd) + } + b.StartTimer() + for _, cmd := range removeP { + tr.Delete(cmd) + i++ + if i >= b.N { + return + } } + if tr.Len() > 0 { + b.Fatalf("tree not empty: %s", &tr) + } + } + }) +} - rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) - it := tr.MakeIter() +func BenchmarkBTreeDeleteInsert(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(count) + var tr btree + 
for _, cmd := range insertP { + tr.Set(cmd) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + cmd := insertP[i%count] + tr.Delete(cmd) + tr.Set(cmd) + } + }) +} + +func BenchmarkBTreeIterSeekGE(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + var spans []roachpb.Span + var tr btree + + for i := 0; i < count; i++ { + s := span(i) + spans = append(spans, s) + tr.Set(newCmd(s)) + } + + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + it := tr.MakeIter() - b.ResetTimer() - for i := 0; i < b.N; i++ { - j := rng.Intn(len(keys)) - k := keys[j] - it.SeekLT(newCmd(k)) - if testing.Verbose() { - if j == 0 { - if it.Valid() { - b.Fatal("unexpected key") - } - } else { - if !it.Valid() { - b.Fatal("expected to find key") - } - k := keys[j-1] - if !bytes.Equal(k, it.Cmd().span.Key) { - b.Fatalf("expected %s, but found %s", k, it.Cmd().span.Key) - } + b.ResetTimer() + for i := 0; i < b.N; i++ { + s := spans[rng.Intn(len(spans))] + it.SeekGE(newCmd(s)) + if testing.Verbose() { + if !it.Valid() { + b.Fatal("expected to find key") + } + if !s.Equal(it.Cmd().span) { + b.Fatalf("expected %s, but found %s", s, it.Cmd().span) + } + } + } + }) +} + +func BenchmarkBTreeIterSeekLT(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + var spans []roachpb.Span + var tr btree + + for i := 0; i < count; i++ { + s := span(i) + spans = append(spans, s) + tr.Set(newCmd(s)) + } + + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + it := tr.MakeIter() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + j := rng.Intn(len(spans)) + s := spans[j] + it.SeekLT(newCmd(s)) + if testing.Verbose() { + if j == 0 { + if it.Valid() { + b.Fatal("unexpected key") + } + } else { + if !it.Valid() { + b.Fatal("expected to find key") + } + s := spans[j-1] + if !s.Equal(it.Cmd().span) { + b.Fatalf("expected %s, but found %s", s, it.Cmd().span) } } } - }) - } + } + }) } -func BenchmarkIterNext(b *testing.B) { - buf := make([]byte, 64<<10) +func 
BenchmarkBTreeIterFirstOverlap(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + var spans []roachpb.Span + var cmds []*cmd + var tr btree + + for i := 0; i < count; i++ { + s := spanWithEnd(i, i+1) + spans = append(spans, s) + cmd := newCmd(s) + cmds = append(cmds, cmd) + tr.Set(cmd) + } + + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + it := tr.MakeIter() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + j := rng.Intn(len(spans)) + s := spans[j] + cmd := cmds[j] + it.FirstOverlap(cmd) + if testing.Verbose() { + if !it.Valid() { + b.Fatal("expected to find key") + } + if !s.Equal(it.Cmd().span) { + b.Fatalf("expected %s, but found %s", s, it.Cmd().span) + } + } + } + }) +} + +func BenchmarkBTreeIterNext(b *testing.B) { var tr btree - rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) - for i := 0; i < len(buf); i += 8 { - key := randomKey(rng, buf[i:i+8]) - tr.Set(newCmd(key)) + const count = 8 << 10 + const size = 2 * maxCmds + for i := 0; i < count; i++ { + cmd := newCmd(spanWithEnd(i, i+size+1)) + tr.Set(cmd) } it := tr.MakeIter() @@ -295,22 +730,65 @@ func BenchmarkIterNext(b *testing.B) { } } -func BenchmarkIterPrev(b *testing.B) { - buf := make([]byte, 64<<10) +func BenchmarkBTreeIterPrev(b *testing.B) { var tr btree - rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) - for i := 0; i < len(buf); i += 8 { - key := randomKey(rng, buf[i:i+8]) - tr.Set(newCmd(key)) + const count = 8 << 10 + const size = 2 * maxCmds + for i := 0; i < count; i++ { + cmd := newCmd(spanWithEnd(i, i+size+1)) + tr.Set(cmd) } it := tr.MakeIter() b.ResetTimer() for i := 0; i < b.N; i++ { if !it.Valid() { - it.Last() + it.First() } it.Prev() } } + +func BenchmarkBTreeIterNextOverlap(b *testing.B) { + var tr btree + + const count = 8 << 10 + const size = 2 * maxCmds + for i := 0; i < count; i++ { + cmd := newCmd(spanWithEnd(i, i+size+1)) + tr.Set(cmd) + } + + allCmd := newCmd(spanWithEnd(0, count+1)) + it := tr.MakeIter() + 
b.ResetTimer() + for i := 0; i < b.N; i++ { + if !it.Valid() { + it.FirstOverlap(allCmd) + } + it.NextOverlap() + } +} + +func BenchmarkBTreeIterOverlapScan(b *testing.B) { + var tr btree + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + + const count = 8 << 10 + const size = 2 * maxCmds + for i := 0; i < count; i++ { + tr.Set(newCmd(spanWithEnd(i, i+size+1))) + } + + cmd := new(cmd) + b.ResetTimer() + for i := 0; i < b.N; i++ { + cmd.span = randomSpan(rng, count) + it := tr.MakeIter() + it.FirstOverlap(cmd) + for it.Valid() { + it.NextOverlap() + } + } +} diff --git a/pkg/util/interval/btree_based_interval_test.go b/pkg/util/interval/btree_based_interval_test.go index 0af8a8a2c8c4..ab518f2a543c 100644 --- a/pkg/util/interval/btree_based_interval_test.go +++ b/pkg/util/interval/btree_based_interval_test.go @@ -555,7 +555,6 @@ func TestSmallTree(t *testing.T) { } checkWithLen(t, tree, i+1) } - return checkTraversal(t, tree, ivs) checkIterator(t, tree, ivs) @@ -708,142 +707,217 @@ func TestIterator(t *testing.T) { checkIterator(t, tree, ivs) } -const benchmarkTreeSize = 10000 +func forBenchmarkSizes(b *testing.B, f func(b *testing.B, count int)) { + for _, count := range []int{16, 128, 1024, 8192, 65536} { + b.Run(fmt.Sprintf("count=%d", count), func(b *testing.B) { + f(b, count) + }) + } +} func BenchmarkBTreeInsert(b *testing.B) { - b.StopTimer() - insertP := perm(benchmarkTreeSize) - b.StartTimer() - i := 0 - for i < b.N { - tr := newBTree(InclusiveOverlapper) - for _, item := range insertP { - tr.Insert(item, false) - i++ - if i >= b.N { - return + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) + b.ResetTimer() + i := 0 + for i < b.N { + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + i++ + if i >= b.N { + return + } } } - } -} - -func BenchmarkBTreeDeleteInsert(b *testing.B) { - b.StopTimer() - insertP := 
perm(benchmarkTreeSize) - tr := newBTree(InclusiveOverlapper) - for _, item := range insertP { - tr.Insert(item, false) - } - b.StartTimer() - for i := 0; i < b.N; i++ { - tr.Delete(insertP[i%benchmarkTreeSize], false) - tr.Insert(insertP[i%benchmarkTreeSize], false) - } -} - -func BenchmarkBTreeDeleteInsertCloneOnce(b *testing.B) { - b.StopTimer() - insertP := perm(benchmarkTreeSize) - tr := newBTree(InclusiveOverlapper) - for _, item := range insertP { - tr.Insert(item, false) - } - tr = tr.cloneInternal() - b.StartTimer() - for i := 0; i < b.N; i++ { - tr.Delete(insertP[i%benchmarkTreeSize], false) - tr.Insert(insertP[i%benchmarkTreeSize], false) - } + }) } -func BenchmarkBTreeDeleteInsertCloneEachTime(b *testing.B) { - b.StopTimer() - insertP := perm(benchmarkTreeSize) - tr := newBTree(InclusiveOverlapper) - for _, item := range insertP { - tr.Insert(item, false) - } - b.StartTimer() - for i := 0; i < b.N; i++ { - tr = tr.cloneInternal() - tr.Delete(insertP[i%benchmarkTreeSize], false) - tr.Insert(insertP[i%benchmarkTreeSize], false) - } +func BenchmarkBTreeDelete(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP, removeP := perm(uint32(count)), perm(uint32(count)) + b.ResetTimer() + i := 0 + for i < b.N { + b.StopTimer() + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + } + b.StartTimer() + for _, item := range removeP { + if err := tr.Delete(item, false); err != nil { + b.Fatal(err) + } + i++ + if i >= b.N { + return + } + } + if tr.Len() > 0 { + panic(tr.Len()) + } + } + }) } -func BenchmarkBTreeDelete(b *testing.B) { - b.StopTimer() - insertP := perm(benchmarkTreeSize) - removeP := perm(benchmarkTreeSize) - b.StartTimer() - i := 0 - for i < b.N { - b.StopTimer() +func BenchmarkBTreeDeleteInsert(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) tr := newBTree(InclusiveOverlapper) for 
_, item := range insertP { if err := tr.Insert(item, false); err != nil { b.Fatal(err) } } - b.StartTimer() - for _, item := range removeP { - tr.Delete(item, false) - i++ - if i >= b.N { - return + b.ResetTimer() + for i := 0; i < b.N; i++ { + if err := tr.Delete(insertP[i%count], false); err != nil { + b.Fatal(err) + } + if err := tr.Insert(insertP[i%count], false); err != nil { + b.Fatal(err) } } - if tr.Len() > 0 { - panic(tr.Len()) - } - } + }) } -func BenchmarkBTreeGet(b *testing.B) { - b.StopTimer() - insertP := perm(benchmarkTreeSize) - removeP := perm(benchmarkTreeSize) - b.StartTimer() - i := 0 - for i < b.N { - b.StopTimer() +func BenchmarkBTreeDeleteInsertCloneOnce(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) tr := newBTree(InclusiveOverlapper) for _, item := range insertP { if err := tr.Insert(item, false); err != nil { b.Fatal(err) } } - b.StartTimer() - for _, item := range removeP { - tr.Get(item.Range()) - i++ - if i >= b.N { - return + tr = tr.cloneInternal() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if err := tr.Delete(insertP[i%count], false); err != nil { + b.Fatal(err) + } + if err := tr.Insert(insertP[i%count], false); err != nil { + b.Fatal(err) } } - } + }) } -func BenchmarkBTreeGetCloneEachTime(b *testing.B) { - b.StopTimer() - insertP := perm(benchmarkTreeSize) - removeP := perm(benchmarkTreeSize) - b.StartTimer() - i := 0 - for i < b.N { - b.StopTimer() +func BenchmarkBTreeDeleteInsertCloneEachTime(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) tr := newBTree(InclusiveOverlapper) - for _, v := range insertP { - tr.Insert(v, false) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } } - b.StartTimer() - for _, item := range removeP { + b.ResetTimer() + for i := 0; i < b.N; i++ { tr = tr.cloneInternal() - tr.Get(item.Range()) - i++ - if i >= b.N { - return + if err := 
tr.Delete(insertP[i%count], false); err != nil { + b.Fatal(err) + } + if err := tr.Insert(insertP[i%count], false); err != nil { + b.Fatal(err) + } + } + }) +} + +func BenchmarkBTreeGet(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) + removeP := perm(uint32(count)) + b.ResetTimer() + i := 0 + for i < b.N { + b.StopTimer() + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + } + b.StartTimer() + for _, item := range removeP { + tr.Get(item.Range()) + i++ + if i >= b.N { + return + } } } + }) +} + +func BenchmarkBTreeGetCloneEachTime(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) + removeP := perm(uint32(count)) + b.ResetTimer() + i := 0 + for i < b.N { + b.StopTimer() + tr := newBTree(InclusiveOverlapper) + for _, v := range insertP { + if err := tr.Insert(v, false); err != nil { + b.Fatal(err) + } + } + b.StartTimer() + for _, item := range removeP { + tr = tr.cloneInternal() + tr.Get(item.Range()) + i++ + if i >= b.N { + return + } + } + } + }) +} + +func key(i int) Comparable { + return []byte(fmt.Sprintf("%04d", i)) +} + +func rangeWithEnd(start, end int) Range { + return Range{Start: key(start), End: key(end)} +} + +func randomRange(rng *rand.Rand, n int) Range { + start := rng.Intn(n) + end := rng.Intn(n + 1) + if end < start { + start, end = end, start + } + return rangeWithEnd(start, end) +} + +func BenchmarkBTreeOverlapScan(b *testing.B) { + tr := newBTree(InclusiveOverlapper) + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + + const count = 8 << 10 + const size = 2 * 31 + for i := 0; i < count; i++ { + iv := &Interval{rangeWithEnd(i, i+size+1), uintptr(i)} + if err := tr.Insert(iv, false); err != nil { + b.Fatal(err) + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + cmd := randomRange(rng, count) + tr.DoMatching(func(e Interface) bool { 
+ return false + }, cmd) } } From a10a9cee904aa75bfeb0b9b457cce6246052a8ef Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Thu, 8 Nov 2018 19:51:31 -0500 Subject: [PATCH 10/11] storage/cmdq: remove parent pointers and position fields This commit rips out `node.parent` and `node.pos`. These won't work with a copy-on-write strategy. This makes the iterator code a little more complicated because the iterator now needs to maintain a stack of nodes and positions. We avoid any allocations for this stack for trees up to height 4 (~130k items) Because of this, the change does not seem to have any effect on the benchmarks. Theoretically, it should actually speed up mutations to the tree. Release note: None --- pkg/storage/cmdq/interval_btree.go | 141 ++++++++++++------------ pkg/storage/cmdq/interval_btree_test.go | 15 +-- 2 files changed, 72 insertions(+), 84 deletions(-) diff --git a/pkg/storage/cmdq/interval_btree.go b/pkg/storage/cmdq/interval_btree.go index f1b6cc55d4d2..f01e2cbc0f82 100644 --- a/pkg/storage/cmdq/interval_btree.go +++ b/pkg/storage/cmdq/interval_btree.go @@ -107,12 +107,10 @@ func upperBound(c *cmd) keyBound { } type leafNode struct { - parent *node - max keyBound - pos int16 - count int16 - leaf bool - cmds [maxCmds]*cmd + max keyBound + count int16 + leaf bool + cmds [maxCmds]*cmd } func newLeafNode() *node { @@ -124,25 +122,16 @@ type node struct { children [maxCmds + 1]*node } -func (n *node) updatePos(start, end int) { - for i := start; i < end; i++ { - n.children[i].pos = int16(i) - } -} - func (n *node) insertAt(index int, c *cmd, nd *node) { if index < int(n.count) { copy(n.cmds[index+1:n.count+1], n.cmds[index:n.count]) if !n.leaf { copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1]) - n.updatePos(index+2, int(n.count+2)) } } n.cmds[index] = c if !n.leaf { n.children[index+1] = nd - nd.parent = n - nd.pos = int16(index + 1) } n.count++ } @@ -151,8 +140,6 @@ func (n *node) pushBack(c *cmd, nd *node) { n.cmds[n.count] = 
c if !n.leaf { n.children[n.count+1] = nd - nd.parent = n - nd.pos = n.count + 1 } n.count++ } @@ -160,10 +147,7 @@ func (n *node) pushBack(c *cmd, nd *node) { func (n *node) pushFront(c *cmd, nd *node) { if !n.leaf { copy(n.children[1:n.count+2], n.children[:n.count+1]) - n.updatePos(1, int(n.count+2)) n.children[0] = nd - nd.parent = n - nd.pos = 0 } copy(n.cmds[1:n.count+1], n.cmds[:n.count]) n.cmds[0] = c @@ -177,7 +161,6 @@ func (n *node) removeAt(index int) (*cmd, *node) { if !n.leaf { child = n.children[index+1] copy(n.children[index+1:n.count], n.children[index+2:n.count+1]) - n.updatePos(index+1, int(n.count)) n.children[n.count] = nil } n.count-- @@ -207,7 +190,6 @@ func (n *node) popFront() (*cmd, *node) { if !n.leaf { child = n.children[0] copy(n.children[:n.count+1], n.children[1:n.count+2]) - n.updatePos(0, int(n.count+1)) n.children[n.count+1] = nil } out := n.cmds[0] @@ -277,10 +259,6 @@ func (n *node) split(i int) (*cmd, *node) { for j := int16(i + 1); j <= n.count; j++ { n.children[j] = nil } - for j := int16(0); j <= next.count; j++ { - next.children[j].parent = next - next.children[j].pos = j - } } n.count = int16(i) @@ -493,10 +471,6 @@ func (n *node) rebalanceOrMerge(i int) { copy(child.cmds[child.count+1:], mergeChild.cmds[:mergeChild.count]) if !child.leaf { copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1]) - for i := int16(0); i <= mergeChild.count; i++ { - mergeChild.children[i].parent = child - } - child.updatePos(int(child.count+1), int(child.count+mergeChild.count+2)) } child.count += mergeChild.count + 1 @@ -593,7 +567,6 @@ func (t *btree) Delete(c *cmd) { } if t.root.count == 0 && !t.root.leaf { t.root = t.root.children[0] - t.root.parent = nil } } @@ -610,10 +583,6 @@ func (t *btree) Set(c *cmd) { newRoot.children[0] = t.root newRoot.children[1] = splitNode newRoot.max = newRoot.findUpperBound() - t.root.parent = newRoot - t.root.pos = 0 - splitNode.parent = newRoot - splitNode.pos = 1 t.root = newRoot } 
if replaced, _ := t.root.insert(c); !replaced { @@ -621,8 +590,9 @@ func (t *btree) Set(c *cmd) { } } -// MakeIter returns a new iterator object. Note that it is safe for an -// iterator to be copied by value. +// MakeIter returns a new iterator object. It is not safe to continue using an +// iterator after modifications are made to the tree. If modifications are made, +// create a new iterator. func (t *btree) MakeIter() iterator { return iterator{t: t, pos: -1} } @@ -679,19 +649,53 @@ func (n *node) writeString(b *strings.Builder) { // iterator is responsible for search and traversal within a btree. type iterator struct { - t *btree + t *btree + n *node + pos int16 + s []iterFrame + sBuf [3]iterFrame // avoids allocation of s up to height 4 + o overlapScan +} + +type iterFrame struct { n *node pos int16 - o overlapScan +} + +func (i *iterator) reset() { + i.n = i.t.root + i.s = i.s[:0] + i.pos = -1 + i.o = overlapScan{} +} + +// descend descends into the node's child at position pos. It maintains a +// stack of (parent, position) pairs so that the tree can be ascended again +// later. +func (i *iterator) descend(n *node, pos int16) { + if i.s == nil { + i.s = i.sBuf[:0] + } + i.s = append(i.s, iterFrame{n: n, pos: pos}) + i.n = n.children[pos] + i.pos = 0 +} + +// ascend ascends up to the current node's parent and resets the position +// to the one previously set for this parent node. +func (i *iterator) ascend() { + f := i.s[len(i.s)-1] + i.s = i.s[:len(i.s)-1] + i.n = f.n + i.pos = f.pos } // SeekGE seeks to the first cmd greater-than or equal to the provided cmd. func (i *iterator) SeekGE(c *cmd) { - i.n = i.t.root + i.reset() if i.n == nil { return } - i.o = overlapScan{} for { pos, found := i.n.find(c) i.pos = int16(pos) @@ -704,17 +708,16 @@ func (i *iterator) SeekGE(c *cmd) { } return } - i.n = i.n.children[i.pos] + i.descend(i.n, i.pos) } } // SeekLT seeks to the first cmd less-than the provided cmd. 
func (i *iterator) SeekLT(c *cmd) { - i.n = i.t.root + i.reset() if i.n == nil { return } - i.o = overlapScan{} for { pos, found := i.n.find(c) i.pos = int16(pos) @@ -722,33 +725,31 @@ func (i *iterator) SeekLT(c *cmd) { i.Prev() return } - i.n = i.n.children[i.pos] + i.descend(i.n, i.pos) } } // First seeks to the first cmd in the btree. func (i *iterator) First() { - i.n = i.t.root + i.reset() if i.n == nil { return } for !i.n.leaf { - i.n = i.n.children[0] + i.descend(i.n, 0) } - i.o = overlapScan{} i.pos = 0 } // Last seeks to the last cmd in the btree. func (i *iterator) Last() { - i.n = i.t.root + i.reset() if i.n == nil { return } for !i.n.leaf { - i.n = i.n.children[i.n.count] + i.descend(i.n, i.n.count) } - i.o = overlapScan{} i.pos = i.n.count - 1 } @@ -764,16 +765,15 @@ func (i *iterator) Next() { if i.pos < i.n.count { return } - for i.n.parent != nil && i.pos >= i.n.count { - i.pos = i.n.pos - i.n = i.n.parent + for len(i.s) > 0 && i.pos >= i.n.count { + i.ascend() } return } - i.n = i.n.children[i.pos+1] + i.descend(i.n, i.pos+1) for !i.n.leaf { - i.n = i.n.children[0] + i.descend(i.n, 0) } i.pos = 0 } @@ -790,16 +790,16 @@ func (i *iterator) Prev() { if i.pos >= 0 { return } - for i.n.parent != nil && i.pos < 0 { - i.pos = i.n.pos - 1 - i.n = i.n.parent + for len(i.s) > 0 && i.pos < 0 { + i.ascend() + i.pos-- } return } - i.n = i.n.children[i.pos] + i.descend(i.n, i.pos) for !i.n.leaf { - i.n = i.n.children[i.n.count] + i.descend(i.n, i.n.count) } i.pos = i.n.count - 1 } @@ -875,7 +875,7 @@ type overlapScan struct { // FirstOverlap seeks to the first cmd in the btree that overlaps with the // provided search cmd. 
func (i *iterator) FirstOverlap(c *cmd) { - i.n = i.t.root + i.reset() if i.n == nil { return } @@ -902,8 +902,9 @@ func (i *iterator) NextOverlap() { } func (i *iterator) constrainMinSearchBounds() { + k := i.o.c.span.Key j := sort.Search(int(i.n.count), func(j int) bool { - return bytes.Compare(i.o.c.span.Key, i.n.cmds[j].span.Key) <= 0 + return bytes.Compare(k, i.n.cmds[j].span.Key) <= 0 }) i.o.constrMinN = i.n i.o.constrMinPos = int16(j) @@ -922,23 +923,23 @@ func (i *iterator) findNextOverlap() { for { if i.pos > i.n.count { // Iterate up tree. - if i.n.parent == nil { + if len(i.s) == 0 { // Should have already hit upper-bound constraint. panic("unreachable") } - i.pos = i.n.pos - i.n = i.n.parent + i.ascend() } else if !i.n.leaf { // Iterate down tree. if i.o.constrMinReached || i.n.children[i.pos].max.contains(i.o.c) { - i.n = i.n.children[i.pos] - i.pos = 0 + par := i.n + pos := i.pos + i.descend(par, pos) // Refine the constraint bounds, if necessary. - if i.n.parent == i.o.constrMinN && i.n.pos == i.o.constrMinPos { + if par == i.o.constrMinN && pos == i.o.constrMinPos { i.constrainMinSearchBounds() } - if i.n.parent == i.o.constrMaxN && i.n.pos == i.o.constrMaxPos { + if par == i.o.constrMaxN && pos == i.o.constrMaxPos { i.constrainMaxSearchBounds() } continue diff --git a/pkg/storage/cmdq/interval_btree_test.go b/pkg/storage/cmdq/interval_btree_test.go index 8f13543c29d0..e544c603d605 100644 --- a/pkg/storage/cmdq/interval_btree_test.go +++ b/pkg/storage/cmdq/interval_btree_test.go @@ -35,7 +35,6 @@ func (t *btree) Verify(tt *testing.T) { return } t.verifyLeafSameDepth(tt) - t.verifyParentAndPosSet(tt) t.verifyCountAllowed(tt) t.isSorted(tt) t.isUpperBoundCorrect(tt) @@ -55,18 +54,6 @@ func (n *node) verifyDepthEqualToHeight(t *testing.T, depth, height int) { }) } -func (t *btree) verifyParentAndPosSet(tt *testing.T) { - t.root.verifyParentAndPosSet(tt, nil, 0) -} - -func (n *node) verifyParentAndPosSet(t *testing.T, par *node, pos int16) { - 
require.Equal(t, par, n.parent) - require.Equal(t, pos, n.pos) - n.recurse(func(child *node, pos int16) { - child.verifyParentAndPosSet(t, n, pos) - }) -} - func (t *btree) verifyCountAllowed(tt *testing.T) { t.root.verifyCountAllowed(tt, true) } @@ -217,7 +204,7 @@ func checkIter(t *testing.T, it iterator, start, end int) { } } if i != start { - t.Fatalf("expected %d, but at %d: %+v parent=%p", start, i, it, it.n.parent) + t.Fatalf("expected %d, but at %d: %+v", start, i, it) } all := newCmd(spanWithEnd(start, end)) From cc9c2276dff7cd703a68875dcf7f71d4047ccb42 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Wed, 14 Nov 2018 20:53:53 -0500 Subject: [PATCH 11/11] storage/cmdq: prevent iterator from escaping to heap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The iterator returned by `tree.MakeIter` was escaping to the heap for two reasons: 1. it was capturing a reference to the tree itself. 2. it was pointing a slice into its own array. This change addresses both of these problems and prevents the iterator from escaping when used. The fixes were: 1. copy the tree's root pointer reference instead of a reference to the tree itself. 2. avoid creating the self-referential slice reference. This mistakenly escapes because of https://github.com/golang/go/issues/7921, which also caused issues with `bytes.Buffer` (https://golang.org/cl/133715). 
This change also adds a new benchmark which demonstrates whether `MakeIter` escapes or not: ``` name old time/op new time/op delta BTreeMakeIter-4 131ns ±14% 25ns ± 1% -81.23% (p=0.000 n=9+9) name old alloc/op new alloc/op delta BTreeMakeIter-4 144B ± 0% 0B -100.00% (p=0.000 n=10+10) name old allocs/op new allocs/op delta BTreeMakeIter-4 1.00 ± 0% 0.00 -100.00% (p=0.000 n=10+10) ``` Release note: None --- pkg/storage/cmdq/interval_btree.go | 88 ++++++++++++++++++------- pkg/storage/cmdq/interval_btree_test.go | 24 +++++++ 2 files changed, 87 insertions(+), 25 deletions(-) diff --git a/pkg/storage/cmdq/interval_btree.go b/pkg/storage/cmdq/interval_btree.go index f01e2cbc0f82..0f144948eb0a 100644 --- a/pkg/storage/cmdq/interval_btree.go +++ b/pkg/storage/cmdq/interval_btree.go @@ -594,7 +594,7 @@ func (t *btree) Set(c *cmd) { // iterator after modifications are made to the tree. If modifications are made, // create a new iterator. func (t *btree) MakeIter() iterator { - return iterator{t: t, pos: -1} + return iterator{r: t.root, pos: -1} } // Height returns the height of the tree. @@ -647,36 +647,79 @@ func (n *node) writeString(b *strings.Builder) { } } -// iterator is responsible for search and traversal within a btree. -type iterator struct { - t *btree - n *node - pos int16 +// iterStack represents a stack of (node, pos) tuples, which captures +// iteration state as an iterator descends a btree. +type iterStack struct { + a iterStackArr + aLen int16 // -1 when using s s []iterFrame - sBuf [3]iterFrame // avoids allocation of s up to height 4 - o overlapScan } +// Used to avoid allocations for stacks below a certain size. 
+type iterStackArr [3]iterFrame + type iterFrame struct { n *node pos int16 } +func (is *iterStack) push(f iterFrame) { + if is.aLen == -1 { + is.s = append(is.s, f) + } else if int(is.aLen) == len(is.a) { + is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen)) + copy(is.s, is.a[:]) + is.s[int(is.aLen)] = f + is.aLen = -1 + } else { + is.a[is.aLen] = f + is.aLen++ + } +} + +func (is *iterStack) pop() iterFrame { + if is.aLen == -1 { + f := is.s[len(is.s)-1] + is.s = is.s[:len(is.s)-1] + return f + } + is.aLen-- + return is.a[is.aLen] +} + +func (is *iterStack) len() int { + if is.aLen == -1 { + return len(is.s) + } + return int(is.aLen) +} + +func (is *iterStack) reset() { + if is.aLen == -1 { + is.s = is.s[:0] + } else { + is.aLen = 0 + } +} + +// iterator is responsible for search and traversal within a btree. +type iterator struct { + r *node + n *node + pos int16 + s iterStack + o overlapScan +} + func (i *iterator) reset() { - i.n = i.t.root - i.s = i.s[:0] + i.n = i.r i.pos = -1 + i.s.reset() i.o = overlapScan{} } -// descend descends into the node's child at position pos. It maintains a -// stack of (parent, position) pairs so that the tree can be ascended again -// later. func (i *iterator) descend(n *node, pos int16) { - if i.s == nil { - i.s = i.sBuf[:0] - } - i.s = append(i.s, iterFrame{n: n, pos: pos}) + i.s.push(iterFrame{n: n, pos: pos}) i.n = n.children[pos] i.pos = 0 } @@ -684,8 +727,7 @@ func (i *iterator) descend(n *node, pos int16) { // ascend ascends up to the current node's parent and resets the position // to the one previously set for this parent node. 
func (i *iterator) ascend() { - f := i.s[len(i.s)-1] - i.s = i.s[:len(i.s)-1] + f := i.s.pop() i.n = f.n i.pos = f.pos } @@ -765,7 +807,7 @@ func (i *iterator) Next() { if i.pos < i.n.count { return } - for len(i.s) > 0 && i.pos >= i.n.count { + for i.s.len() > 0 && i.pos >= i.n.count { i.ascend() } return @@ -790,7 +832,7 @@ func (i *iterator) Prev() { if i.pos >= 0 { return } - for len(i.s) > 0 && i.pos < 0 { + for i.s.len() > 0 && i.pos < 0 { i.ascend() i.pos-- } @@ -923,10 +965,6 @@ func (i *iterator) findNextOverlap() { for { if i.pos > i.n.count { // Iterate up tree. - if len(i.s) == 0 { - // Should have already hit upper-bound constraint. - panic("unreachable") - } i.ascend() } else if !i.n.leaf { // Iterate down tree. diff --git a/pkg/storage/cmdq/interval_btree_test.go b/pkg/storage/cmdq/interval_btree_test.go index e544c603d605..693787c5d86c 100644 --- a/pkg/storage/cmdq/interval_btree_test.go +++ b/pkg/storage/cmdq/interval_btree_test.go @@ -516,6 +516,22 @@ func TestBTreeCmp(t *testing.T) { } } +func TestIterStack(t *testing.T) { + f := func(i int) iterFrame { return iterFrame{pos: int16(i)} } + var is iterStack + for i := 1; i <= 2*len(iterStackArr{}); i++ { + var j int + for j = 0; j < i; j++ { + is.push(f(j)) + } + require.Equal(t, j, is.len()) + for j--; j >= 0; j-- { + require.Equal(t, f(j), is.pop()) + } + is.reset() + } +} + ////////////////////////////////////////// // Benchmarks // ////////////////////////////////////////// @@ -594,6 +610,14 @@ func BenchmarkBTreeDeleteInsert(b *testing.B) { }) } +func BenchmarkBTreeMakeIter(b *testing.B) { + var tr btree + for i := 0; i < b.N; i++ { + it := tr.MakeIter() + it.First() + } +} + func BenchmarkBTreeIterSeekGE(b *testing.B) { forBenchmarkSizes(b, func(b *testing.B, count int) { var spans []roachpb.Span