diff --git a/pkg/storage/cmdq/interval_btree.go b/pkg/storage/cmdq/interval_btree.go new file mode 100644 index 000000000000..0f144948eb0a --- /dev/null +++ b/pkg/storage/cmdq/interval_btree.go @@ -0,0 +1,1013 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package cmdq + +import ( + "bytes" + "sort" + "strings" + "unsafe" + + "github.com/cockroachdb/cockroach/pkg/roachpb" +) + +// TODO(nvanbenschoten): +// 2. Add synchronized node and leafNode freelists +// 3. Introduce immutability and a copy-on-write policy: +// 4. Describe pedigree, changes, etc. of this implementation + +const ( + degree = 16 + maxCmds = 2*degree - 1 + minCmds = degree - 1 +) + +// TODO(nvanbenschoten): remove. +type cmd struct { + id int64 + span roachpb.Span +} + +// cmp returns a value indicating the sort order relationship between +// a and b. The comparison is performed lexicographically on +// (a.span.Key, a.span.EndKey, a.id) +// and +// (b.span.Key, b.span.EndKey, b.id) +// tuples. 
+// +// Given c = cmp(a, b): +// +// c == -1 if (a.span.Key, a.span.EndKey, a.id) < (b.span.Key, b.span.EndKey, b.id) +// c == 0 if (a.span.Key, a.span.EndKey, a.id) == (b.span.Key, b.span.EndKey, b.id) +// c == 1 if (a.span.Key, a.span.EndKey, a.id) > (b.span.Key, b.span.EndKey, b.id) +// +func cmp(a, b *cmd) int { + c := bytes.Compare(a.span.Key, b.span.Key) + if c != 0 { + return c + } + c = bytes.Compare(a.span.EndKey, b.span.EndKey) + if c != 0 { + return c + } + if a.id < b.id { + return -1 + } else if a.id > b.id { + return 1 + } else { + return 0 + } +} + +// keyBound represents the upper-bound of a key range. +type keyBound struct { + key roachpb.Key + inc bool +} + +func (b keyBound) compare(o keyBound) int { + c := bytes.Compare(b.key, o.key) + if c != 0 { + return c + } + if b.inc == o.inc { + return 0 + } + if b.inc { + return 1 + } + return -1 +} + +func (b keyBound) contains(a *cmd) bool { + c := bytes.Compare(a.span.Key, b.key) + if c == 0 { + return b.inc + } + return c < 0 +} + +func upperBound(c *cmd) keyBound { + if len(c.span.EndKey) != 0 { + return keyBound{key: c.span.EndKey} + } + return keyBound{key: c.span.Key, inc: true} +} + +type leafNode struct { + max keyBound + count int16 + leaf bool + cmds [maxCmds]*cmd +} + +func newLeafNode() *node { + return (*node)(unsafe.Pointer(&leafNode{leaf: true})) +} + +type node struct { + leafNode + children [maxCmds + 1]*node +} + +func (n *node) insertAt(index int, c *cmd, nd *node) { + if index < int(n.count) { + copy(n.cmds[index+1:n.count+1], n.cmds[index:n.count]) + if !n.leaf { + copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1]) + } + } + n.cmds[index] = c + if !n.leaf { + n.children[index+1] = nd + } + n.count++ +} + +func (n *node) pushBack(c *cmd, nd *node) { + n.cmds[n.count] = c + if !n.leaf { + n.children[n.count+1] = nd + } + n.count++ +} + +func (n *node) pushFront(c *cmd, nd *node) { + if !n.leaf { + copy(n.children[1:n.count+2], n.children[:n.count+1]) + n.children[0] = 
nd + } + copy(n.cmds[1:n.count+1], n.cmds[:n.count]) + n.cmds[0] = c + n.count++ +} + +// removeAt removes a value at a given index, pulling all subsequent values +// back. +func (n *node) removeAt(index int) (*cmd, *node) { + var child *node + if !n.leaf { + child = n.children[index+1] + copy(n.children[index+1:n.count], n.children[index+2:n.count+1]) + n.children[n.count] = nil + } + n.count-- + out := n.cmds[index] + copy(n.cmds[index:n.count], n.cmds[index+1:n.count+1]) + n.cmds[n.count] = nil + return out, child +} + +// popBack removes and returns the last element in the list. +func (n *node) popBack() (*cmd, *node) { + n.count-- + out := n.cmds[n.count] + n.cmds[n.count] = nil + if n.leaf { + return out, nil + } + child := n.children[n.count+1] + n.children[n.count+1] = nil + return out, child +} + +// popFront removes and returns the first element in the list. +func (n *node) popFront() (*cmd, *node) { + n.count-- + var child *node + if !n.leaf { + child = n.children[0] + copy(n.children[:n.count+1], n.children[1:n.count+2]) + n.children[n.count+1] = nil + } + out := n.cmds[0] + copy(n.cmds[:n.count], n.cmds[1:n.count+1]) + n.cmds[n.count] = nil + return out, child +} + +// find returns the index where the given cmd should be inserted into this +// list. 'found' is true if the cmd already exists in the list at the given +// index. +func (n *node) find(c *cmd) (index int, found bool) { + // Logic copied from sort.Search. Inlining this gave + // an 11% speedup on BenchmarkBTreeDeleteInsert. + i, j := 0, int(n.count) + for i < j { + h := int(uint(i+j) >> 1) // avoid overflow when computing h + // i ≤ h < j + v := cmp(c, n.cmds[h]) + if v == 0 { + return h, true + } else if v > 0 { + i = h + 1 + } else { + j = h + } + } + return i, false +} + +// split splits the given node at the given index. The current node shrinks, +// and this function returns the cmd that existed at that index and a new node +// containing all cmds/children after it. 
+// +// Before: +// +// +-----------+ +// | x y z | +// +--/-/-\-\--+ +// +// After: +// +// +-----------+ +// | y | +// +----/-\----+ +// / \ +// v v +// +-----------+ +-----------+ +// | x | | z | +// +-----------+ +-----------+ +// +func (n *node) split(i int) (*cmd, *node) { + out := n.cmds[i] + var next *node + if n.leaf { + next = newLeafNode() + } else { + next = &node{} + } + next.count = n.count - int16(i+1) + copy(next.cmds[:], n.cmds[i+1:n.count]) + for j := int16(i); j < n.count; j++ { + n.cmds[j] = nil + } + if !n.leaf { + copy(next.children[:], n.children[i+1:n.count+1]) + for j := int16(i + 1); j <= n.count; j++ { + n.children[j] = nil + } + } + n.count = int16(i) + + next.max = next.findUpperBound() + if n.max.compare(next.max) != 0 && n.max.compare(upperBound(out)) != 0 { + // If upper bound wasn't from new node or cmd + // at index i, it must still be from old node. + } else { + n.max = n.findUpperBound() + } + return out, next +} + +// insert inserts a cmd into the subtree rooted at this node, making sure no +// nodes in the subtree exceed maxCmds cmds. Returns true if an existing cmd was +// replaced and false if a command was inserted. Also returns whether the node's +// upper bound changes. 
+func (n *node) insert(c *cmd) (replaced, newBound bool) { + i, found := n.find(c) + if found { + n.cmds[i] = c + return true, false + } + if n.leaf { + n.insertAt(i, c, nil) + return false, n.adjustUpperBoundOnInsertion(c, nil) + } + if n.children[i].count >= maxCmds { + splitcmd, splitNode := n.children[i].split(maxCmds / 2) + n.insertAt(i, splitcmd, splitNode) + + switch cmp := cmp(c, n.cmds[i]); { + case cmp < 0: + // no change, we want first split node + case cmp > 0: + i++ // we want second split node + default: + n.cmds[i] = c + return true, false + } + } + replaced, newBound = n.children[i].insert(c) + if newBound { + newBound = n.adjustUpperBoundOnInsertion(c, nil) + } + return replaced, newBound +} + +// removeMax removes and returns the maximum cmd from the subtree rooted at +// this node. +func (n *node) removeMax() *cmd { + if n.leaf { + n.count-- + out := n.cmds[n.count] + n.cmds[n.count] = nil + n.adjustUpperBoundOnRemoval(out, nil) + return out + } + child := n.children[n.count] + if child.count <= minCmds { + n.rebalanceOrMerge(int(n.count)) + return n.removeMax() + } + return child.removeMax() +} + +// remove removes a cmd from the subtree rooted at this node. Returns +// the cmd that was removed or nil if no matching command was found. +// Also returns whether the node's upper bound changes. +func (n *node) remove(c *cmd) (out *cmd, newBound bool) { + i, found := n.find(c) + if n.leaf { + if found { + out, _ = n.removeAt(i) + return out, n.adjustUpperBoundOnRemoval(out, nil) + } + return nil, false + } + child := n.children[i] + if child.count <= minCmds { + // Child not large enough to remove from. + n.rebalanceOrMerge(i) + return n.remove(c) + } + if found { + // Replace the cmd being removed with the max cmd in our left child. + out = n.cmds[i] + n.cmds[i] = child.removeMax() + return out, n.adjustUpperBoundOnRemoval(out, nil) + } + // Cmd is not in this node and child is large enough to remove from. 
+ out, newBound = child.remove(c) + if newBound { + newBound = n.adjustUpperBoundOnRemoval(out, nil) + } + return out, newBound +} + +// rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove +// a cmd from it while keeping it at or above minCmds. +func (n *node) rebalanceOrMerge(i int) { + switch { + case i > 0 && n.children[i-1].count > minCmds: + // Rebalance from left sibling. + // + // +-----------+ + // | y | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | x | | | + // +----------\+ +-----------+ + // \ + // v + // a + // + // After: + // + // +-----------+ + // | x | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | | | y | + // +-----------+ +/----------+ + // / + // v + // a + // + left := n.children[i-1] + child := n.children[i] + xCmd, grandChild := left.popBack() + yCmd := n.cmds[i-1] + child.pushFront(yCmd, grandChild) + n.cmds[i-1] = xCmd + + left.adjustUpperBoundOnRemoval(xCmd, grandChild) + child.adjustUpperBoundOnInsertion(yCmd, grandChild) + + case i < int(n.count) && n.children[i+1].count > minCmds: + // Rebalance from right sibling. + // + // +-----------+ + // | y | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | | | x | + // +-----------+ +/----------+ + // / + // v + // a + // + // After: + // + // +-----------+ + // | x | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | y | | | + // +----------\+ +-----------+ + // \ + // v + // a + // + right := n.children[i+1] + child := n.children[i] + xCmd, grandChild := right.popFront() + yCmd := n.cmds[i] + child.pushBack(yCmd, grandChild) + n.cmds[i] = xCmd + + right.adjustUpperBoundOnRemoval(xCmd, grandChild) + child.adjustUpperBoundOnInsertion(yCmd, grandChild) + + default: + // Merge with either the left or right sibling. 
+ // + // +-----------+ + // | u y v | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | x | | z | + // +-----------+ +-----------+ + // + // After: + // + // +-----------+ + // | u v | + // +-----|-----+ + // | + // v + // +-----------+ + // | x y z | + // +-----------+ + // + if i >= int(n.count) { + i = int(n.count - 1) + } + child := n.children[i] + mergeCmd, mergeChild := n.removeAt(i) + child.cmds[child.count] = mergeCmd + copy(child.cmds[child.count+1:], mergeChild.cmds[:mergeChild.count]) + if !child.leaf { + copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1]) + } + child.count += mergeChild.count + 1 + + child.adjustUpperBoundOnInsertion(mergeCmd, mergeChild) + } +} + +// findUpperBound returns the largest end key node range, assuming that its +// children have correct upper bounds already set. +func (n *node) findUpperBound() keyBound { + var max keyBound + for i := int16(0); i < n.count; i++ { + up := upperBound(n.cmds[i]) + if max.compare(up) < 0 { + max = up + } + } + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + up := n.children[i].max + if max.compare(up) < 0 { + max = up + } + } + } + return max +} + +// adjustUpperBoundOnInsertion adjusts the upper key bound for this node +// given a cmd and an optional child node that was inserted. Returns true +// is the upper bound was changed and false if not. +func (n *node) adjustUpperBoundOnInsertion(c *cmd, child *node) bool { + up := upperBound(c) + if child != nil { + if up.compare(child.max) < 0 { + up = child.max + } + } + if n.max.compare(up) < 0 { + n.max = up + return true + } + return false +} + +// adjustUpperBoundOnRemoval adjusts the upper key bound for this node +// given a cmd and an optional child node that were removed. Returns true +// is the upper bound was changed and false if not. 
+func (n *node) adjustUpperBoundOnRemoval(c *cmd, child *node) bool { + up := upperBound(c) + if child != nil { + if up.compare(child.max) < 0 { + up = child.max + } + } + if n.max.compare(up) == 0 { + n.max = n.findUpperBound() + return true + } + return false +} + +// btree is an implementation of an augmented interval B-Tree. +// +// btree stores cmds in an ordered structure, allowing easy insertion, +// removal, and iteration. It represents intervals and permits an interval +// search operation following the approach laid out in CLRS, Chapter 14. +// The B-Tree stores cmds in order based on their start key and each B-Tree +// node maintains the upper-bound end key of all cmds in its subtree. +// +// Write operations are not safe for concurrent mutation by multiple +// goroutines, but Read operations are. +type btree struct { + root *node + length int +} + +// Reset removes all cmds from the btree. +func (t *btree) Reset() { + t.root = nil + t.length = 0 +} + +// Silent unused warning. +var _ = (*btree).Reset + +// Delete removes a cmd equal to the passed in cmd from the tree. +func (t *btree) Delete(c *cmd) { + if t.root == nil || t.root.count == 0 { + return + } + if out, _ := t.root.remove(c); out != nil { + t.length-- + } + if t.root.count == 0 && !t.root.leaf { + t.root = t.root.children[0] + } +} + +// Set adds the given cmd to the tree. If a cmd in the tree already equals +// the given one, it is replaced with the new cmd. +func (t *btree) Set(c *cmd) { + if t.root == nil { + t.root = newLeafNode() + } else if t.root.count >= maxCmds { + splitcmd, splitNode := t.root.split(maxCmds / 2) + newRoot := &node{} + newRoot.count = 1 + newRoot.cmds[0] = splitcmd + newRoot.children[0] = t.root + newRoot.children[1] = splitNode + newRoot.max = newRoot.findUpperBound() + t.root = newRoot + } + if replaced, _ := t.root.insert(c); !replaced { + t.length++ + } +} + +// MakeIter returns a new iterator object. 
It is not safe to continue using an +// iterator after modifications are made to the tree. If modifications are made, +// create a new iterator. +func (t *btree) MakeIter() iterator { + return iterator{r: t.root, pos: -1} +} + +// Height returns the height of the tree. +func (t *btree) Height() int { + if t.root == nil { + return 0 + } + h := 1 + n := t.root + for !n.leaf { + n = n.children[0] + h++ + } + return h +} + +// Len returns the number of cmds currently in the tree. +func (t *btree) Len() int { + return t.length +} + +// String returns a string description of the tree. The format is +// similar to the https://en.wikipedia.org/wiki/Newick_format. +func (t *btree) String() string { + if t.length == 0 { + return ";" + } + var b strings.Builder + t.root.writeString(&b) + return b.String() +} + +func (n *node) writeString(b *strings.Builder) { + if n.leaf { + for i := int16(0); i < n.count; i++ { + if i != 0 { + b.WriteString(",") + } + b.WriteString(n.cmds[i].span.String()) + } + return + } + for i := int16(0); i <= n.count; i++ { + b.WriteString("(") + n.children[i].writeString(b) + b.WriteString(")") + if i < n.count { + b.WriteString(n.cmds[i].span.String()) + } + } +} + +// iterStack represents a stack of (node, pos) tuples, which captures +// iteration state as an iterator descends a btree. +type iterStack struct { + a iterStackArr + aLen int16 // -1 when using s + s []iterFrame +} + +// Used to avoid allocations for stacks below a certain size. 
+type iterStackArr [3]iterFrame + +type iterFrame struct { + n *node + pos int16 +} + +func (is *iterStack) push(f iterFrame) { + if is.aLen == -1 { + is.s = append(is.s, f) + } else if int(is.aLen) == len(is.a) { + is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen)) + copy(is.s, is.a[:]) + is.s[int(is.aLen)] = f + is.aLen = -1 + } else { + is.a[is.aLen] = f + is.aLen++ + } +} + +func (is *iterStack) pop() iterFrame { + if is.aLen == -1 { + f := is.s[len(is.s)-1] + is.s = is.s[:len(is.s)-1] + return f + } + is.aLen-- + return is.a[is.aLen] +} + +func (is *iterStack) len() int { + if is.aLen == -1 { + return len(is.s) + } + return int(is.aLen) +} + +func (is *iterStack) reset() { + if is.aLen == -1 { + is.s = is.s[:0] + } else { + is.aLen = 0 + } +} + +// iterator is responsible for search and traversal within a btree. +type iterator struct { + r *node + n *node + pos int16 + s iterStack + o overlapScan +} + +func (i *iterator) reset() { + i.n = i.r + i.pos = -1 + i.s.reset() + i.o = overlapScan{} +} + +func (i *iterator) descend(n *node, pos int16) { + i.s.push(iterFrame{n: n, pos: pos}) + i.n = n.children[pos] + i.pos = 0 +} + +// ascend ascends up to the current node's parent and resets the position +// to the one previously set for this parent node. +func (i *iterator) ascend() { + f := i.s.pop() + i.n = f.n + i.pos = f.pos +} + +// SeekGE seeks to the first cmd greater-than or equal to the provided cmd. +func (i *iterator) SeekGE(c *cmd) { + i.reset() + if i.n == nil { + return + } + for { + pos, found := i.n.find(c) + i.pos = int16(pos) + if found { + return + } + if i.n.leaf { + if i.pos == i.n.count { + i.Next() + } + return + } + i.descend(i.n, i.pos) + } +} + +// SeekLT seeks to the first cmd less-than the provided cmd. 
+func (i *iterator) SeekLT(c *cmd) { + i.reset() + if i.n == nil { + return + } + for { + pos, found := i.n.find(c) + i.pos = int16(pos) + if found || i.n.leaf { + i.Prev() + return + } + i.descend(i.n, i.pos) + } +} + +// First seeks to the first cmd in the btree. +func (i *iterator) First() { + i.reset() + if i.n == nil { + return + } + for !i.n.leaf { + i.descend(i.n, 0) + } + i.pos = 0 +} + +// Last seeks to the last cmd in the btree. +func (i *iterator) Last() { + i.reset() + if i.n == nil { + return + } + for !i.n.leaf { + i.descend(i.n, i.n.count) + } + i.pos = i.n.count - 1 +} + +// Next positions the iterator to the cmd immediately following +// its current position. +func (i *iterator) Next() { + if i.n == nil { + return + } + + if i.n.leaf { + i.pos++ + if i.pos < i.n.count { + return + } + for i.s.len() > 0 && i.pos >= i.n.count { + i.ascend() + } + return + } + + i.descend(i.n, i.pos+1) + for !i.n.leaf { + i.descend(i.n, 0) + } + i.pos = 0 +} + +// Prev positions the iterator to the cmd immediately preceding +// its current position. +func (i *iterator) Prev() { + if i.n == nil { + return + } + + if i.n.leaf { + i.pos-- + if i.pos >= 0 { + return + } + for i.s.len() > 0 && i.pos < 0 { + i.ascend() + i.pos-- + } + return + } + + i.descend(i.n, i.pos) + for !i.n.leaf { + i.descend(i.n, i.n.count) + } + i.pos = i.n.count - 1 +} + +// Valid returns whether the iterator is positioned at a valid position. +func (i *iterator) Valid() bool { + return i.pos >= 0 && i.pos < i.n.count +} + +// Cmd returns the cmd at the iterator's current position. It is illegal +// to call Cmd if the iterator is not valid. +func (i *iterator) Cmd() *cmd { + return i.n.cmds[i.pos] +} + +// An overlap scan is a scan over all cmds that overlap with the provided cmd +// in order of the overlapping cmds' start keys. The goal of the scan is to +// minimize the number of key comparisons performed in total. 
The algorithm +// operates based on the following two invariants maintained by augmented +// interval btree: +// 1. all cmds are sorted in the btree based on their start key. +// 2. all btree nodes maintain the upper bound end key of all cmds +// in their subtree. +// +// The scan algorithm starts in "unconstrained minimum" and "unconstrained +// maximum" states. To enter a "constrained minimum" state, the scan must reach +// cmds in the tree with start keys above the search range's start key. Because +// cmds in the tree are sorted by start key, once the scan enters the +// "constrained minimum" state it will remain there. To enter a "constrained +// maximum" state, the scan must determine the first child btree node in a given +// subtree that can have cmds with start keys above the search range's end key. +// The scan then remains in the "constrained maximum" state until it traverses +// into this child node, at which point it moves to the "unconstrained maximum" +// state again. +// +// The scan algorithm works like a standard btree forward scan with the +// following augmentations: +// 1. before traversing the tree, the scan performs a binary search on the +// root node's items to determine a "soft" lower-bound constraint position +// and a "hard" upper-bound constraint position in the root's children. +// 2. when traversing into a child node in the lower or upper bound constraint +// position, the constraint is refined by searching the child's items. +// 3. the initial traversal down the tree follows the left-most children +// whose upper bound end keys are equal to or greater than the start key +// of the search range. The children followed will be equal to or less +// than the soft lower bound constraint. +// 4. once the initial traversal completes and the scan is in the left-most +// btree node whose upper bound overlaps the search range, key comparisons +// must be performed with each cmd in the tree. 
This is necessary because +// any of these cmds may have end keys that cause them to overlap with the +// search range. +// 5. once the scan reaches the lower bound constraint position (the first cmd +// with a start key equal to or greater than the search range's start key), +// it can begin scanning without performing key comparisons. This is allowed +// because all commands from this point forward will have end keys that are +// greater than the search range's start key. +// 6. once the scan reaches the upper bound constraint position, it terminates. +// It does so because the cmd at this position is the first cmd with a start +// key larger than the search range's end key. +type overlapScan struct { + c *cmd // search cmd + + // The "soft" lower-bound constraint. + constrMinN *node + constrMinPos int16 + constrMinReached bool + + // The "hard" upper-bound constraint. + constrMaxN *node + constrMaxPos int16 +} + +// FirstOverlap seeks to the first cmd in the btree that overlaps with the +// provided search cmd. +func (i *iterator) FirstOverlap(c *cmd) { + i.reset() + if i.n == nil { + return + } + i.pos = 0 + i.o = overlapScan{c: c} + i.constrainMinSearchBounds() + i.constrainMaxSearchBounds() + i.findNextOverlap() +} + +// NextOverlap positions the iterator to the cmd immediately following +// its current position that overlaps with the search cmd. +func (i *iterator) NextOverlap() { + if i.n == nil { + return + } + if i.o.c == nil { + // Invalid. Mixed overlap scan with non-overlap scan. 
+ i.pos = i.n.count + return + } + i.pos++ + i.findNextOverlap() +} + +func (i *iterator) constrainMinSearchBounds() { + k := i.o.c.span.Key + j := sort.Search(int(i.n.count), func(j int) bool { + return bytes.Compare(k, i.n.cmds[j].span.Key) <= 0 + }) + i.o.constrMinN = i.n + i.o.constrMinPos = int16(j) +} + +func (i *iterator) constrainMaxSearchBounds() { + up := upperBound(i.o.c) + j := sort.Search(int(i.n.count), func(j int) bool { + return !up.contains(i.n.cmds[j]) + }) + i.o.constrMaxN = i.n + i.o.constrMaxPos = int16(j) +} + +func (i *iterator) findNextOverlap() { + for { + if i.pos > i.n.count { + // Iterate up tree. + i.ascend() + } else if !i.n.leaf { + // Iterate down tree. + if i.o.constrMinReached || i.n.children[i.pos].max.contains(i.o.c) { + par := i.n + pos := i.pos + i.descend(par, pos) + + // Refine the constraint bounds, if necessary. + if par == i.o.constrMinN && pos == i.o.constrMinPos { + i.constrainMinSearchBounds() + } + if par == i.o.constrMaxN && pos == i.o.constrMaxPos { + i.constrainMaxSearchBounds() + } + continue + } + } + + // Check search bounds. + if i.n == i.o.constrMaxN && i.pos == i.o.constrMaxPos { + // Invalid. Past possible overlaps. + i.pos = i.n.count + return + } + if i.n == i.o.constrMinN && i.pos == i.o.constrMinPos { + // The scan reached the soft lower-bound constraint. + i.o.constrMinReached = true + } + + // Iterate across node. + if i.pos < i.n.count { + // Check for overlapping cmd. + if i.o.constrMinReached { + // Fast-path to avoid span comparison. i.o.constrMinReached + // tells us that all cmds have end keys above our search + // span's start key. + return + } + if upperBound(i.n.cmds[i.pos]).contains(i.o.c) { + return + } + } + i.pos++ + } +} diff --git a/pkg/storage/cmdq/interval_btree_test.go b/pkg/storage/cmdq/interval_btree_test.go new file mode 100644 index 000000000000..693787c5d86c --- /dev/null +++ b/pkg/storage/cmdq/interval_btree_test.go @@ -0,0 +1,805 @@ +// Copyright 2018 The Cockroach Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package cmdq + +import ( + "fmt" + "math/rand" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" +) + +////////////////////////////////////////// +// Invariant verification // +////////////////////////////////////////// + +// Verify asserts that the tree's structural invariants all hold. +func (t *btree) Verify(tt *testing.T) { + if t.root == nil { + return + } + t.verifyLeafSameDepth(tt) + t.verifyCountAllowed(tt) + t.isSorted(tt) + t.isUpperBoundCorrect(tt) +} + +func (t *btree) verifyLeafSameDepth(tt *testing.T) { + h := t.Height() + t.root.verifyDepthEqualToHeight(tt, 1, h) +} + +func (n *node) verifyDepthEqualToHeight(t *testing.T, depth, height int) { + if n.leaf { + require.Equal(t, height, depth, "all leaves should have the same depth as the tree height") + } + n.recurse(func(child *node, _ int16) { + child.verifyDepthEqualToHeight(t, depth+1, height) + }) +} + +func (t *btree) verifyCountAllowed(tt *testing.T) { + t.root.verifyCountAllowed(tt, true) +} + +func (n *node) verifyCountAllowed(t *testing.T, root bool) { + if !root { + require.True(t, n.count >= minCmds, "cmd count %d must be in range [%d,%d]", n.count, minCmds, maxCmds) + require.True(t, n.count <= maxCmds, "cmd count %d must be in range [%d,%d]", n.count, minCmds, maxCmds) + } + for i, cmd := range n.cmds { + if i < 
int(n.count) { + require.NotNil(t, cmd, "cmd below count") + } else { + require.Nil(t, cmd, "cmd above count") + } + } + if !n.leaf { + for i, child := range n.children { + if i <= int(n.count) { + require.NotNil(t, child, "node below count") + } else { + require.Nil(t, child, "node above count") + } + } + } + n.recurse(func(child *node, _ int16) { + child.verifyCountAllowed(t, false) + }) +} + +func (t *btree) isSorted(tt *testing.T) { + t.root.isSorted(tt) +} + +func (n *node) isSorted(t *testing.T) { + for i := int16(1); i < n.count; i++ { + require.True(t, cmp(n.cmds[i-1], n.cmds[i]) <= 0) + } + if !n.leaf { + for i := int16(0); i < n.count; i++ { + prev := n.children[i] + next := n.children[i+1] + + require.True(t, cmp(prev.cmds[prev.count-1], n.cmds[i]) <= 0) + require.True(t, cmp(n.cmds[i], next.cmds[0]) <= 0) + } + } + n.recurse(func(child *node, _ int16) { + child.isSorted(t) + }) +} + +func (t *btree) isUpperBoundCorrect(tt *testing.T) { + t.root.isUpperBoundCorrect(tt) +} + +func (n *node) isUpperBoundCorrect(t *testing.T) { + require.Equal(t, 0, n.findUpperBound().compare(n.max)) + for i := int16(1); i < n.count; i++ { + require.True(t, upperBound(n.cmds[i]).compare(n.max) <= 0) + } + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + child := n.children[i] + require.True(t, child.max.compare(n.max) <= 0) + } + } + n.recurse(func(child *node, _ int16) { + child.isUpperBoundCorrect(t) + }) +} + +func (n *node) recurse(f func(child *node, pos int16)) { + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + f(n.children[i], i) + } + } +} + +////////////////////////////////////////// +// Unit Tests // +////////////////////////////////////////// + +func key(i int) roachpb.Key { + if i < 0 || i > 99999 { + panic("key out of bounds") + } + return []byte(fmt.Sprintf("%05d", i)) +} + +func span(i int) roachpb.Span { + switch i % 10 { + case 0: + return roachpb.Span{Key: key(i)} + case 1: + return roachpb.Span{Key: key(i), EndKey: key(i).Next()} + case 2: 
+ return roachpb.Span{Key: key(i), EndKey: key(i + 64)} + default: + return roachpb.Span{Key: key(i), EndKey: key(i + 4)} + } +} + +func spanWithEnd(start, end int) roachpb.Span { + if start < end { + return roachpb.Span{Key: key(start), EndKey: key(end)} + } else if start == end { + return roachpb.Span{Key: key(start)} + } else { + panic("illegal span") + } +} + +func randomSpan(rng *rand.Rand, n int) roachpb.Span { + start := rng.Intn(n) + end := rng.Intn(n + 1) + if end < start { + start, end = end, start + } + return spanWithEnd(start, end) +} + +func newCmd(s roachpb.Span) *cmd { + return &cmd{span: s} +} + +func checkIter(t *testing.T, it iterator, start, end int) { + i := start + for it.First(); it.Valid(); it.Next() { + cmd := it.Cmd() + expected := span(i) + if !expected.Equal(cmd.span) { + t.Fatalf("expected %s, but found %s", expected, cmd.span) + } + i++ + } + if i != end { + t.Fatalf("expected %d, but at %d", end, i) + } + + for it.Last(); it.Valid(); it.Prev() { + i-- + cmd := it.Cmd() + expected := span(i) + if !expected.Equal(cmd.span) { + t.Fatalf("expected %s, but found %s", expected, cmd.span) + } + } + if i != start { + t.Fatalf("expected %d, but at %d: %+v", start, i, it) + } + + all := newCmd(spanWithEnd(start, end)) + for it.FirstOverlap(all); it.Valid(); it.NextOverlap() { + cmd := it.Cmd() + expected := span(i) + if !expected.Equal(cmd.span) { + t.Fatalf("expected %s, but found %s", expected, cmd.span) + } + i++ + } + if i != end { + t.Fatalf("expected %d, but at %d", end, i) + } +} + +func TestBTree(t *testing.T) { + var tr btree + + // With degree == 16 (max-items/node == 31) we need 513 items in order for + // there to be 3 levels in the tree. The count here is comfortably above + // that. + const count = 768 + + // Add keys in sorted order. 
+ for i := 0; i < count; i++ { + tr.Set(newCmd(span(i))) + tr.Verify(t) + if e := i + 1; e != tr.Len() { + t.Fatalf("expected length %d, but found %d", e, tr.Len()) + } + checkIter(t, tr.MakeIter(), 0, i+1) + } + + // Delete keys in sorted order. + for i := 0; i < count; i++ { + tr.Delete(newCmd(span(i))) + tr.Verify(t) + if e := count - (i + 1); e != tr.Len() { + t.Fatalf("expected length %d, but found %d", e, tr.Len()) + } + checkIter(t, tr.MakeIter(), i+1, count) + } + + // Add keys in reverse sorted order. + for i := 0; i < count; i++ { + tr.Set(newCmd(span(count - i))) + tr.Verify(t) + if e := i + 1; e != tr.Len() { + t.Fatalf("expected length %d, but found %d", e, tr.Len()) + } + checkIter(t, tr.MakeIter(), count-i, count+1) + } + + // Delete keys in reverse sorted order. + for i := 0; i < count; i++ { + tr.Delete(newCmd(span(count - i))) + tr.Verify(t) + if e := count - (i + 1); e != tr.Len() { + t.Fatalf("expected length %d, but found %d", e, tr.Len()) + } + checkIter(t, tr.MakeIter(), 1, count-i) + } +} + +func TestBTreeSeek(t *testing.T) { + const count = 513 + + var tr btree + for i := 0; i < count; i++ { + tr.Set(newCmd(span(i * 2))) + } + + it := tr.MakeIter() + for i := 0; i < 2*count-1; i++ { + it.SeekGE(newCmd(span(i))) + if !it.Valid() { + t.Fatalf("%d: expected valid iterator", i) + } + cmd := it.Cmd() + expected := span(2 * ((i + 1) / 2)) + if !expected.Equal(cmd.span) { + t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span) + } + } + it.SeekGE(newCmd(span(2*count - 1))) + if it.Valid() { + t.Fatalf("expected invalid iterator") + } + + for i := 1; i < 2*count; i++ { + it.SeekLT(newCmd(span(i))) + if !it.Valid() { + t.Fatalf("%d: expected valid iterator", i) + } + cmd := it.Cmd() + expected := span(2 * ((i - 1) / 2)) + if !expected.Equal(cmd.span) { + t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span) + } + } + it.SeekLT(newCmd(span(0))) + if it.Valid() { + t.Fatalf("expected invalid iterator") + } +} + +func 
TestBTreeSeekOverlap(t *testing.T) { + const count = 513 + const size = 2 * maxCmds + + var tr btree + for i := 0; i < count; i++ { + tr.Set(newCmd(spanWithEnd(i, i+size+1))) + } + + // Iterate over overlaps with a point scan. + it := tr.MakeIter() + for i := 0; i < count+size; i++ { + it.FirstOverlap(newCmd(spanWithEnd(i, i))) + for j := 0; j < size+1; j++ { + expStart := i - size + j + if expStart < 0 { + continue + } + if expStart >= count { + continue + } + + if !it.Valid() { + t.Fatalf("%d/%d: expected valid iterator", i, j) + } + cmd := it.Cmd() + expected := spanWithEnd(expStart, expStart+size+1) + if !expected.Equal(cmd.span) { + t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span) + } + + it.NextOverlap() + } + if it.Valid() { + t.Fatalf("%d: expected invalid iterator %v", i, it.Cmd()) + } + } + it.FirstOverlap(newCmd(span(count + size + 1))) + if it.Valid() { + t.Fatalf("expected invalid iterator") + } + + // Iterate over overlaps with a range scan. + it = tr.MakeIter() + for i := 0; i < count+size; i++ { + it.FirstOverlap(newCmd(spanWithEnd(i, i+size+1))) + for j := 0; j < 2*size+1; j++ { + expStart := i - size + j + if expStart < 0 { + continue + } + if expStart >= count { + continue + } + + if !it.Valid() { + t.Fatalf("%d/%d: expected valid iterator", i, j) + } + cmd := it.Cmd() + expected := spanWithEnd(expStart, expStart+size+1) + if !expected.Equal(cmd.span) { + t.Fatalf("%d: expected %s, but found %s", i, expected, cmd.span) + } + + it.NextOverlap() + } + if it.Valid() { + t.Fatalf("%d: expected invalid iterator %v", i, it.Cmd()) + } + } + it.FirstOverlap(newCmd(span(count + size + 1))) + if it.Valid() { + t.Fatalf("expected invalid iterator") + } +} + +func TestBTreeSeekOverlapRandom(t *testing.T) { + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + + const trials = 10 + for i := 0; i < trials; i++ { + var tr btree + + const count = 1000 + cmds := make([]*cmd, count) + cmdSpans := make([]int, count) + for j := 0; j < count; 
j++ { + var cmd *cmd + end := rng.Intn(count + 10) + if end <= j { + end = j + cmd = newCmd(spanWithEnd(j, end)) + } else { + cmd = newCmd(spanWithEnd(j, end+1)) + } + tr.Set(cmd) + cmds[j] = cmd + cmdSpans[j] = end + } + + const scanTrials = 100 + for j := 0; j < scanTrials; j++ { + var scanCmd *cmd + scanStart := rng.Intn(count) + scanEnd := rng.Intn(count + 10) + if scanEnd <= scanStart { + scanEnd = scanStart + scanCmd = newCmd(spanWithEnd(scanStart, scanEnd)) + } else { + scanCmd = newCmd(spanWithEnd(scanStart, scanEnd+1)) + } + + var exp, found []*cmd + for startKey, endKey := range cmdSpans { + if startKey <= scanEnd && endKey >= scanStart { + exp = append(exp, cmds[startKey]) + } + } + + it := tr.MakeIter() + it.FirstOverlap(scanCmd) + for it.Valid() { + found = append(found, it.Cmd()) + it.NextOverlap() + } + + require.Equal(t, len(exp), len(found), "search for %v", scanCmd.span) + } + } +} + +func TestBTreeCmp(t *testing.T) { + testCases := []struct { + spanA, spanB roachpb.Span + idA, idB int64 + exp int + }{ + { + spanA: roachpb.Span{Key: roachpb.Key("a")}, + spanB: roachpb.Span{Key: roachpb.Key("a")}, + idA: 1, + idB: 1, + exp: 0, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("a")}, + spanB: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("b")}, + idA: 1, + idB: 1, + exp: -1, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("c")}, + spanB: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("b")}, + idA: 1, + idB: 1, + exp: 1, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("c")}, + spanB: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("c")}, + idA: 1, + idB: 1, + exp: 0, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("a")}, + spanB: roachpb.Span{Key: roachpb.Key("a")}, + idA: 1, + idB: 2, + exp: -1, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("a")}, + spanB: roachpb.Span{Key: roachpb.Key("a")}, + idA: 2, + idB: 1, + exp: 1, + }, + { + spanA: roachpb.Span{Key: 
roachpb.Key("b")}, + spanB: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("c")}, + idA: 1, + idB: 1, + exp: 1, + }, + { + spanA: roachpb.Span{Key: roachpb.Key("b"), EndKey: roachpb.Key("e")}, + spanB: roachpb.Span{Key: roachpb.Key("c"), EndKey: roachpb.Key("d")}, + idA: 1, + idB: 1, + exp: -1, + }, + } + for _, tc := range testCases { + name := fmt.Sprintf("cmp(%s:%d,%s:%d)", tc.spanA, tc.idA, tc.spanB, tc.idB) + t.Run(name, func(t *testing.T) { + cmdA := &cmd{id: tc.idA, span: tc.spanA} + cmdB := &cmd{id: tc.idB, span: tc.spanB} + require.Equal(t, tc.exp, cmp(cmdA, cmdB)) + }) + } +} + +func TestIterStack(t *testing.T) { + f := func(i int) iterFrame { return iterFrame{pos: int16(i)} } + var is iterStack + for i := 1; i <= 2*len(iterStackArr{}); i++ { + var j int + for j = 0; j < i; j++ { + is.push(f(j)) + } + require.Equal(t, j, is.len()) + for j--; j >= 0; j-- { + require.Equal(t, f(j), is.pop()) + } + is.reset() + } +} + +////////////////////////////////////////// +// Benchmarks // +////////////////////////////////////////// + +// perm returns a random permutation of cmds with spans in the range [0, n). 
+func perm(n int) (out []*cmd) { + for _, i := range rand.Perm(n) { + out = append(out, newCmd(spanWithEnd(i, i+1))) + } + return out +} + +func forBenchmarkSizes(b *testing.B, f func(b *testing.B, count int)) { + for _, count := range []int{16, 128, 1024, 8192, 65536} { + b.Run(fmt.Sprintf("count=%d", count), func(b *testing.B) { + f(b, count) + }) + } +} + +func BenchmarkBTreeInsert(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(count) + b.ResetTimer() + for i := 0; i < b.N; { + var tr btree + for _, cmd := range insertP { + tr.Set(cmd) + i++ + if i >= b.N { + return + } + } + } + }) +} + +func BenchmarkBTreeDelete(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP, removeP := perm(count), perm(count) + b.ResetTimer() + for i := 0; i < b.N; { + b.StopTimer() + var tr btree + for _, cmd := range insertP { + tr.Set(cmd) + } + b.StartTimer() + for _, cmd := range removeP { + tr.Delete(cmd) + i++ + if i >= b.N { + return + } + } + if tr.Len() > 0 { + b.Fatalf("tree not empty: %s", &tr) + } + } + }) +} + +func BenchmarkBTreeDeleteInsert(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(count) + var tr btree + for _, cmd := range insertP { + tr.Set(cmd) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + cmd := insertP[i%count] + tr.Delete(cmd) + tr.Set(cmd) + } + }) +} + +func BenchmarkBTreeMakeIter(b *testing.B) { + var tr btree + for i := 0; i < b.N; i++ { + it := tr.MakeIter() + it.First() + } +} + +func BenchmarkBTreeIterSeekGE(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + var spans []roachpb.Span + var tr btree + + for i := 0; i < count; i++ { + s := span(i) + spans = append(spans, s) + tr.Set(newCmd(s)) + } + + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + it := tr.MakeIter() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + s := spans[rng.Intn(len(spans))] + it.SeekGE(newCmd(s)) + if testing.Verbose() { + if 
!it.Valid() { + b.Fatal("expected to find key") + } + if !s.Equal(it.Cmd().span) { + b.Fatalf("expected %s, but found %s", s, it.Cmd().span) + } + } + } + }) +} + +func BenchmarkBTreeIterSeekLT(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + var spans []roachpb.Span + var tr btree + + for i := 0; i < count; i++ { + s := span(i) + spans = append(spans, s) + tr.Set(newCmd(s)) + } + + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + it := tr.MakeIter() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + j := rng.Intn(len(spans)) + s := spans[j] + it.SeekLT(newCmd(s)) + if testing.Verbose() { + if j == 0 { + if it.Valid() { + b.Fatal("unexpected key") + } + } else { + if !it.Valid() { + b.Fatal("expected to find key") + } + s := spans[j-1] + if !s.Equal(it.Cmd().span) { + b.Fatalf("expected %s, but found %s", s, it.Cmd().span) + } + } + } + } + }) +} + +func BenchmarkBTreeIterFirstOverlap(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + var spans []roachpb.Span + var cmds []*cmd + var tr btree + + for i := 0; i < count; i++ { + s := spanWithEnd(i, i+1) + spans = append(spans, s) + cmd := newCmd(s) + cmds = append(cmds, cmd) + tr.Set(cmd) + } + + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + it := tr.MakeIter() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + j := rng.Intn(len(spans)) + s := spans[j] + cmd := cmds[j] + it.FirstOverlap(cmd) + if testing.Verbose() { + if !it.Valid() { + b.Fatal("expected to find key") + } + if !s.Equal(it.Cmd().span) { + b.Fatalf("expected %s, but found %s", s, it.Cmd().span) + } + } + } + }) +} + +func BenchmarkBTreeIterNext(b *testing.B) { + var tr btree + + const count = 8 << 10 + const size = 2 * maxCmds + for i := 0; i < count; i++ { + cmd := newCmd(spanWithEnd(i, i+size+1)) + tr.Set(cmd) + } + + it := tr.MakeIter() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !it.Valid() { + it.First() + } + it.Next() + } +} + +func BenchmarkBTreeIterPrev(b *testing.B) 
{ + var tr btree + + const count = 8 << 10 + const size = 2 * maxCmds + for i := 0; i < count; i++ { + cmd := newCmd(spanWithEnd(i, i+size+1)) + tr.Set(cmd) + } + + it := tr.MakeIter() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !it.Valid() { + it.Last() // Prev walks backwards, so restart from the last entry + } + it.Prev() + } +} + +func BenchmarkBTreeIterNextOverlap(b *testing.B) { + var tr btree + + const count = 8 << 10 + const size = 2 * maxCmds + for i := 0; i < count; i++ { + cmd := newCmd(spanWithEnd(i, i+size+1)) + tr.Set(cmd) + } + + allCmd := newCmd(spanWithEnd(0, count+1)) + it := tr.MakeIter() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !it.Valid() { + it.FirstOverlap(allCmd) + } + it.NextOverlap() + } +} + +func BenchmarkBTreeIterOverlapScan(b *testing.B) { + var tr btree + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + + const count = 8 << 10 + const size = 2 * maxCmds + for i := 0; i < count; i++ { + tr.Set(newCmd(spanWithEnd(i, i+size+1))) + } + + cmd := new(cmd) + b.ResetTimer() + for i := 0; i < b.N; i++ { + cmd.span = randomSpan(rng, count) + it := tr.MakeIter() + it.FirstOverlap(cmd) + for it.Valid() { + it.NextOverlap() + } + } +} diff --git a/pkg/util/interval/btree_based_interval.go b/pkg/util/interval/btree_based_interval.go index b47eaaee39c0..8b464c0c79ad 100644 --- a/pkg/util/interval/btree_based_interval.go +++ b/pkg/util/interval/btree_based_interval.go @@ -12,7 +12,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the License. // -// This code is based on: https://github.com/google/btree +// This code is based on: https://github.com/google/btree. package interval @@ -21,15 +21,62 @@ import ( "sort" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/syncutil" ) const ( // DefaultBTreeMinimumDegree is the default B-tree minimum degree. Benchmarks // show that the interval tree performs best with this minimum degree.
DefaultBTreeMinimumDegree = 32 + // DefaultBTreeFreeListSize is the default size of a B-tree's freelist. + DefaultBTreeFreeListSize = 32 ) -var _ = newBTree +var ( + nilItems = make(items, 16) + nilChildren = make(children, 16) +) + +// FreeList represents a free list of btree nodes. By default each +// BTree has its own FreeList, but multiple BTrees can share the same +// FreeList. +// Two Btrees using the same freelist are safe for concurrent write access. +type FreeList struct { + mu syncutil.Mutex + freelist []*node +} + +// NewFreeList creates a new free list. +// size is the maximum size of the returned free list. +func NewFreeList(size int) *FreeList { + return &FreeList{freelist: make([]*node, 0, size)} +} + +func (f *FreeList) newNode() (n *node) { + f.mu.Lock() + index := len(f.freelist) - 1 + if index < 0 { + f.mu.Unlock() + return new(node) + } + n = f.freelist[index] + f.freelist[index] = nil + f.freelist = f.freelist[:index] + f.mu.Unlock() + return +} + +// freeNode adds the given node to the list, returning true if it was added +// and false if it was discarded. +func (f *FreeList) freeNode(n *node) (out bool) { + f.mu.Lock() + if len(f.freelist) < cap(f.freelist) { + f.freelist = append(f.freelist, n) + out = true + } + f.mu.Unlock() + return +} // newBTree creates a new interval tree with the given overlapper function and // the default B-tree minimum degree. @@ -47,9 +94,11 @@ func newBTreeWithDegree(overlapper Overlapper, minimumDegree int) *btree { if minimumDegree < 2 { panic("bad minimum degree") } + f := NewFreeList(DefaultBTreeFreeListSize) return &btree{ - MinimumDegree: minimumDegree, - Overlapper: overlapper, + minimumDegree: minimumDegree, + overlapper: overlapper, + cow: &copyOnWriteContext{freelist: f}, } } @@ -79,8 +128,8 @@ func (s *items) insertAt(index int, e Interface) { // back.
func (s *items) removeAt(index int) Interface { e := (*s)[index] - (*s)[index] = nil copy((*s)[index:], (*s)[index+1:]) + (*s)[len(*s)-1] = nil *s = (*s)[:len(*s)-1] return e } @@ -94,6 +143,16 @@ func (s *items) pop() (out Interface) { return } +// truncate truncates this instance at index so that it contains only the +// first index items. index must be less than or equal to length. +func (s *items) truncate(index int) { + var toClear items + *s, toClear = (*s)[:index], (*s)[index:] + for len(toClear) > 0 { + toClear = toClear[copy(toClear, nilItems):] + } +} + // find returns the index where the given Interface should be inserted into this // list. 'found' is true if the interface already exists in the list at the // given index. @@ -125,8 +184,8 @@ func (s *children) insertAt(index int, n *node) { // back. func (s *children) removeAt(index int) *node { n := (*s)[index] - (*s)[index] = nil copy((*s)[index:], (*s)[index+1:]) + (*s)[len(*s)-1] = nil *s = (*s)[:len(*s)-1] return n } @@ -140,6 +199,16 @@ func (s *children) pop() (out *node) { return } +// truncate truncates this instance at index so that it contains only the +// first index children. index must be less than or equal to length. +func (s *children) truncate(index int) { + var toClear children + *s, toClear = (*s)[:index], (*s)[index:] + for len(toClear) > 0 { + toClear = toClear[copy(toClear, nilChildren):] + } +} + // node is an internal node in a tree. 
// // It must at all times maintain the invariant that either @@ -155,7 +224,35 @@ type node struct { Range Range items items children children - t *btree + cow *copyOnWriteContext +} + +func (n *node) mutableFor(cow *copyOnWriteContext) *node { + if n.cow == cow { + return n + } + out := cow.newNode() + out.Range = n.Range + if cap(out.items) >= len(n.items) { + out.items = out.items[:len(n.items)] + } else { + out.items = make(items, len(n.items), cap(n.items)) + } + copy(out.items, n.items) + // Copy children + if cap(out.children) >= len(n.children) { + out.children = out.children[:len(n.children)] + } else { + out.children = make(children, len(n.children), cap(n.children)) + } + copy(out.children, n.children) + return out +} + +func (n *node) mutableChild(i int) *node { + c := n.children[i].mutableFor(n.cow) + n.children[i] = c + return c } // split splits the given node at the given index. The current node shrinks, and @@ -179,21 +276,19 @@ type node struct { // func (n *node) split(i int, fast bool) (Interface, *node) { e := n.items[i] - second := n.t.newNode() - second.items = make(items, n.t.minItems()) - copy(second.items, n.items[i+1:]) - n.items = n.items[:i] + second := n.cow.newNode() + second.items = append(second.items, n.items[i+1:]...) + n.items.truncate(i) if len(n.children) > 0 { - second.children = make(children, n.t.minItems()+1) - copy(second.children, n.children[i+1:]) - n.children = n.children[:i+1] + second.children = append(second.children, n.children[i+1:]...) 
+ n.children.truncate(i + 1) } if !fast { // adjust range for the first split part oldRangeEnd := n.Range.End n.Range.End = n.rangeEnd() - // adjust ragne for the second split part + // adjust range for the second split part second.Range.Start = second.rangeStart() if n.Range.End.Equal(oldRangeEnd) || e.Range().End.Equal(oldRangeEnd) { second.Range.End = second.rangeEnd() @@ -206,12 +301,11 @@ func (n *node) split(i int, fast bool) (Interface, *node) { // maybeSplitChild checks if a child should be split, and if so splits it. // Returns whether or not a split occurred. -func (n *node) maybeSplitChild(i int, fast bool) bool { - maxItems := n.t.maxItems() +func (n *node) maybeSplitChild(i, maxItems int, fast bool) bool { if len(n.children[i].items) < maxItems { return false } - first := n.children[i] + first := n.mutableChild(i) e, second := first.split(maxItems/2, fast) n.items.insertAt(i, e) n.children.insertAt(i+1, second) @@ -220,7 +314,7 @@ func (n *node) maybeSplitChild(i int, fast bool) bool { // insert inserts an Interface into the subtree rooted at this node, making sure // no nodes in the subtree exceed maxItems Interfaces. 
-func (n *node) insert(e Interface, fast bool) (out Interface, extended bool) { +func (n *node) insert(e Interface, maxItems int, fast bool) (out Interface, extended bool) { i, found := n.items.find(e) if found { out = n.items[i] @@ -242,7 +336,7 @@ func (n *node) insert(e Interface, fast bool) (out Interface, extended bool) { } return } - if n.maybeSplitChild(i, fast) { + if n.maybeSplitChild(i, maxItems, fast) { inTree := n.items[i] switch Compare(e, inTree) { case -1: @@ -255,7 +349,7 @@ func (n *node) insert(e Interface, fast bool) (out Interface, extended bool) { return } } - out, extended = n.children[i].insert(e, fast) + out, extended = n.mutableChild(i).insert(e, maxItems, fast) if !fast && extended { extended = false if i == 0 && n.children[0].Range.Start.Compare(n.Range.Start) < 0 { @@ -275,7 +369,7 @@ func (t *btree) isEmpty() bool { } func (t *btree) Get(r Range) (o []Interface) { - return t.GetWithOverlapper(r, t.Overlapper) + return t.GetWithOverlapper(r, t.overlapper) } func (t *btree) GetWithOverlapper(r Range, overlapper Overlapper) (o []Interface) { @@ -296,11 +390,11 @@ func (t *btree) DoMatching(fn Operation, r Range) bool { if !t.overlappable(r) { return false } - return t.root.doMatch(fn, r, t.Overlapper) + return t.root.doMatch(fn, r, t.overlapper) } func (t *btree) overlappable(r Range) bool { - if t.isEmpty() || !t.Overlapper.Overlap(r, t.root.Range) { + if t.isEmpty() || !t.overlapper.Overlap(r, t.root.Range) { return false } return true @@ -464,11 +558,11 @@ func (n *node) remove( panic("invalid remove type") } // If we get to here, we have children. - child := n.children[i] - if len(child.items) <= minItems { + if len(n.children[i].items) <= minItems { out, shrunk = n.growChildAndRemove(i, e, minItems, typ, fast) return } + child := n.mutableChild(i) // Either we had enough interfaces to begin with, or we've done some // merging/stealing, because we've got enough now and we're ready to return // stuff. 
@@ -603,8 +697,8 @@ func (n *node) growChildAndRemove( // func (n *node) stealFromLeftChild(i int, fast bool) { // steal - stealTo := n.children[i] - stealFrom := n.children[i-1] + stealTo := n.mutableChild(i) + stealFrom := n.mutableChild(i - 1) x := stealFrom.items.pop() y := n.items[i-1] stealTo.items.insertAt(0, y) @@ -660,8 +754,8 @@ func (n *node) stealFromLeftChild(i int, fast bool) { // func (n *node) stealFromRightChild(i int, fast bool) { // steal - stealTo := n.children[i] - stealFrom := n.children[i+1] + stealTo := n.mutableChild(i) + stealFrom := n.mutableChild(i + 1) x := stealFrom.items.removeAt(0) y := n.items[i] stealTo.items = append(stealTo.items, y) @@ -711,21 +805,22 @@ func (n *node) stealFromRightChild(i int, fast bool) { // func (n *node) mergeWithRightChild(i int, fast bool) { // merge - y := n.items.removeAt(i) - child := n.children[i] + child := n.mutableChild(i) + mergeItem := n.items.removeAt(i) mergeChild := n.children.removeAt(i + 1) - child.items = append(child.items, y) + child.items = append(child.items, mergeItem) child.items = append(child.items, mergeChild.items...) child.children = append(child.children, mergeChild.children...) if !fast { - if y.Range().End.Compare(child.Range.End) > 0 { - child.Range.End = y.Range().End + if mergeItem.Range().End.Compare(child.Range.End) > 0 { + child.Range.End = mergeItem.Range().End } if mergeChild.Range.End.Compare(child.Range.End) > 0 { child.Range.End = mergeChild.Range.End } } + n.cow.freeNode(mergeChild) } var _ Tree = (*btree)(nil) @@ -738,10 +833,58 @@ var _ Tree = (*btree)(nil) // Write operations are not safe for concurrent mutation by multiple // goroutines, but Read operations are. type btree struct { - root *node length int - Overlapper Overlapper - MinimumDegree int + minimumDegree int + overlapper Overlapper + root *node + cow *copyOnWriteContext +} + +// copyOnWriteContext pointers determine node ownership... 
a tree with a write +// context equivalent to a node's write context is allowed to modify that node. +// A tree whose write context does not match a node's is not allowed to modify +// it, and must create a new, writable copy (IE: it's a Clone). +// +// When doing any write operation, we maintain the invariant that the current +// node's context is equal to the context of the tree that requested the write. +// We do this by, before we descend into any node, creating a copy with the +// correct context if the contexts don't match. +// +// Since the node we're currently visiting on any write has the requesting +// tree's context, that node is modifiable in place. Children of that node may +// not share context, but before we descend into them, we'll make a mutable +// copy. +type copyOnWriteContext struct { + freelist *FreeList +} + +// cloneInternal clones the btree, lazily. Clone should not be called concurrently, +// but the original tree (t) and the new tree (t2) can be used concurrently +// once the Clone call completes. +// +// The internal tree structure of t is marked read-only and shared between t and +// t2. Writes to both t and t2 use copy-on-write logic, creating new nodes +// whenever one of t's original nodes would have been modified. Read operations +// should have no performance degradation. Write operations for both t and t2 +// will initially experience minor slow-downs caused by additional allocs and +// copies due to the aforementioned copy-on-write logic, but should converge to +// the original performance characteristics of the original tree. +func (t *btree) cloneInternal() (t2 *btree) { + // Create two entirely new copy-on-write contexts. + // This operation effectively creates three trees: + // the original, shared nodes (old t.cow) + // the new t.cow nodes + // the new out.cow nodes + cow1, cow2 := *t.cow, *t.cow + out := *t + t.cow = &cow1 + out.cow = &cow2 + return &out +} + +// Clone clones the btree, lazily.
+func (t *btree) Clone() Tree { + return t.cloneInternal() } // adjustRange sets the Range to the maximum extent of the childrens' Range @@ -791,32 +934,62 @@ func (t *btree) AdjustRanges() { if t.isEmpty() { return } - t.root.adjustRanges() + t.root.adjustRanges(t.root.cow) } -func (n *node) adjustRanges() { - for _, c := range n.children { - c.adjustRanges() +func (n *node) adjustRanges(c *copyOnWriteContext) { + if n.cow != c { + // Could not have been modified. + return + } + for _, child := range n.children { + child.adjustRanges(c) } n.adjustRange() } // maxItems returns the max number of Interfaces to allow per node. func (t *btree) maxItems() int { - return t.MinimumDegree*2 - 1 + return t.minimumDegree*2 - 1 } // minItems returns the min number of Interfaces to allow per node (ignored // for the root node). func (t *btree) minItems() int { - return t.MinimumDegree - 1 + return t.minimumDegree - 1 } -func (t *btree) newNode() (n *node) { - n = &node{t: t} +func (c *copyOnWriteContext) newNode() (n *node) { + n = c.freelist.newNode() + n.cow = c return } +type freeType int + +const ( + ftFreelistFull freeType = iota // node was freed (available for GC, not stored in freelist) + ftStored // node was stored in the freelist for later use + ftNotOwned // node was ignored by COW, since it's owned by another one +) + +// freeNode frees a node within a given COW context, if it's owned by that +// context. It returns what happened to the node (see freeType const +// documentation). 
+func (c *copyOnWriteContext) freeNode(n *node) freeType { + if n.cow == c { + // clear to allow GC + n.items.truncate(0) + n.children.truncate(0) + n.cow = nil // clear to allow GC + if c.freelist.freeNode(n) { + return ftStored + } + return ftFreelistFull + } + return ftNotOwned +} + func (t *btree) Insert(e Interface, fast bool) (err error) { // t.metrics("Insert") if err = isValidInterface(e); err != nil { @@ -824,7 +997,7 @@ func (t *btree) Insert(e Interface, fast bool) (err error) { } if t.root == nil { - t.root = t.newNode() + t.root = t.cow.newNode() t.root.items = append(t.root.items, e) t.length++ if !fast { @@ -832,9 +1005,12 @@ func (t *btree) Insert(e Interface, fast bool) (err error) { t.root.Range.End = e.Range().End } return nil - } else if len(t.root.items) >= t.maxItems() { + } + + t.root = t.root.mutableFor(t.cow) + if len(t.root.items) >= t.maxItems() { oldroot := t.root - t.root = t.newNode() + t.root = t.cow.newNode() if !fast { t.root.Range.Start = oldroot.Range.Start t.root.Range.End = oldroot.Range.End @@ -843,7 +1019,8 @@ func (t *btree) Insert(e Interface, fast bool) (err error) { t.root.items = append(t.root.items, e2) t.root.children = append(t.root.children, oldroot, second) } - out, _ := t.root.insert(e, fast) + + out, _ := t.root.insert(e, t.maxItems(), fast) if out == nil { t.length++ } @@ -863,9 +1040,12 @@ func (t *btree) Delete(e Interface, fast bool) (err error) { } func (t *btree) delete(e Interface, typ toRemove, fast bool) Interface { + t.root = t.root.mutableFor(t.cow) out, _ := t.root.remove(e, t.minItems(), typ, fast) if len(t.root.items) == 0 && len(t.root.children) > 0 { + oldroot := t.root t.root = t.root.children[0] + t.cow.freeNode(oldroot) } if out != nil { t.length-- @@ -918,7 +1098,47 @@ func (t *btree) Iterator() TreeIterator { return &ti } +// ClearWithOpt removes all items from the btree. If addNodesToFreelist is +// true, t's nodes are added to its freelist as part of this call, until the +// freelist is full. 
Otherwise, the root node is simply dereferenced and the +// subtree left to Go's normal GC processes. +// +// This can be much faster than calling Delete on all elements, because that +// requires finding/removing each element in the tree and updating the tree +// accordingly. It also is somewhat faster than creating a new tree to replace +// the old one, because nodes from the old tree are reclaimed into the freelist +// for use by the new one, instead of being lost to the garbage collector. +// +// This call takes: +// O(1): when addNodesToFreelist is false, this is a single operation. +// O(1): when the freelist is already full, it breaks out immediately +// O(freelist size): when the freelist is empty and the nodes are all owned +// by this tree, nodes are added to the freelist until full. +// O(tree size): when all nodes are owned by another tree, all nodes are +// iterated over looking for nodes to add to the freelist, and due to +// ownership, none are. +func (t *btree) ClearWithOpt(addNodesToFreelist bool) { + if t.root != nil && addNodesToFreelist { + t.root.reset(t.cow) + } + t.root, t.length = nil, 0 +} + func (t *btree) Clear() { - t.root = nil - t.length = 0 + t.ClearWithOpt(true) +} + +// reset returns a subtree to the freelist. It breaks out immediately if the +// freelist is full, since the only benefit of iterating is to fill that +// freelist up. Returns true if parent reset call should continue. 
+func (n *node) reset(c *copyOnWriteContext) bool { + if n.cow != c { + return false + } + for _, child := range n.children { + if !child.reset(c) { + return false + } + } + return c.freeNode(n) != ftFreelistFull } diff --git a/pkg/util/interval/btree_based_interval_test.go b/pkg/util/interval/btree_based_interval_test.go index 387f85ff58d6..ab518f2a543c 100644 --- a/pkg/util/interval/btree_based_interval_test.go +++ b/pkg/util/interval/btree_based_interval_test.go @@ -25,6 +25,7 @@ import ( "testing" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "golang.org/x/sync/errgroup" ) var btreeMinDegree = flag.Int("btree_min_degree", DefaultBTreeMinimumDegree, "B-Tree minimum degree") @@ -54,6 +55,15 @@ func rang(m, n uint32) (out items) { return } +// all extracts all items from a tree in order as a slice. +func all(t *btree) (out items) { + t.Do(func(a Interface) bool { + out = append(out, a) + return false + }) + return +} + func makeMultiByteInterval(start, end, id uint32) *Interval { return &Interval{Range{toBytes(start), toBytes(end)}, uintptr(id)} } @@ -479,7 +489,7 @@ func TestBTree(t *testing.T) { } if len := tree.Len(); len > 0 { - t.Fatalf("expected 0 item, got %d itemes", len) + t.Fatalf("expected 0 item, got %d items", len) } } } @@ -597,6 +607,89 @@ func TestLargeTree(t *testing.T) { checkFastDelete(t, tree, ivs, 10) } +const cloneTestSize = 10000 + +func cloneTest( + t *testing.T, b *btree, start int, p items, g *errgroup.Group, treeC chan *btree, +) error { + t.Logf("Starting new clone at %v", start) + treeC <- b + for i := start; i < cloneTestSize; i++ { + if err := b.Insert(p[i], false); err != nil { + return err + } + if i%(cloneTestSize/5) == 0 { + i := i + c := b.cloneInternal() + g.Go(func() error { + return cloneTest(t, c, i+1, p, g, treeC) + }) + } + } + return nil +} + +func TestCloneConcurrentOperations(t *testing.T) { + var trees []*btree + treeC, treeDone := make(chan *btree), make(chan struct{}) + go func() { + for b := range treeC { + 
trees = append(trees, b) + } + close(treeDone) + }() + + var g errgroup.Group + b := newBTree(InclusiveOverlapper) + p := perm(cloneTestSize) + g.Go(func() error { + return cloneTest(t, b, 0, p, &g, treeC) + }) + if err := g.Wait(); err != nil { + t.Fatal(err) + } + close(treeC) + <-treeDone + + want := rang(0, cloneTestSize-1) + t.Logf("Starting equality checks on %d trees", len(trees)) + for i, tree := range trees { + if !reflect.DeepEqual(want, all(tree)) { + t.Errorf("tree %v mismatch", i) + } + } + + t.Log("Removing half from first half") + toRemove := want[cloneTestSize/2:] + for i := 0; i < len(trees)/2; i++ { + tree := trees[i] + g.Go(func() error { + for _, item := range toRemove { + if err := tree.Delete(item, false); err != nil { + return err + } + } + return nil + }) + } + if err := g.Wait(); err != nil { + t.Fatal(err) + } + + t.Log("Checking all values again") + for i, tree := range trees { + var wantpart items + if i < len(trees)/2 { + wantpart = want[:cloneTestSize/2] + } else { + wantpart = want + } + if got := all(tree); !reflect.DeepEqual(wantpart, got) { + t.Errorf("tree %v mismatch, want %v got %v", i, len(wantpart), len(got)) + } + } +} + func TestIterator(t *testing.T) { var ivs items const treeSize = 400 @@ -613,3 +706,218 @@ func TestIterator(t *testing.T) { tree.AdjustRanges() checkIterator(t, tree, ivs) } + +func forBenchmarkSizes(b *testing.B, f func(b *testing.B, count int)) { + for _, count := range []int{16, 128, 1024, 8192, 65536} { + b.Run(fmt.Sprintf("count=%d", count), func(b *testing.B) { + f(b, count) + }) + } +} + +func BenchmarkBTreeInsert(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) + b.ResetTimer() + i := 0 + for i < b.N { + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + i++ + if i >= b.N { + return + } + } + } + }) +} + +func BenchmarkBTreeDelete(b *testing.B) { +
forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP, removeP := perm(uint32(count)), perm(uint32(count)) + b.ResetTimer() + i := 0 + for i < b.N { + b.StopTimer() + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + } + b.StartTimer() + for _, item := range removeP { + if err := tr.Delete(item, false); err != nil { + b.Fatal(err) + } + i++ + if i >= b.N { + return + } + } + if tr.Len() > 0 { + panic(tr.Len()) + } + } + }) +} + +func BenchmarkBTreeDeleteInsert(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + if err := tr.Delete(insertP[i%count], false); err != nil { + b.Fatal(err) + } + if err := tr.Insert(insertP[i%count], false); err != nil { + b.Fatal(err) + } + } + }) +} + +func BenchmarkBTreeDeleteInsertCloneOnce(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + } + tr = tr.cloneInternal() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if err := tr.Delete(insertP[i%count], false); err != nil { + b.Fatal(err) + } + if err := tr.Insert(insertP[i%count], false); err != nil { + b.Fatal(err) + } + } + }) +} + +func BenchmarkBTreeDeleteInsertCloneEachTime(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + tr = tr.cloneInternal() + if err := tr.Delete(insertP[i%count], false); err != nil { + 
b.Fatal(err) + } + if err := tr.Insert(insertP[i%count], false); err != nil { + b.Fatal(err) + } + } + }) +} + +func BenchmarkBTreeGet(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) + removeP := perm(uint32(count)) + b.ResetTimer() + i := 0 + for i < b.N { + b.StopTimer() + tr := newBTree(InclusiveOverlapper) + for _, item := range insertP { + if err := tr.Insert(item, false); err != nil { + b.Fatal(err) + } + } + b.StartTimer() + for _, item := range removeP { + tr.Get(item.Range()) + i++ + if i >= b.N { + return + } + } + } + }) +} + +func BenchmarkBTreeGetCloneEachTime(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(uint32(count)) + removeP := perm(uint32(count)) + b.ResetTimer() + i := 0 + for i < b.N { + b.StopTimer() + tr := newBTree(InclusiveOverlapper) + for _, v := range insertP { + if err := tr.Insert(v, false); err != nil { + b.Fatal(err) + } + } + b.StartTimer() + for _, item := range removeP { + tr = tr.cloneInternal() + tr.Get(item.Range()) + i++ + if i >= b.N { + return + } + } + } + }) +} + +func key(i int) Comparable { + return []byte(fmt.Sprintf("%04d", i)) +} + +func rangeWithEnd(start, end int) Range { + return Range{Start: key(start), End: key(end)} +} + +func randomRange(rng *rand.Rand, n int) Range { + start := rng.Intn(n) + end := rng.Intn(n + 1) + if end < start { + start, end = end, start + } + return rangeWithEnd(start, end) +} + +func BenchmarkBTreeOverlapScan(b *testing.B) { + tr := newBTree(InclusiveOverlapper) + rng := rand.New(rand.NewSource(timeutil.Now().UnixNano())) + + const count = 8 << 10 + const size = 2 * 31 + for i := 0; i < count; i++ { + iv := &Interval{rangeWithEnd(i, i+size+1), uintptr(i)} + if err := tr.Insert(iv, false); err != nil { + b.Fatal(err) + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + cmd := randomRange(rng, count) + tr.DoMatching(func(e Interface) bool { + return false + }, cmd) + } +} diff --git 
a/pkg/util/interval/interval.go b/pkg/util/interval/interval.go index ceef00be3730..67a6ca06c0c7 100644 --- a/pkg/util/interval/interval.go +++ b/pkg/util/interval/interval.go @@ -154,7 +154,7 @@ func Compare(a, b Interface) int { // former has measurably better performance than the latter. So Equal should be used when only // equality state is needed. func Equal(a, b Interface) bool { - return a.Range().Start.Equal(b.Range().Start) && a.ID() == b.ID() + return a.ID() == b.ID() && a.Range().Start.Equal(b.Range().Start) } // A Comparable is a type that describes the ends of a Range. @@ -224,6 +224,8 @@ type Tree interface { Iterator() TreeIterator // Clear this tree. Clear() + // Clone clones the tree, returning a copy. + Clone() Tree } // TreeIterator iterates over all intervals stored in the interval tree, in-order. @@ -234,7 +236,7 @@ type TreeIterator interface { Next() (Interface, bool) } -var useBTreeImpl = envutil.EnvOrDefaultBool("COCKROACH_INTERVAL_BTREE", false) +var useBTreeImpl = envutil.EnvOrDefaultBool("COCKROACH_INTERVAL_BTREE", true) // NewTree creates a new interval tree with the given overlapper function. It // uses the augmented Left-Leaning Red Black tree implementation. diff --git a/pkg/util/interval/llrb_based_interval.go b/pkg/util/interval/llrb_based_interval.go index cfea42c883ba..e2a176959e70 100644 --- a/pkg/util/interval/llrb_based_interval.go +++ b/pkg/util/interval/llrb_based_interval.go @@ -676,3 +676,7 @@ func (t *llrbTree) Clear() { t.Root = nil t.Count = 0 } + +func (t *llrbTree) Clone() Tree { + panic("unimplemented") +}