From ae52503c24dde014f2784e6a39ab81f7b0ab9ab5 Mon Sep 17 00:00:00 2001 From: Bilal Akhtar Date: Tue, 7 Apr 2020 17:40:49 -0400 Subject: [PATCH] internal/manifest: Add L0SubLevels methods to pick compactions This change adds methods to L0SubLevels to help pick, score, and generate L0 -> LBase, and L0 -> L0 compactions, based on information captured in the data structure about L0 sublevels. These functions will be called from in compaction.go and compaction_picker.go in a future change. Also adds associated datadriven unit tests, and a benchmark. Covers a large part of #563. Thanks Sumeer for his work, most of this was written by him. --- internal/manifest/l0_sublevels.go | 903 +++++++++++++++++++++++- internal/manifest/l0_sublevels_test.go | 154 +++- internal/manifest/testdata/l0_sublevels | 444 +++++++++++- 3 files changed, 1493 insertions(+), 8 deletions(-) diff --git a/internal/manifest/l0_sublevels.go b/internal/manifest/l0_sublevels.go index 415fd9da7f..77a8a98677 100644 --- a/internal/manifest/l0_sublevels.go +++ b/internal/manifest/l0_sublevels.go @@ -7,6 +7,7 @@ package manifest import ( "bytes" "fmt" + "math" "sort" "strings" @@ -14,8 +15,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" ) -// TODO(bilal): work items: -// - Integration with Pebble +// TODO(bilal): Integrate compaction picking logic with the rest of pebble. // Intervals are of the form [start, end) with no gap between intervals. Each // file overlaps perfectly with a sequence of intervals. This perfect overlap @@ -444,3 +444,902 @@ func (s *L0SubLevels) MaxDepthAfterOngoingCompactions() int { } return depth } + +// Only for temporary debugging in the absence of proper tests. +// +// TODO(bilal): Simplify away the debugging statements in this method, and make +// this a pure sanity checker. 
+func (s *L0SubLevels) checkCompaction(c *Level0CompactionFiles, isBase bool) error { + includedFiles := make([]bool, len(s.filesByAge)) + fileIntervalsByLevel := make([]struct { + min int + max int + }, len(s.Files)) + for i := range fileIntervalsByLevel { + fileIntervalsByLevel[i].min = math.MaxInt32 + fileIntervalsByLevel[i].max = 0 + } + var topLevel int + var increment int + var limitReached func(int) bool + if isBase { + topLevel = 0 + increment = -1 + limitReached = func(level int) bool { + return level < 0 + } + } else { + topLevel = len(s.Files) - 1 + increment = +1 + limitReached = func(level int) bool { + return level == len(s.Files) + } + } + for _, f := range c.Files { + if f.minIntervalIndex < fileIntervalsByLevel[f.subLevel].min { + fileIntervalsByLevel[f.subLevel].min = f.minIntervalIndex + } + if f.maxIntervalIndex > fileIntervalsByLevel[f.subLevel].max { + fileIntervalsByLevel[f.subLevel].max = f.maxIntervalIndex + } + includedFiles[f.l0Index] = true + if isBase { + if topLevel < f.subLevel { + topLevel = f.subLevel + } + } else { + if topLevel > f.subLevel { + topLevel = f.subLevel + } + } + } + min := fileIntervalsByLevel[topLevel].min + max := fileIntervalsByLevel[topLevel].max + for level := topLevel; !limitReached(level); level += increment { + if fileIntervalsByLevel[level].min < min { + min = fileIntervalsByLevel[level].min + } + if fileIntervalsByLevel[level].max > max { + max = fileIntervalsByLevel[level].max + } + index := sort.Search(len(s.Files[level]), func(i int) bool { + return s.Files[level][i].maxIntervalIndex >= min + }) + // start := index + for ; index < len(s.Files[level]); index++ { + f := s.Files[level][index] + if f.minIntervalIndex > max { + break + } + if c.isIntraL0 && f.LargestSeqNum >= c.earliestUnflushedSeqNum { + return errors.Errorf( + "sstable %s in compaction has sequence numbers higher than the earliest unflushed seqnum %d: %d-%d", + f.FileNum, c.earliestUnflushedSeqNum, f.SmallestSeqNum, + f.LargestSeqNum) + } + 
if !includedFiles[f.l0Index] { + var buf strings.Builder + fmt.Fprintf(&buf, "bug %t, seed interval: %d: level %d, sl index %d, f.index %d, min %d, max %d, pre-min %d, pre-max %d, f.min %d, f.max %d, filenum: %d, isCompacting: %t\n%s\n", + c.isIntraL0, c.seedInterval, level, index, f.l0Index, min, max, c.preExtensionMinInterval, c.preExtensionMaxInterval, + f.minIntervalIndex, f.maxIntervalIndex, + f.FileNum, f.Compacting, s) + fmt.Fprintf(&buf, "files included:\n") + for _, f := range c.Files { + fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n", + f.FileNum, f.subLevel, f.l0Index, f.minIntervalIndex, f.maxIntervalIndex) + } + fmt.Fprintf(&buf, "files added:\n") + for _, f := range c.filesAdded { + fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n", + f.FileNum, f.subLevel, f.l0Index, f.minIntervalIndex, f.maxIntervalIndex) + } + } + } + } + return nil +} + +// UpdateStateForManualCompaction updates the state of L0SubLevels for a +// recently started manual compaction. Builds a fake Level0CompactionFiles +// matching the manual compaction and calls UpdateStateForStartedCompaction. +// Requires all specified files to be in L0. +func (s *L0SubLevels) UpdateStateForManualCompaction(files []*FileMetadata) error { + i := 0 + j := 0 + c := Level0CompactionFiles{ + Files: make([]*FileMetadata, 0, len(files)), + FilesIncluded: make([]bool, len(s.filesByAge)), + } + for ; i < len(files) && j < len(s.filesByAge); j++ { + f1 := files[i] + f2 := s.filesByAge[j] + if f1 == f2 { + c.addFile(s.filesByAge[j]) + i++ + } + } + if i != len(files) { + return errors.Errorf("not all specified files were found in filesByAge: %v", files) + } + return s.UpdateStateForStartedCompaction(&c, true) +} + +// UpdateStateForStartedCompaction updates internal L0SubLevels state for a +// recently started compaction. isBase specifies if this is a base compaction; +// if false, this is assumed to be an intra-L0 compaction. 
The specified +// compaction must be involving L0 SSTables. +func (s *L0SubLevels) UpdateStateForStartedCompaction(c *Level0CompactionFiles, isBase bool) error { + if err := s.checkCompaction(c, isBase); err != nil { + return err + } + for _, f := range c.Files { + if f.Compacting { + return errors.Errorf("L0: %06d already being compacted", f.FileNum) + } + f.Compacting = true + if !isBase { + f.IsIntraL0Compacting = true + } + for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { + interval := &s.orderedIntervals[i] + if isBase { + interval.isBaseCompacting = true + } + interval.compactingFileCount++ + } + } + if isBase { + for i := c.minIntervalIndex; i <= c.maxIntervalIndex; i++ { + interval := &s.orderedIntervals[i] + for j := interval.filesMinIntervalIndex; j <= interval.filesMaxIntervalIndex; j++ { + s.orderedIntervals[j].intervalRangeIsBaseCompacting = true + } + // If there is no intra-L0 for this interval, the compacting files + // may have encroached on the topOfStackNonCompactingFileCount. + if interval.compactingFileCount+interval.topOfStackNonCompactingFileCount > interval.fileCount { + interval.topOfStackNonCompactingFileCount = interval.fileCount - interval.compactingFileCount + } + } + } else { + for i := c.minIntervalIndex; i <= c.maxIntervalIndex; i++ { + interval := &s.orderedIntervals[i] + interval.topOfStackNonCompactingFileCount = 0 + for j := len(interval.subLevelAndFileList) - 1; j >= 0; j-- { + fileIndex := interval.subLevelAndFileList[j].fileIndex + if s.filesByAge[fileIndex].IsIntraL0Compacting { + break + } + interval.topOfStackNonCompactingFileCount++ + } + } + } + return nil +} + +// Level0CompactionFiles represents a candidate set of L0 files for compaction. +// Also referred to as "lcf". Contains state information useful +// for generating the compaction (such as Files), as well as for picking +// between candidate compactions (eg. fileBytes and +// seedIntervalStackDepthReduction). 
+type Level0CompactionFiles struct { + Files []*FileMetadata + FilesIncluded []bool + // A "seed interval" is an interval with a high stack depth that was chosen + // to bootstrap this compaction candidate. + seedIntervalStackDepthReduction int + seedIntervalExtremeLevel int + seedInterval int + fileBytes uint64 + minIntervalIndex int + maxIntervalIndex int + + // Set for intra-L0 compactions. SSTables with sequence numbers greater + // than earliestUnflushedSeqNum cannot be a part of intra-L0 compactions. + isIntraL0 bool + earliestUnflushedSeqNum uint64 + + // For debugging purposes only. Used in checkCompaction(). + preExtensionMinInterval int + preExtensionMaxInterval int + filesAdded []*FileMetadata +} + +// Adds the specified file to the LCF. +func (l *Level0CompactionFiles) addFile(f *FileMetadata) { + l.FilesIncluded[f.l0Index] = true + l.Files = append(l.Files, f) + l.filesAdded = append(l.filesAdded, f) + l.fileBytes += f.Size + if f.minIntervalIndex < l.minIntervalIndex { + l.minIntervalIndex = f.minIntervalIndex + } + if f.maxIntervalIndex > l.maxIntervalIndex { + l.maxIntervalIndex = f.maxIntervalIndex + } +} + +// Helper to order intervals being considered for compaction. +type intervalAndScore struct { + interval *fileInterval + score int +} +type intervalSorterByDecreasingScore []intervalAndScore + +func (is intervalSorterByDecreasingScore) Len() int { return len(is) } +func (is intervalSorterByDecreasingScore) Less(i, j int) bool { + return is[i].score > is[j].score +} +func (is intervalSorterByDecreasingScore) Swap(i, j int) { + is[i], is[j] = is[j], is[i] +} + +// Compactions: +// +// The sub-levels and intervals can be visualized in 2 dimensions as the X +// axis containing intervals in increasing order and the Y axis containing +// sub-levels (older to younger). The intervals can be sparse wrt sub-levels. 
+// We observe that the system is typically under severe pressure in L0 during +// large imports where most files added to L0 are narrow and non-overlapping. +// In that case we expect the rectangle represented in the above visualization +// to be wide and short, and not too sparse (most intervals will have +// fileCount close to the sub-level count), which would make it amenable to +// concurrent L0 => Lbase compactions. +// +// L0 => Lbase: The high-level goal of a L0 => Lbase compaction is to reduce +// stack depth, by compacting files in the intervals with the highest +// (fileCount - compactingCount). Additionally, we would like compactions to +// not involve a huge number of files, so that they finish quickly, and to +// allow for concurrent L0 => Lbase compactions when needed. In order to +// achieve these goals we would like compactions to visualize as capturing +// thin and tall rectangles. The approach below is to consider intervals in +// some order and then try to construct a compaction using the interval. The +// first interval we can construct a compaction for is the compaction that is +// started. There can be multiple heuristics in choosing the ordering of the +// intervals -- the code uses one heuristic that worked well for a large import, +// but additional experimentation is necessary to pick a general heuristic. +// Additionally, the compaction that gets picked may be not as desirable as one +// that could be constructed later in terms of reducing stack depth (since +// adding more files to the compaction can get blocked by needing to encompass +// files that are already being compacted). So an alternative would be to try to +// construct more than one compaction and pick the best one. +// +// Intra-L0: If the L0 score is high, but PickBaseCompaction() is unable to +// pick a compaction, PickIntraL0Compaction will be used to pick an intra-L0 +// compaction. 
Similar to L0 => Lbase compactions, we want to allow for +// multiple intra-L0 compactions and not generate wide output files that +// hinder later concurrency of L0 => Lbase compactions. Also compactions +// that produce wide files don't reduce stack depth -- they represent wide +// rectangles in our visualization, which means many intervals have their +// depth reduced by a small amount. Typically, L0 files have non-overlapping +// sequence numbers, and sticking to that invariant would require us to +// consider intra-L0 compactions that proceed from youngest to oldest files, +// which could result in the aforementioned undesirable wide rectangle +// shape. But this non-overlapping sequence number is already relaxed in +// RocksDB -- sstables are primarily ordered by their largest sequence +// number. So we can arrange for intra-L0 compactions to capture thin and +// tall rectangles starting with the top of the stack (youngest files). +// Like the L0 => Lbase case we order the intervals using a heuristic and +// consider each in turn. The same comment about better heuristics and not +// being greedy applies here. + +// PickBaseCompaction picks a base compaction based on the above specified +// heuristics, for the specified Lbase files and a minimum depth of overlapping +// files that can be selected for compaction. Returns nil if no compaction is +// possible. +func (s *L0SubLevels) PickBaseCompaction( + minCompactionDepth int, baseFiles []*FileMetadata, +) (*Level0CompactionFiles, error) { + return s.pickCompaction(false, minCompactionDepth, baseFiles, 0) +} + +// Helper function to pick base or intra-L0 compactions. Encapsulates common +// logic for both Pick{Base,IntraL0}Compaction. 
+func (s *L0SubLevels) pickCompaction( + isIntraL0 bool, minCompactionDepth int, baseFiles []*FileMetadata, earliestUnflushedSeqNum uint64, +) (*Level0CompactionFiles, error) { + // For LBase compactions, we consider intervals in a greedy manner in the following order: + // - pools[0]: Contains intervals that are unlikely to be blocked due + // to ongoing L0 => Lbase compactions. These are the ones with + // !isBaseCompacting && !intervalRangeIsBaseCompacting. + // - pools[1]: Contains intervals that are !isBaseCompacting && intervalRangeIsBaseCompacting. + // + // The ordering heuristic exists just to avoid wasted work. Ideally, + // we would consider all intervals with isBaseCompacting = false and + // construct a compaction for it and compare the constructed compactions + // and pick the best one. If microbenchmarks show that we can afford + // this cost we can eliminate this heuristic. + // + // For Intra-L0 compactions, we only use one pool for all intervals. + pools := [][]intervalAndScore{{}} + if !isIntraL0 { + pools = append(pools, []intervalAndScore{}) + } + for i := range s.orderedIntervals { + interval := &s.orderedIntervals[i] + depth := interval.fileCount - interval.compactingFileCount + if isIntraL0 { + if minCompactionDepth > depth || minCompactionDepth > interval.topOfStackNonCompactingFileCount { + continue + } + // TODO(bilal): Is there a way to incorporate + // topOfStackNonCompactingFileCount into the score? 
+ pools[0] = append(pools[0], intervalAndScore{interval: interval, score: depth}) + } else { + if interval.isBaseCompacting || minCompactionDepth > depth { + continue + } + if interval.intervalRangeIsBaseCompacting { + pools[0] = append(pools[0], intervalAndScore{interval: interval, score: depth}) + } else { + pools[1] = append(pools[1], intervalAndScore{interval: interval, score: depth}) + } + } + } + for _, pool := range pools { + sort.Sort(intervalSorterByDecreasingScore(pool)) + } + + // Optimization to avoid considering different intervals that + // are likely to choose the same seed file. Again this is just + // to reduce wasted work. + consideredIntervals := make([]bool, len(s.orderedIntervals)) + for _, pool := range pools { + for _, interval := range pool { + if consideredIntervals[interval.interval.index] { + continue + } + + var f *FileMetadata + if isIntraL0 { + // Pick the seed file for the interval as the file + // in the highest sub-level. + slIndex := len(interval.interval.subLevelAndFileList) - 1 + adjustedNonCompactingFileCount := interval.interval.topOfStackNonCompactingFileCount + for ; slIndex >= 0; slIndex-- { + slf := interval.interval.subLevelAndFileList[slIndex] + f = s.filesByAge[slf.fileIndex] + if f.Compacting { + return nil, errors.Errorf("file %d being considered for intra-L0 should not be compacting", f.FileNum) + } + for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { + consideredIntervals[i] = true + } + // Can this be the seed file? Files with newer sequence + // numbers than earliestUnflushedSeqNum cannot be in + // the compaction. + if f.LargestSeqNum >= earliestUnflushedSeqNum { + adjustedNonCompactingFileCount-- + if adjustedNonCompactingFileCount == 0 { + break + } + } else { + break + } + } + if adjustedNonCompactingFileCount < minCompactionDepth { + // Can't use this interval. + continue + } + } else { + // Pick the seed file for the interval as the file + // in the lowest sub-level. 
+ slf := interval.interval.subLevelAndFileList[0] + f = s.filesByAge[slf.fileIndex] + for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { + // Don't bother considering the intervals that are + // covered by the seed file since they are likely + // nearby. Note that it is possible that those intervals + // have seed files at lower sub-levels so could be + // viable for compaction. + consideredIntervals[i] = true + } + } + + if f == nil { + return nil, errors.New("no seed file found in sublevel intervals") + } + if !isIntraL0 && f.Compacting && f.IsIntraL0Compacting { + // If we're picking a base compaction and we came across a + // seed file candidate that's being intra-L0 compacted, skip + // the interval instead of erroring out. + continue + } else if f.Compacting { + // We chose a compaction seed file that should not be + // compacting. Usually means the score is not accurately + // accounting for files already compacting, or internal state is + // inconsistent. + return nil, errors.Errorf("file %d chosen as seed file for compaction should not be compacting", f.FileNum) + } + + if isIntraL0 { + // We have a seed file. + c := s.intraL0CompactionUsingSeed( + f, interval.interval.index, earliestUnflushedSeqNum, minCompactionDepth) + if c != nil { + return c, nil + } + } else { + c := s.baseCompactionUsingSeed(f, interval.interval.index, minCompactionDepth) + if c != nil { + // Check if the chosen compaction overlaps with any files + // in Lbase that have Compacting = true. If that's the case, + // this compaction cannot be chosen. + firstBaseIndex := sort.Search(len(baseFiles), func(i int) bool { + // An interval starting at ImmediateSuccessor(key) can never be the + // first interval of a compaction since no file can start at that + // interval. 
+ return s.cmp(baseFiles[i].Largest.UserKey, s.orderedIntervals[c.minIntervalIndex].startKey.key) >= 0 + }) + // Exclusive + lastBaseIndex := sort.Search(len(baseFiles), func(i int) bool { + cmp := s.cmp(baseFiles[i].Smallest.UserKey, s.orderedIntervals[c.maxIntervalIndex+1].startKey.key) + // Compaction is ending at exclusive bound of c.maxIntervalIndex+1 + if cmp > 0 || (cmp == 0 && !s.orderedIntervals[c.maxIntervalIndex+1].startKey.isLargest) { + return true + } + return false + }) + baseCompacting := false + for j := firstBaseIndex; j < lastBaseIndex; j++ { + if baseFiles[j].Compacting { + baseCompacting = true + break + } + } + if baseCompacting { + continue + } + return c, nil + } + } + } + } + return nil, nil +} + +// Helper function for building an L0 -> Lbase compaction using a seed interval +// and seed file in that seed interval. +func (s *L0SubLevels) baseCompactionUsingSeed( + f *FileMetadata, intervalIndex int, minCompactionDepth int, +) *Level0CompactionFiles { + cFiles := &Level0CompactionFiles{ + Files: []*FileMetadata{f}, + seedInterval: intervalIndex, + seedIntervalStackDepthReduction: 1, + seedIntervalExtremeLevel: f.subLevel, + minIntervalIndex: f.minIntervalIndex, + maxIntervalIndex: f.maxIntervalIndex, + fileBytes: f.Size, + } + sl := f.subLevel + filesIncluded := make([]bool, len(s.filesByAge)) + filesIncluded[f.l0Index] = true + cFiles.FilesIncluded = filesIncluded + // The seed file captures all files in the next level that fall + // in the range of intervals. That may extend the range of + // intervals so for correctness we need to capture all files + // in the next level that fall in this extended interval and + // so on. This can result in a triangular shape like the following + // where again the X axis is the key intervals and the Y axis + // is oldest to youngest. 
Note that it is not necessary for + // correctness to fill out the shape at the higher sub-levels + // to make it more rectangular since the invariant only requires + // that younger versions of a key not be moved to Lbase while + // leaving behind older versions. + // - + // --- + // ----- + // It may be better for performance to have a more rectangular + // shape since the files being left behind will touch the + // same Lbase key range as that of this compaction. But there is + // also the danger that in trying to construct a more rectangular + // shape we will be forced to pull in a file that is already + // compacting. We assume that the performance concern is not a + // practical issue. + for currLevel := sl - 1; currLevel >= 0; currLevel-- { + if !s.extendFiles(currLevel, math.MaxUint64, cFiles) { + // Failed due to ongoing compaction. + return nil + } + } + + // Now that we have a candidate group of files we can optionally add to it + // by stacking more files from intervalIndex and repeating. This is an + // optional activity so when it fails we can fallback to the last + // successful candidate. + lastCandidate := &Level0CompactionFiles{} + *lastCandidate = *cFiles + slfList := s.orderedIntervals[intervalIndex].subLevelAndFileList + slIndex := 1 + for ; slIndex < len(slfList); slIndex++ { + sl := slfList[slIndex].subLevel + f2 := s.filesByAge[slfList[slIndex].fileIndex] + cFiles.seedIntervalStackDepthReduction++ + cFiles.seedIntervalExtremeLevel = sl + cFiles.addFile(f2) + done := false + for currLevel := sl - 1; currLevel >= 0; currLevel-- { + if !s.extendFiles(currLevel, math.MaxUint64, cFiles) { + // Failed to extend due to ongoing compaction. + done = true + break + } + } + if done { + break + } + // Observed some compactions using > 1GB from L0. Very long running compactions + // are not good, though sometimes unavoidable. 
There is a tradeoff here in + // that adding more depth is more efficient in reducing stack depth, but + // long running compactions reduce flexibility in what can run concurrently + // in L0 and even Lbase => Lbase+1. An increase more than 150% + // in bytes since the last candidate compaction, or a total compaction + // size beyond a hard limit of 500mb, is criteria for rejecting this + // candidate. This lets us prefer slow growths as we add files, while + // still having a hard limit. + if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth && + cFiles.fileBytes > 100<<20 && + (float64(cFiles.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || cFiles.fileBytes > 500<<20) { + break + } + *lastCandidate = *cFiles + } + if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth { + for i := range lastCandidate.FilesIncluded { + lastCandidate.FilesIncluded[i] = false + } + for _, f := range lastCandidate.Files { + lastCandidate.FilesIncluded[f.l0Index] = true + } + return lastCandidate + } + return nil +} + +// Expands fields in the provided Level0CompactionFiles instance (cFiles) to +// include overlapping files in the specified sublevel. Returns true if the +// compaction is possible (i.e. does not conflict with any base/intra-L0 +// compacting files). +func (s *L0SubLevels) extendFiles( + sl int, earliestUnflushedSeqNum uint64, cFiles *Level0CompactionFiles, +) bool { + index := sort.Search(len(s.Files[sl]), func(i int) bool { + return s.Files[sl][i].maxIntervalIndex >= cFiles.minIntervalIndex + }) + for ; index < len(s.Files[sl]); index++ { + f := s.Files[sl][index] + if f.minIntervalIndex > cFiles.maxIntervalIndex { + break + } + if cFiles.FilesIncluded[f.l0Index] || f.LargestSeqNum >= earliestUnflushedSeqNum { + continue + } + if f.Compacting { + return false + } + cFiles.addFile(f) + } + return true +} + +// PickIntraL0Compaction picks an intra-L0 compaction for files in this +// sublevel. 
This method is only called when a base compaction cannot be chosen. +// See comment above PickBaseCompaction for heuristics involved in this +// selection. +func (s *L0SubLevels) PickIntraL0Compaction( + earliestUnflushedSeqNum uint64, minCompactionDepth int, +) (*Level0CompactionFiles, error) { + return s.pickCompaction(true, minCompactionDepth, nil, earliestUnflushedSeqNum) +} + +func (s *L0SubLevels) intraL0CompactionUsingSeed( + f *FileMetadata, intervalIndex int, earliestUnflushedSeqNum uint64, minCompactionDepth int, +) *Level0CompactionFiles { + // We know that all the files that overlap with intervalIndex have + // LargestSeqNum < earliestUnflushedSeqNum, but for other intervals + // we need to exclude files >= earliestUnflushedSeqNum + + cFiles := &Level0CompactionFiles{ + Files: []*FileMetadata{f}, + seedInterval: intervalIndex, + seedIntervalStackDepthReduction: 1, + seedIntervalExtremeLevel: f.subLevel, + minIntervalIndex: f.minIntervalIndex, + maxIntervalIndex: f.maxIntervalIndex, + fileBytes: f.Size, + isIntraL0: true, + earliestUnflushedSeqNum: earliestUnflushedSeqNum, + } + sl := f.subLevel + filesIncluded := make([]bool, len(s.filesByAge)) + filesIncluded[f.l0Index] = true + cFiles.FilesIncluded = filesIncluded + + // The seed file captures all files in the higher level that fall in the + // range of intervals. That may extend the range of intervals so for + // correctness we need to capture all files in the next higher level that + // fall in this extended interval and so on. This can result in an + // inverted triangular shape like the following where again the X axis is the + // key intervals and the Y axis is oldest to youngest. Note that it is not + // necessary for correctness to fill out the shape at lower sub-levels to + // make it more rectangular since the invariant only requires that if we + // move an older seqnum for key k into a file that has a higher seqnum, we + // also move all younger seqnums for that key k into that file. 
+ // ----- + // --- + // - + // + // It may be better for performance to have a more rectangular shape since + // it will reduce the stack depth for more intervals. But there is also + // the danger that in explicitly trying to construct a more rectangular + // shape we will be forced to pull in a file that is already compacting. + // We assume that the performance concern is not a practical issue. + for currLevel := sl + 1; currLevel < len(s.Files); currLevel++ { + if !s.extendFiles(currLevel, earliestUnflushedSeqNum, cFiles) { + // Failed due to ongoing compaction. + return nil + } + } + + // Now that we have a candidate group of files we can optionally add to it + // by stacking more files from intervalIndex and repeating. This is an + // optional activity so when it fails we can fallback to the last + // successful candidate. Currently the code keeps adding until it can't + // add more, but we could optionally stop based on + // levelOCompactionFiles.fileBytes being too large. + lastCandidate := &Level0CompactionFiles{} + *lastCandidate = *cFiles + slfList := s.orderedIntervals[intervalIndex].subLevelAndFileList + slIndex := len(slfList) - 1 + for { + if slfList[slIndex].fileIndex == f.l0Index { + break + } + slIndex-- + } + slIndex-- + for ; slIndex >= 0; slIndex-- { + sl := slfList[slIndex].subLevel + f2 := s.filesByAge[slfList[slIndex].fileIndex] + if f2.Compacting { + break + } + cFiles.seedIntervalStackDepthReduction++ + cFiles.seedIntervalExtremeLevel = sl + cFiles.addFile(f2) + done := false + for currLevel := sl + 1; currLevel < len(s.Files); currLevel++ { + if !s.extendFiles(currLevel, earliestUnflushedSeqNum, cFiles) { + // Failed to extend due to ongoing compaction. 
+ done = true + break + } + } + if done { + break + } + if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth && + cFiles.fileBytes > 100<<20 && + (float64(cFiles.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || cFiles.fileBytes > 500<<20) { + break + } + *lastCandidate = *cFiles + } + if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth { + for i := range lastCandidate.FilesIncluded { + lastCandidate.FilesIncluded[i] = false + } + for _, f := range lastCandidate.Files { + lastCandidate.FilesIncluded[f.l0Index] = true + } + s.extendCandidateToRectangle( + lastCandidate.minIntervalIndex, lastCandidate.maxIntervalIndex, lastCandidate, false) + return lastCandidate + } + return nil +} + +// ExtendL0ForBaseCompactionTo extends the specified base compaction candidate +// Level0CompactionFiles to cover all L0 files in the specified key interval, +// by calling extendCandidateToRectangle. +func (s *L0SubLevels) ExtendL0ForBaseCompactionTo( + smallest []byte, largest []byte, candidate *Level0CompactionFiles, +) bool { + firstIntervalIndex := sort.Search(len(s.orderedIntervals), func(i int) bool { + // Need to start at >= smallest since if we widen too much we may miss + // an Lbase file that overlaps with an L0 file that will get picked in + // this widening, which would be bad. This interval will not start with + // an immediate successor key. + return s.cmp(smallest, s.orderedIntervals[i].startKey.key) <= 0 + }) + // First interval that starts at or beyond the largest. This interval will not + // start with an immediate successor key. + lastIntervalIndex := sort.Search(len(s.orderedIntervals), func(i int) bool { + return s.cmp(largest, s.orderedIntervals[i].startKey.key) < 0 + }) + // Right now, lastIntervalIndex has a start that's higher than largest. + // The previous interval, by definition, has an end key higher than largest. + // Iterate back twice to get the last interval that's completely within + // [smallest, largest]. 
Except in the case where we went past the end of the + // list; in that case, the last interval to include is the very last + // interval in the list. + if lastIntervalIndex < len(s.orderedIntervals) { + lastIntervalIndex-- + } + lastIntervalIndex-- + if lastIntervalIndex < firstIntervalIndex { + return false + } + return s.extendCandidateToRectangle(firstIntervalIndex, lastIntervalIndex, candidate, true) +} + +// Best-effort attempt to make the compaction include more files in the +// rectangle defined by [minIntervalIndex, maxIntervalIndex] on the X axis and +// bounded on one side of the Y axis by candidate.seedIntervalExtremeLevel (the +// other side is 0 for L0 => Lbase compactions and len(s.Files)-1 for +// intra-L0 compactions). +// +// This is strictly an optional extension; at any point where we can't feasibly +// add more files, the sublevel iteration can be halted early and candidate will +// still be a correct compaction candidate. +func (s *L0SubLevels) extendCandidateToRectangle( + minIntervalIndex int, maxIntervalIndex int, candidate *Level0CompactionFiles, isBase bool, +) bool { + candidate.preExtensionMinInterval = candidate.minIntervalIndex + candidate.preExtensionMaxInterval = candidate.maxIntervalIndex + // Extend {min,max}IntervalIndex to include all of the candidate's current + // bounds. + if minIntervalIndex > candidate.minIntervalIndex { + minIntervalIndex = candidate.minIntervalIndex + } + if maxIntervalIndex < candidate.maxIntervalIndex { + maxIntervalIndex = candidate.maxIntervalIndex + } + var startLevel int + var increment int + var limitReached func(int) bool + if isBase { + startLevel = 0 + increment = +1 + limitReached = func(sl int) bool { + return sl > candidate.seedIntervalExtremeLevel + } + } else { + startLevel = len(s.Files) - 1 + increment = -1 + limitReached = func(sl int) bool { + return sl < candidate.seedIntervalExtremeLevel + } + } + // Stats for files. 
+ addedCount := 0 + // Iterate from the oldest sub-level for L0 => Lbase and youngest + // sub-level for intra-L0. The idea here is that anything that can't + // be included from that level constrains what can be included from + // the next level. This change in constraint is directly incorporated + // into minIntervalIndex, maxIntervalIndex. + for sl := startLevel; !limitReached(sl); sl += increment { + files := s.Files[sl] + // Find the first file that overlaps with minIntervalIndex. + index := sort.Search(len(files), func(i int) bool { + return minIntervalIndex <= files[i].maxIntervalIndex + }) + // Track the files that are fully within the current constraint + // of [minIntervalIndex, maxIntervalIndex]. + firstIndex := -1 + lastIndex := -1 + for ; index < len(files); index++ { + f := files[index] + if f.minIntervalIndex > maxIntervalIndex { + break + } + include := true + // Extends out on the left so can't be included. This narrows + // what we can included in the next level. + if f.minIntervalIndex < minIntervalIndex { + include = false + minIntervalIndex = f.maxIntervalIndex + 1 + } + // Extends out on the right so can't be included. + if f.maxIntervalIndex > maxIntervalIndex { + include = false + maxIntervalIndex = f.minIntervalIndex - 1 + } + if !include { + continue + } + if firstIndex == -1 { + firstIndex = index + } + lastIndex = index + } + if minIntervalIndex > maxIntervalIndex { + // We excluded files that prevent continuation. + break + } + if firstIndex < 0 { + // No files to add in this sub-level. + continue + } + // We have the files in [firstIndex, lastIndex] as potential for + // inclusion. Some of these may already have been picked. Some + // of them may be already compacting. The latter is tricky since + // we have to decide whether to contract minIntervalIndex or + // maxIntervalIndex when we encounter an already compacting file. 
+ // We pick the longest sequence between firstIndex + // and lastIndex of non-compacting files -- this is represented by + // [candidateNonCompactingFirst, candidateNonCompactingLast]. + nonCompactingFirst := -1 + currentRunHasAlreadyPickedFiles := false + candidateNonCompactingFirst := -1 + candidateNonCompactingLast := -1 + candidateHasAlreadyPickedFiles := false + for index = firstIndex; index <= lastIndex; index++ { + f := files[index] + if f.Compacting { + if nonCompactingFirst != -1 { + last := index - 1 + // Prioritize runs of consecutive non-compacting files that + // have files that have already been picked. That is to say, + // if candidateHasAlreadyPickedFiles == true, we stick with + // it, and if currentRunHasAlreadyPickedfiles == true, we + // pick that run even if it contains fewer files than the + // previous candidate. + if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 || + currentRunHasAlreadyPickedFiles || + (last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) { + candidateNonCompactingFirst = nonCompactingFirst + candidateNonCompactingLast = last + candidateHasAlreadyPickedFiles = currentRunHasAlreadyPickedFiles + } + } + nonCompactingFirst = -1 + currentRunHasAlreadyPickedFiles = false + continue + } + if nonCompactingFirst == -1 { + nonCompactingFirst = index + } + if candidate.FilesIncluded[f.l0Index] { + currentRunHasAlreadyPickedFiles = true + } + } + // Logic duplicated from inside the for loop above. + if nonCompactingFirst != -1 { + last := index - 1 + if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 || + currentRunHasAlreadyPickedFiles || + (last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) { + candidateNonCompactingFirst = nonCompactingFirst + candidateNonCompactingLast = last + candidateHasAlreadyPickedFiles = currentRunHasAlreadyPickedFiles + } + } + if candidateNonCompactingFirst == -1 { + // All files are compacting. 
There will be gaps that we could exploit + // to continue, but don't bother. + break + } + // May need to shrink [minIntervalIndex, maxIntervalIndex] for the next level. + if candidateNonCompactingFirst > firstIndex { + minIntervalIndex = files[candidateNonCompactingFirst-1].maxIntervalIndex + 1 + } + if candidateNonCompactingLast < lastIndex { + maxIntervalIndex = files[candidateNonCompactingLast+1].minIntervalIndex - 1 + } + for index := candidateNonCompactingFirst; index <= candidateNonCompactingLast; index++ { + f := files[index] + if f.Compacting { + panic(fmt.Sprintf("expected %s to not be compacting", f.FileNum)) + } + if candidate.isIntraL0 && f.LargestSeqNum >= candidate.earliestUnflushedSeqNum { + continue + } + if !candidate.FilesIncluded[f.l0Index] { + addedCount++ + candidate.addFile(f) + } + } + } + return addedCount > 0 +} diff --git a/internal/manifest/l0_sublevels_test.go b/internal/manifest/l0_sublevels_test.go index 5723d12b7b..955caed294 100644 --- a/internal/manifest/l0_sublevels_test.go +++ b/internal/manifest/l0_sublevels_test.go @@ -7,7 +7,9 @@ package manifest import ( "fmt" "io" + "math" "os" + "sort" "strconv" "strings" "testing" @@ -15,6 +17,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/datadriven" "github.com/cockroachdb/pebble/internal/record" + "github.com/stretchr/testify/require" ) func readManifest(filename string) (*Version, error) { @@ -47,6 +50,37 @@ func readManifest(filename string) (*Version, error) { return v, nil } +func TestL0SubLevels_LargeImportL0(t *testing.T) { + v, err := readManifest("testdata/MANIFEST_import") + require.NoError(t, err) + + subLevels, err := NewL0SubLevels(v.Files[0], base.DefaultComparer.Compare, base.DefaultFormatter, 5<<20) + require.NoError(t, err) + fmt.Printf("L0SubLevels:\n%s\n\n", subLevels) + + for i := 0; ; i++ { + c, err := subLevels.PickBaseCompaction(2, nil) + require.NoError(t, err) + if c == nil { + break + } + fmt.Printf("%d: base 
compaction: filecount: %d, bytes: %d, interval: [%d, %d], seed depth: %d\n", + i, len(c.Files), c.fileBytes, c.minIntervalIndex, c.maxIntervalIndex, c.seedIntervalStackDepthReduction) + require.NoError(t, subLevels.UpdateStateForStartedCompaction(c, true)) + } + + for i := 0; ; i++ { + c, err := subLevels.PickIntraL0Compaction(math.MaxUint64, 2) + require.NoError(t, err) + if c == nil { + break + } + fmt.Printf("%d: intra-L0 compaction: filecount: %d, bytes: %d, interval: [%d, %d], seed depth: %d\n", + i, len(c.Files), c.fileBytes, c.minIntervalIndex, c.maxIntervalIndex, c.seedIntervalStackDepthReduction) + require.NoError(t, subLevels.UpdateStateForStartedCompaction(c, false)) + } +} + func TestL0SubLevels(t *testing.T) { parseMeta := func(s string) (*FileMetadata, error) { parts := strings.Split(s, ":") @@ -101,6 +135,7 @@ func TestL0SubLevels(t *testing.T) { baseLevel := NumLevels - 1 datadriven.RunTest(t, "testdata/l0_sublevels", func(td *datadriven.TestData) string { + pickBaseCompaction := false switch td.Cmd { case "define": fileMetas = [NumLevels][]*FileMetadata{} @@ -185,6 +220,71 @@ func TestL0SubLevels(t *testing.T) { var builder strings.Builder builder.WriteString(sublevels.describe(true)) + return builder.String() + case "pick-base-compaction": + pickBaseCompaction = true + fallthrough + case "pick-intra-l0-compaction": + minCompactionDepth := 3 + earliestUnflushedSeqNum := uint64(math.MaxUint64) + for _, arg := range td.CmdArgs { + switch arg.Key { + case "min_depth": + minCompactionDepth, err = strconv.Atoi(arg.Vals[0]) + if err != nil { + t.Fatal(err) + } + case "earliest_unflushed_seqnum": + eusnInt, err := strconv.Atoi(arg.Vals[0]) + if err != nil { + t.Fatal(err) + } + earliestUnflushedSeqNum = uint64(eusnInt) + } + } + + var lcf *Level0CompactionFiles + if pickBaseCompaction { + lcf, err = sublevels.PickBaseCompaction(minCompactionDepth, fileMetas[baseLevel]) + if err == nil && lcf != nil { + // Try to extend the base compaction into a more 
rectangular + // shape, using the smallest/largest keys of overlapping + // base files. This mimics the logic the compactor is + // expected to implement. + baseFiles := fileMetas[baseLevel] + firstFile := sort.Search(len(baseFiles), func(i int) bool { + return sublevels.cmp(baseFiles[i].Largest.UserKey, sublevels.orderedIntervals[lcf.minIntervalIndex].startKey.key) >= 0 + }) + lastFile := sort.Search(len(baseFiles), func(i int) bool { + return sublevels.cmp(baseFiles[i].Smallest.UserKey, sublevels.orderedIntervals[lcf.maxIntervalIndex+1].startKey.key) >= 0 + }) + lastFile-- + sublevels.ExtendL0ForBaseCompactionTo( + baseFiles[firstFile].Smallest.UserKey, + baseFiles[lastFile].Largest.UserKey, + lcf) + } + } else { + lcf, err = sublevels.PickIntraL0Compaction(earliestUnflushedSeqNum, minCompactionDepth) + } + if err != nil { + return fmt.Sprintf("error: %s", err.Error()) + } + if lcf == nil { + return "no compaction picked" + } + var builder strings.Builder + builder.WriteString(fmt.Sprintf("compaction picked with stack depth reduction %d\n", lcf.seedIntervalStackDepthReduction)) + for i, file := range lcf.Files { + builder.WriteString(file.FileNum.String()) + if i < len(lcf.Files) - 1 { + builder.WriteByte(',') + } + } + startKey := sublevels.orderedIntervals[lcf.seedInterval].startKey + endKey := sublevels.orderedIntervals[lcf.seedInterval+1].startKey + builder.WriteString(fmt.Sprintf("\nseed interval: %s-%s", startKey.key, endKey.key)) + return builder.String() case "read-amp": return strconv.Itoa(sublevels.ReadAmplification()) @@ -208,6 +308,35 @@ func TestL0SubLevels(t *testing.T) { } } return "OK" + case "update-state-for-compaction": + var fileNums []base.FileNum + for _, arg := range td.CmdArgs { + switch arg.Key { + case "files": + for _, val := range arg.Vals { + fileNum, err := strconv.ParseUint(val, 10, 64) + if err != nil { + return err.Error() + } + fileNums = append(fileNums, base.FileNum(fileNum)) + } + } + } + files := make([]*FileMetadata, 0, 
len(fileNums)) + for _, num := range fileNums { + for _, f := range fileMetas[0] { + if f.FileNum == num { + files = append(files, f) + break + } + } + } + if err := sublevels.UpdateStateForManualCompaction(files); err != nil { + return err.Error() + } + return "OK" + case "describe": + return sublevels.describe(true) } return fmt.Sprintf("unrecognized command: %s", td.Cmd) }) @@ -221,10 +350,29 @@ func BenchmarkL0SubLevelsInit(b *testing.B) { b.ResetTimer() for n := 0; n < b.N; n++ { sl, err := NewL0SubLevels(v.Files[0], base.DefaultComparer.Compare, base.DefaultFormatter, 5<<20) - if err != nil { - b.Fatal(err) - } else if sl == nil { + require.NoError(b, err) + if sl == nil { b.Fatal("expected non-nil L0SubLevels to be generated") } } } + +func BenchmarkL0SubLevelsInitAndPick(b *testing.B) { + v, err := readManifest("testdata/MANIFEST_import") + if err != nil { + b.Fatal(err) + } + b.ResetTimer() + for n := 0; n < b.N; n++ { + sl, err := NewL0SubLevels(v.Files[0], base.DefaultComparer.Compare, base.DefaultFormatter, 5<<20) + require.NoError(b, err) + if sl == nil { + b.Fatal("expected non-nil L0SubLevels to be generated") + } + c, err := sl.PickBaseCompaction(2, nil) + require.NoError(b, err) + if c == nil { + b.Fatal("expected non-nil compaction to be generated") + } + } +} diff --git a/internal/manifest/testdata/l0_sublevels b/internal/manifest/testdata/l0_sublevels index eb4b93d502..90f604004d 100644 --- a/internal/manifest/testdata/l0_sublevels +++ b/internal/manifest/testdata/l0_sublevels @@ -89,8 +89,83 @@ max-depth-after-ongoing-compactions ---- 5 +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 5 +000006,000003,000005,000009,000010,000001,000002 +seed interval: f-f + +# SSTables 0001 and 0002 are optional additions to the above compaction, as they +# overlap with base files that overlap with L0 files in the seed interval. +# Marking 0002 as compacting should be enough to exclude both from the +# chosen compaction. 
+ +define +L0 + 0001:a.SET.2-b.SET.3 + 0002:c.SET.3-d.SET.5 intra_l0_compacting + 0003:e.SET.5-f.SET.7 + 0005:f.SET.6-h.SET.9 + 0006:f.SET.4-g.SET.5 + 0009:f.SET.10-i.SET.10 + 0010:f.SET.11-g.SET.11 +L6 + 0007:a.SET.0-f.SET.0 + 0008:g.SET.0-z.SET.0 +---- +file count: 7, sublevels: 5, intervals: 10 +flush split keys(5): [b, d, f, g, h] +0.4: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [5, 6] + 000010:f#11,1-g#11,1 +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [5, 8] + 000009:f#10,1-i#10,1 +0.2: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [5, 7] + 000005:f#6,1-h#9,1 +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [4, 5] + 000003:e#5,1-f#7,1 +0.0: file count: 3, bytes: 768, width (mean, max): 1.3, 2, interval range: [0, 6] + 000001:a#2,1-b#3,1 + 000002:c#3,1-d#5,1 + 000006:f#4,1-g#5,1 +compacting file count: 1, base compacting intervals: + + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 5 +000006,000003,000005,000009,000010 +seed interval: f-f + +# Mark the above compaction as started. 
+ +update-state-for-compaction files=(000006,000003,000005,000009,000010) +---- +OK + +describe +---- +file count: 7, sublevels: 5, intervals: 10 +flush split keys(5): [b, d, f, g, h] +0.4: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [5, 6] + 000010:f#11,1-g#11,1 +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [5, 8] + 000009:f#10,1-i#10,1 +0.2: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [5, 7] + 000005:f#6,1-h#9,1 +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [4, 5] + 000003:e#5,1-f#7,1 +0.0: file count: 3, bytes: 768, width (mean, max): 1.3, 2, interval range: [0, 6] + 000001:a#2,1-b#3,1 + 000002:c#3,1-d#5,1 + 000006:f#4,1-g#5,1 +compacting file count: 6, base compacting intervals: [4, 9], + +pick-base-compaction min_depth=3 +---- +no compaction picked + # Extend one of the SSTables (0009) to the right, and place an SSTable "under" -# the extension (0011). +# the extension (0011). This adds it to the compaction. define L0 @@ -116,8 +191,94 @@ flush split keys(3): [g, h, p] 000011:n#8,1-p#10,1 compacting file count: 0, base compacting intervals: -# Assume a base compaction from the above files is chosen. This should reduce -# max-depth-after-ongoing-compactions. +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000006,000005,000009,000011,000010 +seed interval: f-g + +# Set SSTable 0011 which is "under" SSTable 0009 to IsBaseCompacting = true. +# This should prevent SSTable 0009 from participating in a base compaction. 
+ +define +L0 + 0005:f.SET.6-h.SET.9 + 0006:f.SET.4-g.SET.5 + 0009:f.SET.10-p.SET.10 + 0010:f.SET.11-g.SET.11 + 0011:n.SET.8-p.SET.10 base_compacting +L6 + 0007:a.SET.0-f.SET.0 + 0008:g.SET.0-z.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(3): [g, h, p] +0.3: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:f#11,1-g#11,1 +0.2: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:f#10,1-p#10,1 +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:f#6,1-h#9,1 +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:f#4,1-g#5,1 + 000011:n#8,1-p#10,1 +compacting file count: 1, base compacting intervals: [3, 4], + +pick-base-compaction min_depth=3 +---- +no compaction picked + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000010,000009,000005,000006 +seed interval: f-g + +# Raise 00009 to a higher level, so that there's still a stack depth of 3 below +# it. This should make f-g a candidate for base compaction again. 
+ +define +L0 + 0005:f.SET.6-h.SET.9 + 0006:f.SET.4-g.SET.5 + 0009:f.SET.12-p.SET.12 + 0010:f.SET.11-g.SET.11 + 0011:n.SET.8-p.SET.10 base_compacting +L6 + 0007:a.SET.0-f.SET.0 + 0008:g.SET.0-z.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(3): [g, h, p] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:f#12,1-p#12,1 +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:f#11,1-g#11,1 +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:f#6,1-h#9,1 +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:f#4,1-g#5,1 + 000011:n#8,1-p#10,1 +compacting file count: 1, base compacting intervals: [3, 4], + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 3 +000006,000005,000010 +seed interval: f-g + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006 +seed interval: f-g + +max-depth-after-ongoing-compactions +---- +4 + +# Assume the above base compaction is chosen. This should reduce max depth after +# ongoing compactions. define L0 @@ -143,14 +304,291 @@ flush split keys(3): [g, h, p] 000011:n#8,1-p#10,1 compacting file count: 4, base compacting intervals: [0, 1], [3, 4], +pick-base-compaction min_depth=3 +---- +no compaction picked + +pick-intra-l0-compaction min_depth=3 +---- +no compaction picked + max-depth-after-ongoing-compactions ---- 1 +# Ensure that when 0011 is not base compacting, it's chosen for compactions +# along with 0009. 
+ +define +L0 + 0005:f.SET.6-h.SET.9 + 0006:f.SET.4-g.SET.5 + 0009:f.SET.12-p.SET.12 + 0010:f.SET.11-g.SET.11 + 0011:n.SET.8-p.SET.10 +L6 + 0007:a.SET.0-f.SET.0 + 0008:g.SET.0-z.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(3): [g, h, p] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:f#12,1-p#12,1 +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:f#11,1-g#11,1 +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:f#6,1-h#9,1 +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:f#4,1-g#5,1 + 000011:n#8,1-p#10,1 +compacting file count: 0, base compacting intervals: + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000006,000005,000010,000009,000011 +seed interval: f-g + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006,000011 +seed interval: f-g + +# Don't pick a base compaction if the overlapping Lbase files are marked as +# compacting. 
+ +define +L0 + 0005:f.SET.6-h.SET.9 + 0006:f.SET.4-g.SET.5 + 0009:f.SET.12-p.SET.12 + 0010:f.SET.11-g.SET.11 + 0011:n.SET.8-p.SET.10 +L6 + 0007:a.SET.0-f.SET.0 + 0008:g.SET.0-z.SET.0 compacting +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(3): [g, h, p] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:f#12,1-p#12,1 +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:f#11,1-g#11,1 +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:f#6,1-h#9,1 +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:f#4,1-g#5,1 + 000011:n#8,1-p#10,1 +compacting file count: 0, base compacting intervals: + +pick-base-compaction min_depth=3 +---- +no compaction picked + +# Greatly increase the size of SSTable 0009, past 100 << 20. This should make +# it no longer a candidate for base compaction. + +define +L0 + 0005:f.SET.6-h.SET.9 + 0006:f.SET.4-g.SET.5 + 0009:f.SET.12-p.SET.12 size=104859600 + 0010:f.SET.11-g.SET.11 + 0011:n.SET.8-p.SET.10 +L6 + 0007:a.SET.0-f.SET.0 + 0008:g.SET.0-z.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(4): [g, h, n, p] +0.3: file count: 1, bytes: 104859600, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:f#12,1-p#12,1 +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:f#11,1-g#11,1 +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:f#6,1-h#9,1 +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:f#4,1-g#5,1 + 000011:n#8,1-p#10,1 +compacting file count: 0, base compacting intervals: + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 3 +000006,000005,000010,000011 +seed interval: f-g + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 
+000009,000010,000005,000006,000011 +seed interval: f-g + +# However, when the size increase is applied to a lower sublevel that is +# necessary to include to meet the minimum stack depth reduction, we overlook +# the size difference and choose the file for compaction anyway. + +define +L0 + 0005:f.SET.6-h.SET.9 + 0006:f.SET.4-g.SET.5 + 0009:f.SET.12-p.SET.12 + 0010:f.SET.11-g.SET.11 size=104859600 + 0011:n.SET.8-p.SET.10 +L6 + 0007:a.SET.0-f.SET.0 + 0008:g.SET.0-z.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(3): [g, h, p] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:f#12,1-p#12,1 +0.2: file count: 1, bytes: 104859600, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:f#11,1-g#11,1 +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:f#6,1-h#9,1 +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:f#4,1-g#5,1 + 000011:n#8,1-p#10,1 +compacting file count: 0, base compacting intervals: + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000006,000005,000010,000009,000011 +seed interval: f-g + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006,000011 +seed interval: f-g + read-amp ---- 4 +# In L0.0, SST 0007 is marked as base compacting. There are two SSTs to the left +# of it in the sublevel, and one to its right. The ones to its left should be +# chosen by extendCandidateToRectangle. 
+ +define +L0 + 0004:h.SET.2-j.SET.4 + 0005:f.SET.6-h.SET.9 + 0006:f.SET.4-g.SET.5 + 0007:k.SET.2-l.SET.4 base_compacting + 0009:f.SET.12-p.SET.12 + 0010:f.SET.11-g.SET.11 + 0011:n.SET.8-p.SET.10 +L6 + 0007:a.SET.0-f.SET.0 + 0008:g.SET.0-z.SET.0 +---- +file count: 7, sublevels: 4, intervals: 9 +flush split keys(5): [g, h, j, l, p] +0.3: file count: 1, bytes: 256, width (mean, max): 8.0, 8, interval range: [0, 7] + 000009:f#12,1-p#12,1 +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:f#11,1-g#11,1 +0.1: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [0, 2] + 000005:f#6,1-h#9,1 +0.0: file count: 4, bytes: 1024, width (mean, max): 1.2, 2, interval range: [0, 7] + 000006:f#4,1-g#5,1 + 000004:h#2,1-j#4,1 + 000007:k#2,1-l#4,1 + 000011:n#8,1-p#10,1 +compacting file count: 1, base compacting intervals: [5, 5], + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006,000004 +seed interval: f-g + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 3 +000006,000005,000004,000010 +seed interval: f-g + + +# Now shift the base_compacting marker one SST to the left. But since file 6 +# was already chosen as part of the seed compaction construction, we still +# prefer to choose it over files 7 and 11. 
+ +define +L0 + 0004:h.SET.2-j.SET.4 base_compacting + 0005:f.SET.6-h.SET.9 + 0006:f.SET.4-g.SET.5 + 0007:k.SET.2-l.SET.4 + 0009:f.SET.12-p.SET.12 + 0010:f.SET.11-g.SET.11 + 0011:n.SET.8-p.SET.10 +L6 + 0007:a.SET.0-f.SET.0 + 0008:g.SET.0-z.SET.0 +---- +file count: 7, sublevels: 4, intervals: 9 +flush split keys(5): [g, h, j, l, p] +0.3: file count: 1, bytes: 256, width (mean, max): 8.0, 8, interval range: [0, 7] + 000009:f#12,1-p#12,1 +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:f#11,1-g#11,1 +0.1: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [0, 2] + 000005:f#6,1-h#9,1 +0.0: file count: 4, bytes: 1024, width (mean, max): 1.2, 2, interval range: [0, 7] + 000006:f#4,1-g#5,1 + 000004:h#2,1-j#4,1 + 000007:k#2,1-l#4,1 + 000011:n#8,1-p#10,1 +compacting file count: 1, base compacting intervals: [2, 3], + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006 +seed interval: f-g + +# Without any base_compacting markers, all SSTs in the bottom sublevel should +# be chosen for an intra-L0 compaction. 
+ +define +L0 + 0004:h.SET.2-j.SET.4 + 0005:f.SET.6-h.SET.9 + 0006:f.SET.4-g.SET.5 + 0007:k.SET.2-l.SET.4 + 0009:f.SET.12-p.SET.12 + 0010:f.SET.11-g.SET.11 + 0011:n.SET.8-p.SET.10 +L6 + 0007:a.SET.0-f.SET.0 + 0008:g.SET.0-z.SET.0 +---- +file count: 7, sublevels: 4, intervals: 9 +flush split keys(5): [g, h, j, l, p] +0.3: file count: 1, bytes: 256, width (mean, max): 8.0, 8, interval range: [0, 7] + 000009:f#12,1-p#12,1 +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:f#11,1-g#11,1 +0.1: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [0, 2] + 000005:f#6,1-h#9,1 +0.0: file count: 4, bytes: 1024, width (mean, max): 1.2, 2, interval range: [0, 7] + 000006:f#4,1-g#5,1 + 000004:h#2,1-j#4,1 + 000007:k#2,1-l#4,1 + 000011:n#8,1-p#10,1 +compacting file count: 0, base compacting intervals: + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006,000004,000007,000011 +seed interval: f-g + define flush_split_max_bytes=32 L0 0001:a.SET.2-e.SET.5 size=64