From 1c7a3744a227130729d097cbe88e97b35da04a69 Mon Sep 17 00:00:00 2001 From: Bilal Akhtar Date: Wed, 31 May 2023 17:04:25 -0400 Subject: [PATCH] db: add ingest-time splitting of ssts into virtual ones Currently, if we identify boundary overlap in a level during ingest target level calculation, but no data overlap, we are forced to find a target level above the file we saw the overlap with (if we can't fall below it, such as if the existing file is in L6, which happens commonly). This change takes advantage of virtual sstables to split existing sstables into two virtual sstables when an ingested sstable would be able to go into the same level had the sstables been split that way to begin with. Doing this split reduces a lot of write-amp as it avoids us from having to compact the newly-ingested sstable with the sstable it boundary-overlapped with. Fixes #1683. --- compaction.go | 54 +++++- data_test.go | 5 +- ingest.go | 310 ++++++++++++++++++++++++++++++----- ingest_test.go | 34 +++- options.go | 5 + testdata/ingest | 269 ++++++++++++++++++++++++++++++ testdata/ingest_target_level | 76 +++++++++ 7 files changed, 701 insertions(+), 52 deletions(-) diff --git a/compaction.go b/compaction.go index 8382c599f7..530e160b88 100644 --- a/compaction.go +++ b/compaction.go @@ -1905,15 +1905,27 @@ func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) { ve := &versionEdit{} var level int var err error + var fileToSplit *fileMetadata + var ingestSplitFiles []ingestSplitFile for _, file := range c.flushing[0].flushable.(*ingestedFlushable).files { - level, err = ingestTargetLevel( + suggestSplit := d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit() && + d.FormatMajorVersion() >= FormatVirtualSSTables + level, fileToSplit, err = ingestTargetLevel( d.newIters, d.tableNewRangeKeyIter, iterOpts, d.cmp, c.version, baseLevel, d.mu.compact.inProgress, file.FileMetadata, + suggestSplit, ) if err != nil { return nil, err } ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: file.FileMetadata}) + if fileToSplit != nil { + ingestSplitFiles = append(ingestSplitFiles, ingestSplitFile{ + ingestFile: file.FileMetadata, + splitFile: fileToSplit, + level: level, + }) + } levelMetrics := c.metrics[level] if levelMetrics == nil { levelMetrics = &LevelMetrics{} @@ -1923,6 +1935,28 @@ func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) { levelMetrics.TablesIngested++ } + updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) { + levelMetrics := c.metrics[level] + if levelMetrics == nil { + levelMetrics = &LevelMetrics{} + c.metrics[level] = levelMetrics + } + levelMetrics.NumFiles-- + levelMetrics.Size -= int64(m.Size) + for i := range added { + levelMetrics.NumFiles++ + levelMetrics.Size += int64(added[i].Meta.Size) + } + } + + if len(ingestSplitFiles) > 0 { + ve.DeletedFiles = make(map[manifest.DeletedFileEntry]*manifest.FileMetadata) + replacedFiles := make(map[base.FileNum][]newFileEntry) + if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, ingestSplitFiles, replacedFiles); err != nil { + return nil, err + } + } + return ve, nil } @@ -2093,6 +2127,24 @@ func (d *DB) flush1() (bytesFlushed uint64, err error) { metrics.BytesIn += d.mu.mem.queue[i].logSize } } + } else if len(ve.DeletedFiles) > 0 { + // c.kind == compactionKindIngestedFlushable && we have deleted files due + // to ingest-time splits. + // + // Iterate through all other compactions, and check if their inputs have + // been replaced due to an ingest-time split. In that case, cancel the + // compaction. + for c2 := range d.mu.compact.inProgress { + for i := range c2.inputs { + iter := c2.inputs[i].files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if _, ok := ve.DeletedFiles[deletedFileEntry{FileNum: f.FileNum, Level: c2.inputs[i].level}]; ok { + c2.cancel.Store(true) + break + } + } + } + } } err = d.mu.versions.logAndApply(jobID, ve, c.metrics, false, /* forceRotation */ func() []compactionInfo { return d.getInProgressCompactionInfoLocked(c) }) diff --git a/data_test.go b/data_test.go index e3a1e13dc6..3b60c0e635 100644 --- a/data_test.go +++ b/data_test.go @@ -1356,8 +1356,9 @@ func runForceIngestCmd(td *datadriven.TestData, d *DB) error { int, map[*compaction]struct{}, *fileMetadata, - ) (int, error) { - return level, nil + bool, + ) (int, *fileMetadata, error) { + return level, nil, nil }, nil /* shared */, KeyRange{}, nil /* external */) return err } diff --git a/ingest.go b/ingest.go index e650ac7702..095f8eb815 100644 --- a/ingest.go +++ b/ingest.go @@ -797,6 +797,10 @@ func overlapWithIterator( return computeOverlapWithSpans(*rangeDelIter) } +// ingestTargetLevel returns the target level for a file being ingested. +// If suggestSplit is true, it accounts for ingest-time splitting as part of +// its target level calculation, and if a split candidate is found, that file +// is returned as the splitFile. func ingestTargetLevel( newIters tableNewIters, newRangeKeyIter keyspan.TableNewSpanIter, @@ -806,7 +810,8 @@ func ingestTargetLevel( baseLevel int, compactions map[*compaction]struct{}, meta *fileMetadata, -) (int, error) { + suggestSplit bool, +) (targetLevel int, splitFile *fileMetadata, err error) { // Find the lowest level which does not have any files which overlap meta. We // search from L0 to L6 looking for whether there are any files in the level // which overlap meta. We want the "lowest" level (where lower means @@ -821,6 +826,14 @@ func ingestTargetLevel( // violate the sequence number invariant. // - no file boundary overlap with level i, since that will violate the // invariant that files do not overlap in levels i > 0. + // - if there is only a file overlap at a given level, and no data overlap, + // we can still slot a file at that level. We return the fileMetadata with + // which we have file boundary overlap (must be only one file, as sstable + // bounds are usually tight on user keys) and the caller is expected to split + // that sstable into two virtual sstables, allowing this file to go into that + // level. Note that if we have file boundary overlap with two files, which + // should only happen on rare occasions, we treat it as data overlap and + // don't use this optimization. // // The file boundary overlap check is simpler to conceptualize. Consider the // following example, in which the ingested file lies completely before or @@ -865,12 +878,10 @@ func ingestTargetLevel( // existing point that falls within the ingested table bounds as being "data // overlap". - targetLevel := 0 - // This assertion implicitly checks that we have the current version of // the metadata. if v.L0Sublevels == nil { - return 0, errors.AssertionFailedf("could not read L0 sublevels") + return 0, nil, errors.AssertionFailedf("could not read L0 sublevels") } // Check for overlap over the keys of L0 by iterating over the sublevels. for subLevel := 0; subLevel < len(v.L0SublevelFiles); subLevel++ { @@ -896,10 +907,10 @@ func ingestTargetLevel( err := iter.Close() // Closes range del iter as well. err = firstError(err, levelIter.Close()) if err != nil { - return 0, err + return 0, nil, err } if overlap { - return targetLevel, nil + return targetLevel, nil, nil } } @@ -926,26 +937,47 @@ func ingestTargetLevel( err := levelIter.Close() // Closes range del iter as well. err = firstError(err, rkeyLevelIter.Close()) if err != nil { - return 0, err + return 0, nil, err } if overlap { - return targetLevel, nil + return targetLevel, splitFile, nil } // Check boundary overlap. + var candidateSplitFile *fileMetadata boundaryOverlaps := v.Overlaps(level, cmp, meta.Smallest.UserKey, meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel()) if !boundaryOverlaps.Empty() { - continue + // We are already guaranteed to not have any data overlaps with files + // in boundaryOverlaps, otherwise we'd have returned in the above if + // statements. Use this, plus boundaryOverlaps.Len() == 1 to detect for + // the case where we can slot this file into the current level despite + // a boundary overlap, by splitting one existing file into two virtual + // sstables. + if suggestSplit && boundaryOverlaps.Len() == 1 { + iter := boundaryOverlaps.Iter() + candidateSplitFile = iter.First() + } else { + // We either don't want to suggest ingest-time splits (i.e. + // !suggestSplit), or we boundary-overlapped with more than one file. + continue + } } - // Check boundary overlap with any ongoing compactions. + // Check boundary overlap with any ongoing compactions. We consider an + // overlapping compaction that's writing files to an output level as + // equivalent to boundary overlap with files in that output level. + // + // We cannot check for data overlap with the new SSTs compaction will produce + // since compaction hasn't been done yet. However, there's no need to check + // since all keys in them will be from levels in [c.startLevel, + // c.outputLevel], and all those levels have already had their data overlap + // tested negative (else we'd have returned earlier). // - // We cannot check for data overlap with the new SSTs compaction will - // produce since compaction hasn't been done yet. However, there's no need - // to check since all keys in them will either be from c.startLevel or - // c.outputLevel, both levels having their data overlap already tested - // negative (else we'd have returned earlier). + // An alternative approach would be to cancel these compactions and proceed + // with an ingest-time split on this level if necessary. However, compaction + // cancellation can result in significant wasted effort and is best avoided + // unless necessary. overlaps := false for c := range compactions { if c.outputLevel == nil || level != c.outputLevel.level { @@ -959,9 +991,10 @@ func ingestTargetLevel( } if !overlaps { targetLevel = level + splitFile = candidateSplitFile } } - return targetLevel, nil + return targetLevel, splitFile, nil } // Ingest ingests a set of sstables into the DB. Ingestion of the files is @@ -1820,7 +1853,124 @@ type ingestTargetLevelFunc func( baseLevel int, compactions map[*compaction]struct{}, meta *fileMetadata, -) (int, error) + suggestSplit bool, +) (int, *fileMetadata, error) + +type ingestSplitFile struct { + // ingestFile is the file being ingested. + ingestFile *fileMetadata + // splitFile is the file that needs to be split to allow ingestFile to slot + // into `level` level. + splitFile *fileMetadata + // The level where ingestFile will go (and where splitFile already is). + level int +} + +// ingestSplit splits files specified in `files` and updates ve in-place to +// account for existing files getting split into two virtual sstables. The map +// `replacedFiles` contains an in-progress map of all files that have been +// replaced with new virtual sstables in this version edit so far, which is also +// updated in-place. +// +// d.mu as well as the manifest lock must be held when calling this method. +func (d *DB) ingestSplit( + ve *versionEdit, + updateMetrics func(*fileMetadata, int, []newFileEntry), + files []ingestSplitFile, + replacedFiles map[base.FileNum][]newFileEntry, +) error { + for _, s := range files { + // replacedFiles can be thought of as a tree, where we start iterating with + // s.splitFile and run its fileNum through replacedFiles, then find which of + // the replaced files overlaps with s.ingestFile, which becomes the new + // splitFile, then we check splitFile's replacements in replacedFiles again + // for overlap with s.ingestFile, and so on until we either can't find the + // current splitFile in replacedFiles (i.e. that's the file that now needs to + // be split), or we don't find a file that overlaps with s.ingestFile, which + // means a prior ingest split already produced enough room for s.ingestFile + // to go into this level without necessitating another ingest split. + splitFile := s.splitFile + for splitFile != nil { + replaced, ok := replacedFiles[splitFile.FileNum] + if !ok { + break + } + updatedSplitFile := false + for i := range replaced { + if replaced[i].Meta.Overlaps(d.cmp, s.ingestFile.Smallest.UserKey, s.ingestFile.Largest.UserKey, s.ingestFile.Largest.IsExclusiveSentinel()) { + if updatedSplitFile { + // This should never happen because the earlier ingestTargetLevel + // function only finds split file candidates that are guaranteed to + // have no data overlap, only boundary overlap. See the comments + // in that method to see the definitions of data vs boundary + // overlap. That, plus the fact that files in `replaced` are + // guaranteed to have file bounds that are tight on user keys + // (as that's what `d.excise` produces), means that the only case + // where we overlap with two or more files in `replaced` is if we + // actually had data overlap all along, or if the ingestion files + // were overlapping, either of which is an invariant violation. + panic("updated with two files in ingestSplit") + } + splitFile = replaced[i].Meta + updatedSplitFile = true + } + } + if !updatedSplitFile { + // None of the replaced files overlapped with the file being ingested. + // This can happen if we've already excised a span overlapping with + // this file, or if we have consecutive ingested files that can slide + // within the same gap between keys in an existing file. For instance, + // if an existing file has keys a and g and we're ingesting b-c, d-e, + // the first loop iteration will split the existing file into one that + // ends in a and another that starts at g, and the second iteration will + // fall into this case and require no splitting. + // + // No splitting necessary. + splitFile = nil + } + } + if splitFile == nil { + continue + } + // NB: excise operates on [start, end). We're splitting at [start, end] + // (assuming !s.ingestFile.Largest.IsExclusiveSentinel()). The conflation + // of exclusive vs inclusive end bounds should not make a difference here + // as we're guaranteed to not have any data overlap between splitFile and + // s.ingestFile, so panic if we do see a newly added file with an endKey + // equalling s.ingestFile.Largest, and !s.ingestFile.Largest.IsExclusiveSentinel() + added, err := d.excise(KeyRange{Start: s.ingestFile.Smallest.UserKey, End: s.ingestFile.Largest.UserKey}, splitFile, ve, s.level) + if err != nil { + return err + } + if _, ok := ve.DeletedFiles[deletedFileEntry{ + Level: s.level, + FileNum: splitFile.FileNum, + }]; !ok { + panic("did not split file that was expected to be split") + } + replacedFiles[splitFile.FileNum] = added + for i := range added { + if s.ingestFile.Overlaps(d.cmp, added[i].Meta.Smallest.UserKey, added[i].Meta.Largest.UserKey, added[i].Meta.Largest.IsExclusiveSentinel()) { + panic("ingest-time split produced a file that overlaps with ingested file") + } + } + updateMetrics(splitFile, s.level, added) + } + // Flatten the version edit by removing any entries from ve.NewFiles that + // are also in ve.DeletedFiles. + newNewFiles := ve.NewFiles[:0] + for i := range ve.NewFiles { + fn := ve.NewFiles[i].Meta.FileNum + deEntry := deletedFileEntry{Level: ve.NewFiles[i].Level, FileNum: fn} + if _, ok := ve.DeletedFiles[deEntry]; ok { + delete(ve.DeletedFiles, deEntry) + } else { + newNewFiles = append(newNewFiles, ve.NewFiles[i]) + } + } + ve.NewFiles = newNewFiles + return nil +} func (d *DB) ingestApply( jobID int, @@ -1835,7 +1985,7 @@ func (d *DB) ingestApply( ve := &versionEdit{ NewFiles: make([]newFileEntry, lr.fileCount), } - if exciseSpan.Valid() { + if exciseSpan.Valid() || (d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit()) { ve.DeletedFiles = map[manifest.DeletedFileEntry]*manifest.FileMetadata{} } metrics := make(map[int]*LevelMetrics) @@ -1860,9 +2010,17 @@ func (d *DB) ingestApply( } } + shouldIngestSplit := d.opts.Experimental.IngestSplit != nil && + d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables current := d.mu.versions.currentVersion() baseLevel := d.mu.versions.picker.getBaseLevel() iterOps := IterOptions{logger: d.opts.Logger} + // filesToSplit is a list where each element is a pair consisting of a file + // being ingested and a file being split to make room for an ingestion into + // that level. Each ingested file will appear at most once in this list. It + // is possible for split files to appear twice in this list. + filesToSplit := make([]ingestSplitFile, 0) + checkCompactions := false for i := 0; i < lr.fileCount; i++ { // Determine the lowest level in the LSM for which the sstable doesn't // overlap any existing files in the level. @@ -1895,6 +2053,7 @@ func (d *DB) ingestApply( if externalFile { ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) } + var splitFile *fileMetadata if exciseSpan.Valid() && exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) { // This file fits perfectly within the excise span. We can slot it at // L6, or sharedLevelsStart - 1 if we have shared files. @@ -1907,7 +2066,31 @@ func (d *DB) ingestApply( f.Level = 6 } } else { - f.Level, err = findTargetLevel(d.newIters, d.tableNewRangeKeyIter, iterOps, d.cmp, current, baseLevel, d.mu.compact.inProgress, m) + // TODO(bilal): findTargetLevel does disk IO (reading files for data + // overlap) even though we're holding onto d.mu. Consider unlocking + // d.mu while we do this. We already hold versions.logLock so we should + // not see any version applications while we're at this. The one + // complication here would be pulling out the mu.compact.inProgress + // check from findTargetLevel, as that requires d.mu to be held. + f.Level, splitFile, err = findTargetLevel( + d.newIters, d.tableNewRangeKeyIter, iterOps, d.cmp, current, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit) + } + + if splitFile != nil { + if invariants.Enabled { + if lf := current.Levels[f.Level].Find(d.cmp, splitFile); lf == nil { + panic("splitFile returned is not in level it should be") + } + } + // We take advantage of the fact that we won't drop the db mutex + // between now and the call to logAndApply. So, no files should + // get added to a new in-progress compaction at this point. We can + // avoid having to iterate on in-progress compactions to cancel them + // if none of the files being split have a compacting state. + if splitFile.IsCompacting() { + checkCompactions = true + } + filesToSplit = append(filesToSplit, ingestSplitFile{ingestFile: m, splitFile: splitFile, level: f.Level}) } } if err != nil { @@ -1925,6 +2108,26 @@ func (d *DB) ingestApply( levelMetrics.BytesIngested += m.Size levelMetrics.TablesIngested++ } + // replacedFiles maps files excised due to exciseSpan (or splitFiles returned + // by ingestTargetLevel), to files that were created to replace it. This map + // is used to resolve references to split files in filesToSplit, as it is + // possible for a file that we want to split to no longer exist or have a + // newer fileMetadata due to a split induced by another ingestion file, or an + // excise. + replacedFiles := make(map[base.FileNum][]newFileEntry) + updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) { + levelMetrics := metrics[level] + if levelMetrics == nil { + levelMetrics = &LevelMetrics{} + metrics[level] = levelMetrics + } + levelMetrics.NumFiles-- + levelMetrics.Size -= int64(m.Size) + for i := range added { + levelMetrics.NumFiles++ + levelMetrics.Size += int64(added[i].Meta.Size) + } + } if exciseSpan.Valid() { // Iterate through all levels and find files that intersect with exciseSpan. // @@ -1946,7 +2149,7 @@ func (d *DB) ingestApply( iter := overlaps.Iter() for m := iter.First(); m != nil; m = iter.Next() { - excised, err := d.excise(exciseSpan, m, ve, level) + newFiles, err := d.excise(exciseSpan, m, ve, level) if err != nil { return nil, err } @@ -1958,19 +2161,19 @@ func (d *DB) ingestApply( // We did not excise this file. continue } - levelMetrics := metrics[level] - if levelMetrics == nil { - levelMetrics = &LevelMetrics{} - metrics[level] = levelMetrics - } - levelMetrics.NumFiles-- - levelMetrics.Size -= int64(m.Size) - for i := range excised { - levelMetrics.NumFiles++ - levelMetrics.Size += int64(excised[i].Meta.Size) - } + replacedFiles[m.FileNum] = newFiles + updateLevelMetricsOnExcise(m, level, newFiles) } } + } + if len(filesToSplit) > 0 { + // For the same reasons as the above call to excise, we hold the db mutex + // while calling this method. + if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, filesToSplit, replacedFiles); err != nil { + return nil, err + } + } + if len(filesToSplit) > 0 || exciseSpan.Valid() { for c := range d.mu.compact.inProgress { if c.versionEditApplied { continue @@ -1985,22 +2188,41 @@ func (d *DB) ingestApply( if exciseSpan.OverlapsInternalKeyRange(d.cmp, c.smallest, c.largest) { c.cancel.Store(true) } + // Check if this compaction's inputs have been replaced due to an + // ingest-time split. In that case, cancel the compaction as a newly picked + // compaction would need to include any new files that slid in between + // previously-existing files. Note that we cancel any compaction that has a + // file that was ingest-split as an input, even if it started before this + // ingestion. + if checkCompactions { + for i := range c.inputs { + iter := c.inputs[i].files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if _, ok := replacedFiles[f.FileNum]; ok { + c.cancel.Store(true) + break + } + } + } + } } // Check for any EventuallyFileOnlySnapshots that could be watching for // an excise on this span. - for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next { - if s.efos == nil { - continue - } - efos := s.efos - // TODO(bilal): We can make this faster by taking advantage of the sorted - // nature of protectedRanges to do a sort.Search, or even maintaining a - // global list of all protected ranges instead of having to peer into every - // snapshot. - for i := range efos.protectedRanges { - if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) { - efos.excised.Store(true) - break + if exciseSpan.Valid() { + for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next { + if s.efos == nil { + continue + } + efos := s.efos + // TODO(bilal): We can make this faster by taking advantage of the sorted + // nature of protectedRanges to do a sort.Search, or even maintaining a + // global list of all protected ranges instead of having to peer into every + // snapshot. + for i := range efos.protectedRanges { + if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) { + efos.excised.Store(true) + break + } } } } diff --git a/ingest_test.go b/ingest_test.go index 38c063e590..d353c29971 100644 --- a/ingest_test.go +++ b/ingest_test.go @@ -1956,16 +1956,28 @@ func TestIngestTargetLevel(t *testing.T) { case "target": var buf bytes.Buffer + suggestSplit := false + for _, cmd := range td.CmdArgs { + switch cmd.Key { + case "suggest-split": + suggestSplit = true + } + } for _, target := range strings.Split(td.Input, "\n") { meta := parseMeta(target) - level, err := ingestTargetLevel( + level, overlapFile, err := ingestTargetLevel( d.newIters, d.tableNewRangeKeyIter, IterOptions{logger: d.opts.Logger}, d.cmp, d.mu.versions.currentVersion(), 1, d.mu.compact.inProgress, meta, + suggestSplit, ) if err != nil { return err.Error() } - fmt.Fprintf(&buf, "%d\n", level) + if overlapFile != nil { + fmt.Fprintf(&buf, "%d (split file: %s)\n", level, overlapFile.FileNum) + } else { + fmt.Fprintf(&buf, "%d\n", level) + } } return buf.String() @@ -1983,7 +1995,7 @@ func TestIngest(t *testing.T) { require.NoError(t, d.Close()) }() - reset := func() { + reset := func(split bool) { if d != nil { require.NoError(t, d.Close()) } @@ -2000,6 +2012,9 @@ func TestIngest(t *testing.T) { }}, FormatMajorVersion: internalFormatNewest, } + opts.Experimental.IngestSplit = func() bool { + return split + } // Disable automatic compactions because otherwise we'll race with // delete-only compactions triggered by ingesting range tombstones. opts.DisableAutomaticCompactions = true @@ -2008,12 +2023,21 @@ func TestIngest(t *testing.T) { d, err = Open("", opts) require.NoError(t, err) } - reset() + reset(false /* split */) datadriven.RunTest(t, "testdata/ingest", func(t *testing.T, td *datadriven.TestData) string { switch td.Cmd { case "reset": - reset() + split := false + for _, cmd := range td.CmdArgs { + switch cmd.Key { + case "enable-split": + split = true + default: + return fmt.Sprintf("unexpected key: %s", cmd.Key) + } + } + reset(split) return "" case "batch": b := d.NewIndexedBatch() diff --git a/options.go b/options.go index 75281ce00b..eee41c9741 100644 --- a/options.go +++ b/options.go @@ -550,6 +550,11 @@ type Options struct { // concurrency slots as determined by the two options is chosen. CompactionDebtConcurrency uint64 + // IngestSplit, if it returns true, allows for ingest-time splitting of + // existing sstables into two virtual sstables to allow ingestion sstables to + // slot into a lower level than they otherwise would have. + IngestSplit func() bool + // ReadCompactionRate controls the frequency of read triggered // compactions by adjusting `AllowedSeeks` in manifest.FileMetadata: // diff --git a/testdata/ingest b/testdata/ingest index 37ffc9c8b5..5d292a9d50 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -907,3 +907,272 @@ lsm 000006:[f#12,SET-h#12,SET] 000010:[s#16,RANGEKEYDEL-x#inf,RANGEKEYDEL] 000009:[x#15,SET-y#15,SET] + +reset enable-split +---- + +build ext10 +set a foo +set e bar +---- + +ingest ext10 +---- + +lsm +---- +6: + 000004:[a#10,SET-e#10,SET] + +# The below ingestion should split one existing file. + +build ext11 +set b foobar +set d foobar +---- + +ingest ext11 +---- + +lsm +---- +6: + 000006:[a#10,SET-a#10,SET] + 000005:[b#11,SET-d#11,SET] + 000007:[e#10,SET-e#10,SET] + +iter +first +next +next +next +---- +a: (foo, .) +b: (foobar, .) +d: (foobar, .) +e: (bar, .) + +# This ingestion should not split any files due to data overlap. + +build ext12 +set c foobar +set e baz +---- + +ingest ext12 +---- + +lsm +---- +0.0: + 000008:[c#12,SET-e#12,SET] +6: + 000006:[a#10,SET-a#10,SET] + 000005:[b#11,SET-d#11,SET] + 000007:[e#10,SET-e#10,SET] + +# The below ingestion should fall through one existing file and split another +# file. + +build ext13 +set cc foo +set ccc foooo +---- + +ingest ext13 +---- + +lsm +---- +0.0: + 000008:[c#12,SET-e#12,SET] +6: + 000006:[a#10,SET-a#10,SET] + 000010:[b#11,SET-b#11,SET] + 000009:[cc#13,SET-ccc#13,SET] + 000011:[d#11,SET-d#11,SET] + 000007:[e#10,SET-e#10,SET] + +iter +seek-ge c +next +next +next +next +---- +c: (foobar, .) +cc: (foo, .) +ccc: (foooo, .) +d: (foobar, .) +e: (baz, .) + +# Ingestion splitting doesn't kick in at L0. + +build ext14 +set d updated +set dd new +---- + +ingest ext14 +---- + +lsm +---- +0.1: + 000012:[d#14,SET-dd#14,SET] +0.0: + 000008:[c#12,SET-e#12,SET] +6: + 000006:[a#10,SET-a#10,SET] + 000010:[b#11,SET-b#11,SET] + 000009:[cc#13,SET-ccc#13,SET] + 000011:[d#11,SET-d#11,SET] + 000007:[e#10,SET-e#10,SET] + +iter +seek-lt d +next +next +next +next +---- +ccc: (foooo, .) +d: (updated, .) +dd: (new, .) +e: (baz, .) +. + +# Multi-sstable ingestion batches. This exercises logic to find the appropriate +# file to split for each newly ingested file, as we will be repeatedly splitting +# files into smaller virtual files. + +reset enable-split +---- + +build ext10 +set a foo +set e bar +set g baz +---- + +ingest ext10 +---- + +lsm +---- +6: + 000004:[a#10,SET-g#10,SET] + +build ext11 +set b foobar +set c foobar +---- + +build ext12 +set cc foobar +set d foobar +---- + +# This ingestion should slide in the same gap between keys in ext10. + +ingest ext11 ext12 +---- + +lsm +---- +6: + 000007:[a#10,SET-a#10,SET] + 000005:[b#11,SET-c#11,SET] + 000006:[cc#12,SET-d#12,SET] + 000008:[e#10,SET-g#10,SET] + +# A virtual sstable produced from an ingest split can be ingest split again. + +build ext13 +set ee foooo +set f bar +---- + +ingest ext13 +---- + +lsm +---- +6: + 000007:[a#10,SET-a#10,SET] + 000005:[b#11,SET-c#11,SET] + 000006:[cc#12,SET-d#12,SET] + 000010:[e#10,SET-e#10,SET] + 000009:[ee#13,SET-f#13,SET] + 000011:[g#10,SET-g#10,SET] + +reset enable-split +---- + +build ext10 +set a foo +set e bar +set g baz +---- + +ingest ext10 +---- + +lsm +---- +6: + 000004:[a#10,SET-g#10,SET] + +build ext11 +set b foobar +set c foobar +---- + +build ext12 +set cc foobar +set d foobar +---- + +build ext13 +set ee foooo +set f bar +---- + +# This ingestion should split ext10 twice, and land two files on one side +# of a key in it, and another file on another side of it. + +ingest ext11 ext12 ext13 +---- + +lsm +---- +6: + 000008:[a#10,SET-a#10,SET] + 000005:[b#11,SET-c#11,SET] + 000006:[cc#12,SET-d#12,SET] + 000010:[e#10,SET-e#10,SET] + 000007:[ee#13,SET-f#13,SET] + 000011:[g#10,SET-g#10,SET] + +iter +first +next +next +next +next +next +next +next +next +next +---- +a: (foo, .) +b: (foobar, .) +c: (foobar, .) +cc: (foobar, .) +d: (foobar, .) +e: (bar, .) +ee: (foooo, .) +f: (bar, .) +g: (baz, .) +. diff --git a/testdata/ingest_target_level b/testdata/ingest_target_level index 66d8d65924..e6b8edf1a3 100644 --- a/testdata/ingest_target_level +++ b/testdata/ingest_target_level @@ -246,3 +246,79 @@ target rkey:a-c ---- 4 + +# Cases with boundary overlap and no data overlap. With suggest-split off +# we get a target level of L0, but with suggest-split on, we get suggested +# a file split. + +define +L6 + a.SET.2:2 + d.SET.3:3 +L6 + f.SET.4:4 + k.SET.6:6 +---- +6: + 000004:[a#2,SET-d#3,SET] + 000005:[f#4,SET-k#6,SET] + +target +b-c +e-g +---- +5 +5 + +target suggest-split +b-c +e-g +---- +6 (split file: 000004) +5 + +target suggest-split +g-i +---- +6 (split file: 000005) + +# suggest-split recognizes and avoids in-progress compactions. + +define +L6 + a.SET.2:2 + d.SET.3:3 +L6 + f.SET.4:4 + k.SET.6:6 + compact:f-k +---- +6: + 000004:[a#2,SET-d#3,SET] + 000005:[f#4,SET-k#6,SET] + +target suggest-split +g-i +---- +5 + +# Ingestion splitting correctly recognizes data overlap in L6, and suggests +# split in L5. + +define +L5 + a.SET.2:2 + e.SET.3:3 +L6 + c.SET.1:1 + k.SET.1:1 +---- +5: + 000004:[a#2,SET-e#3,SET] +6: + 000005:[c#1,SET-k#1,SET] + +target suggest-split +b-c +---- +5 (split file: 000004)