diff --git a/db.go b/db.go index ea23aaaea9..4c88395ecd 100644 --- a/db.go +++ b/db.go @@ -1001,9 +1001,12 @@ var iterAllocPool = sync.Pool{ // and the specified seqNum will be used as the snapshot seqNum. // - EFOS in file-only state: Only `seqNum` and `vers` are set. All the // relevant SSTs are referenced by the *version. +// - EFOS that has been excised but is in alwaysCreateIters mode (tests only). +// Only `seqNum` and `readState` are set. type snapshotIterOpts struct { - seqNum uint64 - vers *version + seqNum uint64 + vers *version + readState *readState } type batchIterOpts struct { @@ -1057,8 +1060,13 @@ func (d *DB) newIter( // files in the associated version from being deleted if there is a current // compaction. The readState is unref'd by Iterator.Close(). if internalOpts.snapshot.vers == nil { - // NB: loadReadState() calls readState.ref(). - readState = d.loadReadState() + if internalOpts.snapshot.readState != nil { + readState = internalOpts.snapshot.readState + readState.ref() + } else { + // NB: loadReadState() calls readState.ref(). + readState = d.loadReadState() + } } else { // vers != nil internalOpts.snapshot.vers.Ref() @@ -1289,7 +1297,12 @@ func (d *DB) newInternalIter( // compaction. The readState is unref'd by Iterator.Close(). var readState *readState if sOpts.vers == nil { - readState = d.loadReadState() + if sOpts.readState != nil { + readState = sOpts.readState + readState.ref() + } else { + readState = d.loadReadState() + } } if sOpts.vers != nil { sOpts.vers.Ref() diff --git a/internal/metamorphic/meta_test.go b/internal/metamorphic/meta_test.go index 02e9beb8d6..7fd5baab96 100644 --- a/internal/metamorphic/meta_test.go +++ b/internal/metamorphic/meta_test.go @@ -76,6 +76,7 @@ func TestMetaTwoInstance(t *testing.T) { case runOnceFlags.Compare != "": runDirs := strings.Split(runOnceFlags.Compare, ",") onceOpts := runOnceFlags.MakeRunOnceOptions() + onceOpts = append(onceOpts, metamorphic.MultiInstance(2)) metamorphic.Compare(t, runOnceFlags.Dir, runOnceFlags.Seed, runDirs, onceOpts...) case runOnceFlags.RunDir != "": @@ -83,6 +84,7 @@ func TestMetaTwoInstance(t *testing.T) { // runOptions() below) or the user specified it manually in order to re-run // a test. onceOpts := runOnceFlags.MakeRunOnceOptions() + onceOpts = append(onceOpts, metamorphic.MultiInstance(2)) metamorphic.RunOnce(t, runOnceFlags.RunDir, runOnceFlags.Seed, filepath.Join(runOnceFlags.RunDir, "history"), onceOpts...) default: diff --git a/metamorphic/config.go b/metamorphic/config.go index 891c56568a..8047448bd9 100644 --- a/metamorphic/config.go +++ b/metamorphic/config.go @@ -45,6 +45,7 @@ const ( writerDelete writerDeleteRange writerIngest + writerIngestAndExcise writerMerge writerRangeKeyDelete writerRangeKeySet @@ -154,6 +155,7 @@ func defaultConfig() config { writerDelete: 100, writerDeleteRange: 50, writerIngest: 100, + writerIngestAndExcise: 50, writerMerge: 100, writerRangeKeySet: 10, writerRangeKeyUnset: 10, diff --git a/metamorphic/generator.go b/metamorphic/generator.go index 8e7e6d45b3..5845c9b934 100644 --- a/metamorphic/generator.go +++ b/metamorphic/generator.go @@ -171,6 +171,7 @@ func generate(rng *rand.Rand, count uint64, cfg config, km *keyManager) []op { writerDelete: g.writerDelete, writerDeleteRange: g.writerDeleteRange, writerIngest: g.writerIngest, + writerIngestAndExcise: g.writerIngestAndExcise, writerMerge: g.writerMerge, writerRangeKeyDelete: g.writerRangeKeyDelete, writerRangeKeySet: g.writerRangeKeySet, @@ -1174,7 +1175,7 @@ func (g *generator) replicate() { var startKey, endKey []byte startKey = g.randKeyToRead(0.001) // 0.1% new keys endKey = g.randKeyToRead(0.001) // 0.1% new keys - for g.cmp(startKey, endKey) == 0 { + for g.equal(startKey, endKey) { endKey = g.randKeyToRead(0.01) // 1% new keys } if g.cmp(startKey, endKey) > 0 { @@ -1228,28 +1229,15 @@ func (g *generator) newSnapshot() { snapID: snapID, } - // With 75% probability, impose bounds on the keys that may be read with the - // snapshot. Setting bounds allows some runs of the metamorphic test to use - // a EventuallyFileOnlySnapshot instead of a Snapshot, testing equivalence - // between the two for reads within those bounds. - // - // If we're in multi-instance mode, we must always create bounds, as we will - // always create EventuallyFileOnlySnapshots to allow commands that use excises - // (eg. replicateOp) to work. - if g.rng.Float64() < 0.75 || g.dbs.Len() > 1 { - s.bounds = g.generateDisjointKeyRanges( - g.rng.Intn(5) + 1, /* between 1-5 */ - ) - g.snapshotBounds[snapID] = s.bounds - } + // Impose bounds on the keys that may be read with the snapshot. Setting bounds + // allows some runs of the metamorphic test to use a EventuallyFileOnlySnapshot + // instead of a Snapshot, testing equivalence between the two for reads within + // those bounds. + s.bounds = g.generateDisjointKeyRanges( + g.rng.Intn(5) + 1, /* between 1-5 */ + ) + g.snapshotBounds[snapID] = s.bounds g.add(s) - if g.dbs.Len() > 1 { - // Do a flush after each EFOS, if we're in multi-instance mode. This limits - // the testing area of EFOS, but allows them to be used alongside operations - // that do an excise (eg. replicateOp). This will be revisited when - // https://github.com/cockroachdb/pebble/issues/2885 is implemented. - g.add(&flushOp{dbID}) - } } func (g *generator) snapshotClose() { @@ -1454,6 +1442,34 @@ func (g *generator) writerIngest() { }) } +func (g *generator) writerIngestAndExcise() { + if len(g.liveBatches) == 0 { + return + } + + dbID := g.dbs.rand(g.rng) + batchID := g.liveBatches.rand(g.rng) + g.removeBatchFromGenerator(batchID) + + start := g.randKeyToWrite(0.001) + end := g.randKeyToWrite(0.001) + for g.equal(start, end) { + end = g.randKeyToWrite(0.001) + } + if g.cmp(start, end) > 0 { + start, end = end, start + } + derivedDBID := g.objDB[batchID] + + g.add(&ingestAndExciseOp{ + dbID: dbID, + batchID: batchID, + derivedDBID: derivedDBID, + exciseStart: start, + exciseEnd: end, + }) +} + func (g *generator) writerMerge() { if len(g.liveWriters) == 0 { return diff --git a/metamorphic/key_manager.go b/metamorphic/key_manager.go index 5943aa3042..8c1e1ce682 100644 --- a/metamorphic/key_manager.go +++ b/metamorphic/key_manager.go @@ -406,6 +406,10 @@ func (k *keyManager) update(o op) { k.mergeKeysInto(batchID, s.dbID) } k.checkForDelOrSingleDelTransitionInDB(s.dbID) + case *ingestAndExciseOp: + // Merge all keys in the batch with the keys in the DB. + k.mergeKeysInto(s.batchID, s.dbID) + k.checkForDelOrSingleDelTransitionInDB(s.dbID) case *applyOp: // Merge the keys from this writer into the parent writer. k.mergeKeysInto(s.batchID, s.writerID) @@ -517,6 +521,8 @@ func opWrittenKeys(untypedOp op) [][]byte { case *flushOp: case *getOp: case *ingestOp: + case *ingestAndExciseOp: + return [][]byte{t.exciseStart, t.exciseEnd} case *initOp: case *iterFirstOp: case *iterLastOp: diff --git a/metamorphic/ops.go b/metamorphic/ops.go index 3743b8a08e..f01135c01c 100644 --- a/metamorphic/ops.go +++ b/metamorphic/ops.go @@ -518,13 +518,15 @@ func (o *ingestOp) run(t *test, h historyRecorder) { h.Recordf("%s // %v", o, err) } -func (o *ingestOp) build(t *test, h historyRecorder, b *pebble.Batch, i int) (string, error) { - path := t.opts.FS.PathJoin(t.tmpDir, fmt.Sprintf("ext%d-%d", o.dbID.slot(), i)) +func buildForIngest( + t *test, dbID objID, h historyRecorder, b *pebble.Batch, i int, +) (string, *sstable.WriterMetadata, error) { + path := t.opts.FS.PathJoin(t.tmpDir, fmt.Sprintf("ext%d-%d", dbID.slot(), i)) f, err := t.opts.FS.Create(path) if err != nil { - return "", err + return "", nil, err } - db := t.getDB(o.dbID) + db := t.getDB(dbID) iter, rangeDelIter, rangeKeyIter := private.BatchSort(b) defer closeIters(iter, rangeDelIter, rangeKeyIter) @@ -550,16 +552,16 @@ func (o *ingestOp) build(t *test, h historyRecorder, b *pebble.Batch, i int) (st // It's possible that we wrote the key on a batch from a db that supported // DeleteSized, but are now ingesting into a db that does not. Detect // this case and translate the key to an InternalKeyKindDelete. - if key.Kind() == pebble.InternalKeyKindDeleteSized && !t.isFMV(o.dbID, pebble.FormatDeleteSizedAndObsolete) { + if key.Kind() == pebble.InternalKeyKindDeleteSized && !t.isFMV(dbID, pebble.FormatDeleteSizedAndObsolete) { value = pebble.LazyValue{} key.SetKind(pebble.InternalKeyKindDelete) } if err := w.Add(*key, value.InPlaceValue()); err != nil { - return "", err + return "", nil, err } } if err := iter.Close(); err != nil { - return "", err + return "", nil, err } iter = nil @@ -569,11 +571,11 @@ func (o *ingestOp) build(t *test, h historyRecorder, b *pebble.Batch, i int) (st // NB: We don't have to copy the key or value since we're reading from a // batch which doesn't do prefix compression. if err := w.DeleteRange(t.Start, t.End); err != nil { - return "", err + return "", nil, err } } if err := rangeDelIter.Close(); err != nil { - return "", err + return "", nil, err } rangeDelIter = nil } @@ -596,29 +598,35 @@ func (o *ingestOp) build(t *test, h historyRecorder, b *pebble.Batch, i int) (st } err = rangekey.Coalesce(t.opts.Comparer.Compare, equal, span.Keys, &collapsed.Keys) if err != nil { - return "", err + return "", nil, err } for i := range collapsed.Keys { collapsed.Keys[i].Trailer = base.MakeTrailer(0, collapsed.Keys[i].Kind()) } keyspan.SortKeysByTrailer(&collapsed.Keys) if err := rangekey.Encode(&collapsed, w.AddRangeKey); err != nil { - return "", err + return "", nil, err } } if err := rangeKeyIter.Error(); err != nil { - return "", err + return "", nil, err } if err := rangeKeyIter.Close(); err != nil { - return "", err + return "", nil, err } rangeKeyIter = nil } if err := w.Close(); err != nil { - return "", err + return "", nil, err } - return path, nil + meta, err := w.Metadata() + return path, meta, err +} + +func (o *ingestOp) build(t *test, h historyRecorder, b *pebble.Batch, i int) (string, error) { + path, _, err := buildForIngest(t, o.dbID, h, b, i) + return path, err } func (o *ingestOp) receiver() objID { return o.dbID } @@ -786,6 +794,84 @@ func (o *ingestOp) String() string { return buf.String() } +type ingestAndExciseOp struct { + dbID objID + batchID objID + derivedDBID objID + exciseStart, exciseEnd []byte +} + +func (o *ingestAndExciseOp) run(t *test, h historyRecorder) { + var err error + b := t.getBatch(o.batchID) + t.clearObj(o.batchID) + if t.testOpts.Opts.Comparer.Compare(o.exciseEnd, o.exciseStart) <= 0 { + panic("non-well-formed excise span") + } + if b.Empty() { + // No-op. + h.Recordf("%s // %v", o, err) + return + } + path, writerMeta, err2 := o.build(t, h, b, 0 /* i */) + if err2 != nil { + h.Recordf("Build(%s) // %v", o.batchID, err2) + return + } + err = firstError(err, err2) + err = firstError(err, b.Close()) + + if writerMeta.Properties.NumEntries == 0 { + // No-op. + h.Recordf("%s // %v", o, err) + return + } + db := t.getDB(o.dbID) + if !t.testOpts.useExcise { + // Do a rangedel and rangekeydel before the ingestion. This mimics the + // behaviour of an excise. + err = firstError(err, db.DeleteRange(o.exciseStart, o.exciseEnd, t.writeOpts)) + err = firstError(err, db.RangeKeyDelete(o.exciseStart, o.exciseEnd, t.writeOpts)) + } + + if t.testOpts.useExcise { + err = firstError(err, withRetries(func() error { + _, err := t.getDB(o.dbID).IngestAndExcise([]string{path}, nil /* sharedSSTs */, pebble.KeyRange{ + Start: o.exciseStart, + End: o.exciseEnd, + }) + return err + })) + } else { + err = firstError(err, withRetries(func() error { + return t.getDB(o.dbID).Ingest([]string{path}) + })) + } + + h.Recordf("%s // %v", o, err) +} + +func (o *ingestAndExciseOp) build( + t *test, h historyRecorder, b *pebble.Batch, i int, +) (string, *sstable.WriterMetadata, error) { + return buildForIngest(t, o.dbID, h, b, i) +} + +func (o *ingestAndExciseOp) receiver() objID { return o.dbID } +func (o *ingestAndExciseOp) syncObjs() objIDSlice { + // Ingest should not be concurrent with mutating the batches that will be + // ingested as sstables. + objs := []objID{o.batchID} + if o.derivedDBID != o.dbID { + objs = append(objs, o.derivedDBID) + } + return objs +} + +func (o *ingestAndExciseOp) String() string { + return fmt.Sprintf("%s.IngestAndExcise(%s, %q, %q)", o.dbID, o.batchID, o.exciseStart, o.exciseEnd) +} + // getOp models a Reader.Get operation. type getOp struct { readerID objID @@ -1353,10 +1439,27 @@ type newSnapshotOp struct { } func (o *newSnapshotOp) run(t *test, h historyRecorder) { + bounds := o.bounds + if len(bounds) == 0 { + panic("bounds unexpectedly unset for newSnapshotOp") + } // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ - if len(t.dbs) > 1 || (len(o.bounds) > 0 && ((11400714819323198485*uint64(t.idx)*t.testOpts.seedEFOS)>>63) == 1) { - s := t.getDB(o.dbID).NewEventuallyFileOnlySnapshot(o.bounds) + createEfos := ((11400714819323198485 * uint64(t.idx) * t.testOpts.seedEFOS) >> 63) == 1 + // If either of these options is true, an EFOS _must_ be created, regardless + // of what the fibonacci hash returned. + excisePossible := t.testOpts.useSharedReplicate || t.testOpts.useExcise + if createEfos || excisePossible { + s := t.getDB(o.dbID).NewEventuallyFileOnlySnapshot(bounds) t.setSnapshot(o.snapID, s) + // If the EFOS isn't guaranteed to always create iterators, we must force + // a flush on this DB so it transitions this EFOS into a file-only snapshot. + if excisePossible && !t.testOpts.efosAlwaysCreatesIters { + err := t.getDB(o.dbID).Flush() + if err != nil { + h.Recordf("%s // %v", o, err) + panic(errors.Wrap(err, "newSnapshotOp")) + } + } } else { s := t.getDB(o.dbID).NewSnapshot() t.setSnapshot(o.snapID, s) @@ -1456,14 +1559,11 @@ func (r *replicateOp) runSharedReplicate( }, func(start, end []byte, keys []keyspan.Key) error { s := keyspan.Span{ - Start: start, - End: end, - Keys: keys, - KeysOrder: 0, + Start: start, + End: end, + Keys: keys, } - return rangekey.Encode(&s, func(k base.InternalKey, v []byte) error { - return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v) - }) + return rangekey.Encode(&s, w.AddRangeKey) }, func(sst *pebble.SharedSSTMeta) error { sharedSSTs = append(sharedSSTs, *sst) @@ -1475,6 +1575,31 @@ func (r *replicateOp) runSharedReplicate( return } + err = w.Close() + if err != nil { + h.Recordf("%s // %v", r, err) + return + } + meta, err := w.Metadata() + if err != nil { + h.Recordf("%s // %v", r, err) + return + } + if len(sharedSSTs) == 0 && meta.Properties.NumEntries == 0 { + // IngestAndExcise below will be a no-op. We should do a + // DeleteRange+RangeKeyDel to mimic the behaviour of the non-shared-replicate + // case. + // + // TODO(bilal): Remove this when we support excises with no matching ingests. + if err := dest.RangeKeyDelete(r.start, r.end, t.writeOpts); err != nil { + h.Recordf("%s // %v", r, err) + return + } + err := dest.DeleteRange(r.start, r.end, t.writeOpts) + h.Recordf("%s // %v", r, err) + return + } + _, err = dest.IngestAndExcise([]string{sstPath}, sharedSSTs, pebble.KeyRange{Start: r.start, End: r.end}) h.Recordf("%s // %v", r, err) } @@ -1501,6 +1626,15 @@ func (r *replicateOp) run(t *test, h historyRecorder) { return } + // First, do a RangeKeyDelete and DeleteRange on the whole span. + if err := dest.RangeKeyDelete(r.start, r.end, t.writeOpts); err != nil { + h.Recordf("%s // %v", r, err) + return + } + if err := dest.DeleteRange(r.start, r.end, t.writeOpts); err != nil { + h.Recordf("%s // %v", r, err) + return + } iter, err := source.NewIter(&pebble.IterOptions{ LowerBound: r.start, UpperBound: r.end, @@ -1511,16 +1645,7 @@ func (r *replicateOp) run(t *test, h historyRecorder) { } defer iter.Close() - // Write rangedels and rangekeydels for the range. This mimics the Excise - // that runSharedReplicate would do. - if err := w.DeleteRange(r.start, r.end); err != nil { - panic(err) - } - if err := w.RangeKeyDelete(r.start, r.end); err != nil { - panic(err) - } - - for ok := iter.SeekGE(r.start); ok && iter.Error() != nil; ok = iter.Next() { + for ok := iter.SeekGE(r.start); ok && iter.Error() == nil; ok = iter.Next() { hasPoint, hasRange := iter.HasPointAndRange() if hasPoint { val, err := iter.ValueAndErr() @@ -1534,13 +1659,25 @@ func (r *replicateOp) run(t *test, h historyRecorder) { if hasRange && iter.RangeKeyChanged() { rangeKeys := iter.RangeKeys() rkStart, rkEnd := iter.RangeBounds() + + span := &keyspan.Span{Start: rkStart, End: rkEnd, Keys: make([]keyspan.Key, len(rangeKeys))} for i := range rangeKeys { - if err := w.RangeKeySet(rkStart, rkEnd, rangeKeys[i].Suffix, rangeKeys[i].Value); err != nil { - panic(err) + span.Keys[i] = keyspan.Key{ + Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeySet), + Suffix: rangeKeys[i].Suffix, + Value: rangeKeys[i].Value, } } + keyspan.SortKeysByTrailer(&span.Keys) + if err := rangekey.Encode(span, w.AddRangeKey); err != nil { + panic(err) + } } } + if err := iter.Error(); err != nil { + h.Recordf("%s // %v", r, err) + return + } if err := w.Close(); err != nil { panic(err) } diff --git a/metamorphic/options.go b/metamorphic/options.go index b288c44068..394d7daf94 100644 --- a/metamorphic/options.go +++ b/metamorphic/options.go @@ -124,6 +124,16 @@ func parseOptions( return true } return true + case "TestOptions.use_shared_replicate": + opts.useSharedReplicate = true + return true + case "TestOptions.use_excise": + opts.useExcise = true + return true + case "TestOptions.efos_always_creates_iterators": + opts.efosAlwaysCreatesIters = true + opts.Opts.TestingAlwaysCreateEFOSIterators(true /* value */) + return true default: if customOptionParsers == nil { return false @@ -191,6 +201,15 @@ func optionsToString(opts *TestOptions) string { if opts.ingestSplit { fmt.Fprintf(&buf, " ingest_split=%v\n", opts.ingestSplit) } + if opts.useSharedReplicate { + fmt.Fprintf(&buf, " use_shared_replicate=%v\n", opts.useSharedReplicate) + } + if opts.useExcise { + fmt.Fprintf(&buf, " use_excise=%v\n", opts.useExcise) + } + if opts.efosAlwaysCreatesIters { + fmt.Fprintf(&buf, " efos_always_creates_iterators=%v\n", opts.efosAlwaysCreatesIters) + } for _, customOpt := range opts.CustomOpts { fmt.Fprintf(&buf, " %s=%s\n", customOpt.Name(), customOpt.Value()) } @@ -267,6 +286,16 @@ type TestOptions struct { // Enables ingest splits. Saved here for serialization as Options does not // serialize this. ingestSplit bool + // Enables operations that do excises. Note that a false value for this does + // not guarantee the lack of excises, as useSharedReplicate can also cause + // excises. However !useExcise && !useSharedReplicate can be used to guarantee + // lack of excises. + useExcise bool + // Enables EFOS to always create iterators, even if a conflicting excise + // happens. Used to guarantee EFOS determinism when conflicting excises are + // in play. If false, EFOS determinism is maintained by having the DB do a + // flush after every new EFOS. + efosAlwaysCreatesIters bool } // CustomOption defines a custom option that configures the behavior of an @@ -571,6 +600,14 @@ func randomOptions( testOpts.seedEFOS = rng.Uint64() testOpts.ingestSplit = rng.Intn(2) == 0 opts.Experimental.IngestSplit = func() bool { return testOpts.ingestSplit } + testOpts.useExcise = rng.Intn(2) == 0 + if testOpts.useExcise || testOpts.useSharedReplicate { + testOpts.efosAlwaysCreatesIters = rng.Intn(2) == 0 + opts.TestingAlwaysCreateEFOSIterators(testOpts.efosAlwaysCreatesIters) + if testOpts.Opts.FormatMajorVersion < pebble.FormatVirtualSSTables { + testOpts.Opts.FormatMajorVersion = pebble.FormatVirtualSSTables + } + } testOpts.Opts.EnsureDefaults() return testOpts } diff --git a/metamorphic/parser.go b/metamorphic/parser.go index dcc19e2929..7c456f19d1 100644 --- a/metamorphic/parser.go +++ b/metamorphic/parser.go @@ -69,6 +69,8 @@ func opArgs(op op) (receiverID *objID, targetID *objID, args []interface{}) { return &t.readerID, nil, []interface{}{&t.key} case *ingestOp: return &t.dbID, nil, []interface{}{&t.batchIDs} + case *ingestAndExciseOp: + return &t.dbID, nil, []interface{}{&t.batchID, &t.exciseStart, &t.exciseEnd} case *initOp: return nil, nil, []interface{}{&t.dbSlots, &t.batchSlots, &t.iterSlots, &t.snapshotSlots} case *iterLastOp: @@ -132,6 +134,7 @@ var methods = map[string]*methodInfo{ "Flush": makeMethod(flushOp{}, dbTag), "Get": makeMethod(getOp{}, dbTag, batchTag, snapTag), "Ingest": makeMethod(ingestOp{}, dbTag), + "IngestAndExcise": makeMethod(ingestAndExciseOp{}, dbTag), "Init": makeMethod(initOp{}, dbTag), "Last": makeMethod(iterLastOp{}, iterTag), "Merge": makeMethod(mergeOp{}, dbTag, batchTag), @@ -588,6 +591,8 @@ func computeDerivedFields(ops []op) { for i := range v.batchIDs { v.derivedDBIDs[i] = objToDB[v.batchIDs[i]] } + case *ingestAndExciseOp: + v.derivedDBID = objToDB[v.batchID] case *deleteOp: derivedDBID := v.writerID if v.writerID.tag() != dbTag { diff --git a/options.go b/options.go index 8a2c609ef0..244b101249 100644 --- a/options.go +++ b/options.go @@ -973,6 +973,11 @@ type Options struct { // against the FS are made after the DB is closed, the FS may leak a // goroutine indefinitely. fsCloser io.Closer + + // efosAlwaysCreatesIterators is set by some tests to force + // EventuallyFileOnlySnapshots to always create iterators, even after a + // conflicting excise. + efosAlwaysCreatesIterators bool } } @@ -1146,6 +1151,13 @@ func (o *Options) AddEventListener(l EventListener) { o.EventListener = &l } +// TestingAlwaysCreateEFOSIterators is used to toggle a private option for +// having EventuallyFileOnlySnapshots always create iterators. Meant to only +// be used in tests. +func (o *Options) TestingAlwaysCreateEFOSIterators(value bool) { + o.private.efosAlwaysCreatesIterators = value +} + func (o *Options) equal() Equal { if o.Comparer.Equal == nil { return bytes.Equal diff --git a/snapshot.go b/snapshot.go index 5477b54717..cdbf028d19 100644 --- a/snapshot.go +++ b/snapshot.go @@ -250,6 +250,10 @@ type EventuallyFileOnlySnapshot struct { snap *Snapshot // The wrapped version reference, if a file-only snapshot. vers *version + + // The readState corresponding to when this EFOS was created. Only set + // if alwaysCreateIters is true. + rs *readState } // Key ranges to watch for an excise on. @@ -262,6 +266,12 @@ type EventuallyFileOnlySnapshot struct { db *DB seqNum uint64 + // If true, this EventuallyFileOnlySnapshot will always generate iterators that + // retain snapshot semantics, by holding onto the readState if a conflicting + // excise were to happen. Only used in some tests to enforce deterministic + // behaviour around excises. + alwaysCreateIters bool + closed chan struct{} } @@ -282,10 +292,14 @@ func (d *DB) makeEventuallyFileOnlySnapshot( } } es := &EventuallyFileOnlySnapshot{ - db: d, - seqNum: seqNum, - protectedRanges: keyRanges, - closed: make(chan struct{}), + db: d, + seqNum: seqNum, + protectedRanges: keyRanges, + closed: make(chan struct{}), + alwaysCreateIters: d.opts.private.efosAlwaysCreatesIterators, + } + if es.alwaysCreateIters { + es.mu.rs = d.loadReadState() } if isFileOnly { es.mu.vers = d.mu.versions.currentVersion() @@ -429,6 +443,9 @@ func (es *EventuallyFileOnlySnapshot) Close() error { if es.mu.vers != nil { es.mu.vers.UnrefLocked() } + if es.mu.rs != nil { + es.mu.rs.unrefLocked() + } return nil } @@ -464,6 +481,31 @@ func (es *EventuallyFileOnlySnapshot) NewIter(o *IterOptions) (*Iterator, error) return es.NewIterWithContext(context.Background(), o) } +func (es *EventuallyFileOnlySnapshot) newAlwaysCreateIterWithContext( + ctx context.Context, o *IterOptions, +) (*Iterator, error) { + // Grab the db mutex. This avoids races down below, where we could get + // excised between the es.excised.Load() call, and the newIter call. + es.db.mu.Lock() + defer es.db.mu.Unlock() + es.mu.Lock() + defer es.mu.Unlock() + if es.mu.vers != nil { + sOpts := snapshotIterOpts{seqNum: es.seqNum, vers: es.mu.vers} + return es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o), nil + } + + sOpts := snapshotIterOpts{seqNum: es.seqNum} + if es.excised.Load() { + if es.mu.rs == nil { + return nil, errors.AssertionFailedf("unexpected nil readState in EFOS' alwaysCreateIters mode") + } + sOpts.readState = es.mu.rs + } + iter := es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o) + return iter, nil +} + // NewIterWithContext is like NewIter, and additionally accepts a context for // tracing. func (es *EventuallyFileOnlySnapshot) NewIterWithContext( @@ -475,6 +517,9 @@ func (es *EventuallyFileOnlySnapshot) NewIterWithContext( default: } + if es.alwaysCreateIters { + return es.newAlwaysCreateIterWithContext(ctx, o) + } es.mu.Lock() defer es.mu.Unlock() if es.mu.vers != nil { @@ -482,14 +527,14 @@ func (es *EventuallyFileOnlySnapshot) NewIterWithContext( return es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o), nil } + sOpts := snapshotIterOpts{seqNum: es.seqNum} if es.excised.Load() { return nil, ErrSnapshotExcised } - sOpts := snapshotIterOpts{seqNum: es.seqNum} iter := es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o) // If excised is true, then keys relevant to the snapshot might not be - // present in the readState being used by the iterator. Error out. + // present in the readState being used by the iterator. if es.excised.Load() { iter.Close() return nil, ErrSnapshotExcised @@ -515,22 +560,10 @@ func (es *EventuallyFileOnlySnapshot) ScanInternal( if es.db == nil { panic(ErrClosed) } - if es.excised.Load() { + if es.excised.Load() && !es.alwaysCreateIters { return ErrSnapshotExcised } var sOpts snapshotIterOpts - es.mu.Lock() - if es.mu.vers != nil { - sOpts = snapshotIterOpts{ - seqNum: es.seqNum, - vers: es.mu.vers, - } - } else { - sOpts = snapshotIterOpts{ - seqNum: es.seqNum, - } - } - es.mu.Unlock() opts := &scanInternalOptions{ CategoryAndQoS: categoryAndQoS, IterOptions: IterOptions{ @@ -544,15 +577,43 @@ func (es *EventuallyFileOnlySnapshot) ScanInternal( visitSharedFile: visitSharedFile, skipSharedLevels: visitSharedFile != nil, } + if es.alwaysCreateIters { + // Grab the db mutex. This avoids races down below as it prevents excises + // from taking effect until the iterator is instantiated. + es.db.mu.Lock() + } + es.mu.Lock() + if es.mu.vers != nil { + sOpts = snapshotIterOpts{ + seqNum: es.seqNum, + vers: es.mu.vers, + } + } else { + if es.excised.Load() && es.alwaysCreateIters { + sOpts = snapshotIterOpts{ + readState: es.mu.rs, + seqNum: es.seqNum, + } + } else { + sOpts = snapshotIterOpts{ + seqNum: es.seqNum, + } + } + } + es.mu.Unlock() iter, err := es.db.newInternalIter(ctx, sOpts, opts) if err != nil { return err } defer iter.close() + if es.alwaysCreateIters { + // See the similar conditional above where we grab this mutex. + es.db.mu.Unlock() + } // If excised is true, then keys relevant to the snapshot might not be // present in the readState being used by the iterator. Error out. - if es.excised.Load() { + if es.excised.Load() && !es.alwaysCreateIters { return ErrSnapshotExcised } diff --git a/tool/manifest.go b/tool/manifest.go index fe210f5cd0..3cd9fa8cef 100644 --- a/tool/manifest.go +++ b/tool/manifest.go @@ -111,6 +111,9 @@ func (m *manifestT) printLevels(cmp base.Compare, stdout io.Writer, v *manifest. fmt.Fprintf(stdout, " %s:%d", f.FileNum, f.Size) formatSeqNumRange(stdout, f.SmallestSeqNum, f.LargestSeqNum) formatKeyRange(stdout, m.fmtKey, &f.Smallest, &f.Largest) + if f.Virtual { + fmt.Fprintf(stdout, "(virtual:backingNum=%s)", f.FileBacking.DiskFileNum) + } fmt.Fprintf(stdout, "\n") }) } @@ -125,6 +128,9 @@ func (m *manifestT) printLevels(cmp base.Compare, stdout io.Writer, v *manifest. fmt.Fprintf(stdout, " %s:%d", f.FileNum, f.Size) formatSeqNumRange(stdout, f.SmallestSeqNum, f.LargestSeqNum) formatKeyRange(stdout, m.fmtKey, &f.Smallest, &f.Largest) + if f.Virtual { + fmt.Fprintf(stdout, "(virtual:backingNum=%s)", f.FileBacking.DiskFileNum) + } fmt.Fprintf(stdout, "\n") } }