diff --git a/site/content/overview/components.md b/site/content/overview/components.md index 02c259258f..8ed8ef5a9d 100644 --- a/site/content/overview/components.md +++ b/site/content/overview/components.md @@ -17,4 +17,4 @@ M3DB is a distributed time series database that provides scalable storage and a ## M3 Aggregator -{{< fileinclude file="m3aggregator_intro.md" >}} +{{< fileinclude file="m3aggregator_intro.md" >}} \ No newline at end of file diff --git a/src/dbnode/generated/mocks/generate.go b/src/dbnode/generated/mocks/generate.go index 0c8ad0f3f5..83b5e48972 100644 --- a/src/dbnode/generated/mocks/generate.go +++ b/src/dbnode/generated/mocks/generate.go @@ -24,7 +24,7 @@ //go:generate sh -c "mockgen -package=xio $PACKAGE/src/dbnode/x/xio SegmentReader,SegmentReaderPool | genclean -pkg $PACKAGE/src/dbnode/x/xio -out $GOPATH/src/$PACKAGE/src/dbnode/x/xio/io_mock.go" //go:generate sh -c "mockgen -package=digest -destination=$GOPATH/src/$PACKAGE/src/dbnode/digest/digest_mock.go $PACKAGE/src/dbnode/digest ReaderWithDigest" //go:generate sh -c "mockgen -package=series $PACKAGE/src/dbnode/storage/series DatabaseSeries,QueryableBlockRetriever | genclean -pkg $PACKAGE/src/dbnode/storage/series -out $GOPATH/src/$PACKAGE/src/dbnode/storage/series/series_mock.go" -//go:generate sh -c "mockgen -package=lookup $PACKAGE/src/dbnode/storage/series/lookup IndexWriter | genclean -pkg $PACKAGE/src/dbnode/storage/series/lookup -out $GOPATH/src/$PACKAGE/src/dbnode/storage/series/lookup/lookup_mock.go" +//go:generate sh -c "mockgen -package=storage $PACKAGE/src/dbnode/storage IndexWriter | genclean -pkg $PACKAGE/src/dbnode/storage -out $GOPATH/src/$PACKAGE/src/dbnode/storage/lookup_mock.go" // mockgen rules for generating mocks for unexported interfaces (file mode) //go:generate sh -c "mockgen -package=encoding -destination=$GOPATH/src/$PACKAGE/src/dbnode/encoding/encoding_mock.go -source=$GOPATH/src/$PACKAGE/src/dbnode/encoding/types.go" diff --git 
a/src/dbnode/integration/bootstrap_retries_test.go b/src/dbnode/integration/bootstrap_retries_test.go index 41232ed245..ac5f42733e 100644 --- a/src/dbnode/integration/bootstrap_retries_test.go +++ b/src/dbnode/integration/bootstrap_retries_test.go @@ -195,7 +195,7 @@ func TestNoOpenFilesWhenBootstrapRetriesDueToObsoleteRanges(t *testing.T) { }, }) - require.NoError(t, writeTestDataToDisk(ns1, setup, seriesMaps, 0)) + require.NoError(t, writeTestDataToDiskWithIndex(ns1, setup, seriesMaps)) require.NoError(t, setup.StartServer()) // Blocks until bootstrap is complete defer func() { require.NoError(t, setup.StopServerAndVerifyOpenFilesAreClosed()) diff --git a/src/dbnode/integration/commitlog_bootstrap_coldwrites_test.go b/src/dbnode/integration/commitlog_bootstrap_coldwrites_test.go index 5f059d10bd..0d5bbdf071 100644 --- a/src/dbnode/integration/commitlog_bootstrap_coldwrites_test.go +++ b/src/dbnode/integration/commitlog_bootstrap_coldwrites_test.go @@ -85,7 +85,7 @@ func testCommitLogBootstrapColdWrites(t *testing.T, setTestOpts setTestOptions, updateInputConfig(dataFilesData) } dataFilesSeriesMaps := generate.BlocksByStart(dataFilesData) - require.NoError(t, writeTestDataToDisk(ns1, setup, dataFilesSeriesMaps, 0)) + require.NoError(t, writeTestDataToDiskWithIndex(ns1, setup, dataFilesSeriesMaps)) log.Info("finished writing data files") log.Info("writing commit logs") diff --git a/src/dbnode/integration/commitlog_bootstrap_merge_test.go b/src/dbnode/integration/commitlog_bootstrap_merge_test.go index 72898b4444..a4bb55a3e7 100644 --- a/src/dbnode/integration/commitlog_bootstrap_merge_test.go +++ b/src/dbnode/integration/commitlog_bootstrap_merge_test.go @@ -99,7 +99,7 @@ func TestCommitLogAndFSMergeBootstrap(t *testing.T) { t0: seriesMaps[t0], t1: seriesMaps[t1], } - require.NoError(t, writeTestDataToDisk(ns1, setup, fsSeriesMaps, 0)) + require.NoError(t, writeTestDataToDiskWithIndex(ns1, setup, fsSeriesMaps)) log.Info("writing commit logs") commitlogSeriesMaps := 
generate.SeriesBlocksByStart{ diff --git a/src/dbnode/integration/fs_bootstrap_index_test.go b/src/dbnode/integration/fs_bootstrap_index_test.go index 963b7732ed..7e398aef89 100644 --- a/src/dbnode/integration/fs_bootstrap_index_test.go +++ b/src/dbnode/integration/fs_bootstrap_index_test.go @@ -33,7 +33,9 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/storage/index" + "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/idx" + idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/context" "github.com/m3db/m3/src/x/ident" xtime "github.com/m3db/m3/src/x/time" @@ -119,16 +121,35 @@ func testFilesystemBootstrapIndexWithIndexingEnabled( ID: ident.StringID("foo"), Tags: ident.NewTags(ident.StringTag("city", "new_york"), ident.StringTag("foo", "foo")), } + fooDoc := doc.Metadata{ + ID: fooSeries.ID.Bytes(), + Fields: []doc.Field{ + {Name: []byte("city"), Value: []byte("new_york")}, + {Name: []byte("foo"), Value: []byte("foo")}, + }, + } barSeries := generate.Series{ ID: ident.StringID("bar"), Tags: ident.NewTags(ident.StringTag("city", "new_jersey")), } + barDoc := doc.Metadata{ + ID: barSeries.ID.Bytes(), + Fields: []doc.Field{ + {Name: []byte("city"), Value: []byte("new_jersey")}, + }, + } bazSeries := generate.Series{ ID: ident.StringID("baz"), Tags: ident.NewTags(ident.StringTag("city", "seattle")), } + bazDoc := doc.Metadata{ + ID: bazSeries.ID.Bytes(), + Fields: []doc.Field{ + {Name: []byte("city"), Value: []byte("seattle")}, + }, + } seriesMaps := generate.BlocksByStart([]generate.BlockConfig{ { @@ -157,8 +178,22 @@ func testFilesystemBootstrapIndexWithIndexingEnabled( }, }) + defaultIndexDocs := []doc.Metadata{ + fooDoc, + barDoc, + bazDoc, + } + require.NoError(t, writeTestDataToDisk(ns1, setup, seriesMaps, 0)) require.NoError(t, writeTestDataToDisk(ns2, setup, nil, 0)) + require.NoError(t, writeTestIndexDataToDisk( + ns1, + setup.StorageOpts(), 
+ idxpersist.DefaultIndexVolumeType, + now.Add(-blockSize), + setup.ShardSet().AllIDs(), + defaultIndexDocs, + )) // Start the server with filesystem bootstrapper log := setup.StorageOpts().InstrumentOptions().Logger() diff --git a/src/dbnode/integration/fs_bootstrap_multi_ns_test.go b/src/dbnode/integration/fs_bootstrap_multi_ns_test.go index 9d63a1ccc5..95475bcc2f 100644 --- a/src/dbnode/integration/fs_bootstrap_multi_ns_test.go +++ b/src/dbnode/integration/fs_bootstrap_multi_ns_test.go @@ -76,8 +76,8 @@ func TestFilesystemBootstrapMultipleNamespaces(t *testing.T) { {IDs: []string{"foo", "bar"}, NumPoints: 100, Start: now.Add(-ns2BlockSize)}, {IDs: []string{"foo", "baz"}, NumPoints: 50, Start: now}, }) - require.NoError(t, writeTestDataToDisk(ns1, setup, ns1SeriesMaps, 0)) - require.NoError(t, writeTestDataToDisk(ns2, setup, ns2SeriesMaps, 0)) + require.NoError(t, writeTestDataToDiskWithIndex(ns1, setup, ns1SeriesMaps)) + require.NoError(t, writeTestDataToDiskWithIndex(ns2, setup, ns2SeriesMaps)) log.Info("generated data") // Start the server with filesystem bootstrapper diff --git a/src/dbnode/integration/generate/generate.go b/src/dbnode/integration/generate/generate.go index 4cb433ff3a..e2b4688d48 100644 --- a/src/dbnode/integration/generate/generate.go +++ b/src/dbnode/integration/generate/generate.go @@ -28,6 +28,7 @@ import ( "github.com/m3db/m3/src/dbnode/encoding/testgen" "github.com/m3db/m3/src/dbnode/ts" + "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/x/ident" xtime "github.com/m3db/m3/src/x/time" ) @@ -105,6 +106,25 @@ func ToPointsByTime(seriesMaps SeriesBlocksByStart) SeriesDataPointsByTime { return pointsByTime } +// ToDocMetadata converts a SeriesBlock to []doc.Metadata, producing one document per series with one field per tag. +func ToDocMetadata(seriesBlock SeriesBlock) []doc.Metadata { + docs := make([]doc.Metadata, 0) + for _, series := range seriesBlock { + fields := make([]doc.Field, 0) + for _, t := range series.Tags.Values() { + fields = append(fields, doc.Field{ + Name: 
t.Name.Bytes(), + Value: t.Value.Bytes(), + }) + } + docs = append(docs, doc.Metadata{ + ID: series.ID.Bytes(), + Fields: fields, + }) + } + return docs +} + // Dearrange de-arranges the list by the defined percent. func (l SeriesDataPointsByTime) Dearrange(percent float64) SeriesDataPointsByTime { numDis := percent * float64(len(l)) diff --git a/src/dbnode/integration/index_active_block_rotate_test.go b/src/dbnode/integration/index_active_block_rotate_test.go new file mode 100644 index 0000000000..8436c7c6da --- /dev/null +++ b/src/dbnode/integration/index_active_block_rotate_test.go @@ -0,0 +1,334 @@ +// +build integration +// +// Copyright (c) 2021 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package integration + +import ( + "fmt" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/uber-go/tally" + "go.uber.org/atomic" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "go.uber.org/zap/zaptest/observer" + + "github.com/m3db/m3/src/dbnode/integration/generate" + "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/dbnode/retention" + "github.com/m3db/m3/src/dbnode/storage/index/compaction" + xclock "github.com/m3db/m3/src/x/clock" + "github.com/m3db/m3/src/x/ident" + xtime "github.com/m3db/m3/src/x/time" +) + +func TestIndexActiveBlockRotate(t *testing.T) { + var ( + testNsID = ident.StringID("testns") + numWrites = 50 + numTags = 10 + blockSize = 2 * time.Hour + indexBlockSize = blockSize * 2 + retentionPeriod = 12 * blockSize + bufferPast = 10 * time.Minute + rOpts = retention.NewOptions(). + SetRetentionPeriod(retentionPeriod). + SetBlockSize(blockSize). + SetBufferPast(bufferPast) + + idxOpts = namespace.NewIndexOptions().SetEnabled(true).SetBlockSize(indexBlockSize) + nsOpts = namespace.NewOptions(). + SetRetentionOptions(rOpts). + SetIndexOptions(idxOpts). + SetColdWritesEnabled(true) + + defaultTimeout = time.Minute + // verifyTimeout = time.Minute + ) + ns, err := namespace.NewMetadata(testNsID, nsOpts) + require.NoError(t, err) + + // Set time to next warm flushable block transition + // (i.e. 
align by block + bufferPast - time.Second) + currTime := time.Now().UTC() + progressTime := false + progressTimeDelta := time.Duration(0) + lockTime := sync.RWMutex{} + setTime := func(t time.Time) { + lockTime.Lock() + defer lockTime.Unlock() + progressTime = false + currTime = t.UTC() + } + setProgressTime := func() { + lockTime.Lock() + defer lockTime.Unlock() + progressTime = true + actualNow := time.Now().UTC() + progressTimeDelta = currTime.Sub(actualNow) + } + nowFn := func() time.Time { + lockTime.RLock() + at := currTime + progress := progressTime + progressDelta := progressTimeDelta + lockTime.RUnlock() + if progress { + return time.Now().UTC().Add(progressDelta) + } + return at + } + + testOpts := NewTestOptions(t). + SetNamespaces([]namespace.Metadata{ns}). + SetWriteNewSeriesAsync(true). + SetNowFn(nowFn) + + testSetup, err := NewTestSetup(t, testOpts, nil) + require.NoError(t, err) + defer testSetup.Close() + + // Write test data to disk so that there are some blocks on disk, to simulate + // index blocks that already have on-disk segments in them. + require.NoError(t, testSetup.InitializeBootstrappers(InitializeBootstrappersOptions{ + WithFileSystem: true, + })) + now := testSetup.NowFn()() + fooSeries := generate.Series{ + ID: ident.StringID("foo"), + Tags: ident.NewTags(ident.StringTag("city", "new_york")), + } + barSeries := generate.Series{ + ID: ident.StringID("bar"), + Tags: ident.NewTags(ident.StringTag("city", "new_jersey")), + } + seriesMaps := generate.BlocksByStart([]generate.BlockConfig{ + { + IDs: []string{fooSeries.ID.String()}, + Tags: fooSeries.Tags, + NumPoints: 100, + Start: now.Add(-3 * blockSize), + }, + { + IDs: []string{barSeries.ID.String()}, + Tags: barSeries.Tags, + NumPoints: 100, + Start: now.Add(-3 * blockSize), + }, + }) + require.NoError(t, writeTestDataToDisk(ns, testSetup, seriesMaps, 0)) + + // Set foreground compaction planner options to force index compaction.
+ minCompactSize := 10 + foregroundCompactionOpts := compaction.DefaultOptions + foregroundCompactionOpts.Levels = []compaction.Level{ + { + MinSizeInclusive: 0, + MaxSizeExclusive: int64(minCompactSize), + }, + } + indexOpts := testSetup.StorageOpts().IndexOptions(). + SetForegroundCompactionPlannerOptions(foregroundCompactionOpts) + testSetup.SetStorageOpts(testSetup.StorageOpts().SetIndexOptions(indexOpts)) + + // Configure log capture + log := testSetup.StorageOpts().InstrumentOptions().Logger() + captureCore, logs := observer.New(zapcore.ErrorLevel) + zapOpt := zap.WrapCore(func(existingCore zapcore.Core) zapcore.Core { + return zapcore.NewTee(existingCore, captureCore) + }) + log = log.WithOptions(zapOpt) + + // Wire up logger. + instrumentOpts := testSetup.StorageOpts().InstrumentOptions(). + SetLogger(log) + testSetup.SetStorageOpts(testSetup.StorageOpts().SetInstrumentOptions(instrumentOpts)) + scope := testSetup.Scope() + + // Start the server. + require.NoError(t, testSetup.StartServer()) + + // Stop the server. + defer func() { + require.NoError(t, testSetup.StopServer()) + log.Debug("server is now down") + }() + + // Write test data. + session, err := testSetup.M3DBClient().DefaultSession() + require.NoError(t, err) + + var ( + metricGCSeries = "index.block.active-block.gc-series+namespace=" + testNsID.String() + metricFlushIndex = "database.flushIndex.success+namespace=" + testNsID.String() + ) + prevWarmFlushes := counterValue(t, scope, metricFlushIndex) + prevNumGCSeries := 0 + numGCSeries := counterValue(t, scope, metricGCSeries) + require.Equal(t, 0, numGCSeries) + + prevLog := log + for i := 0; i < 4; i++ { + log = prevLog.With(zap.Int("checkIteration", i)) + + // Progress to next time just before a flush and freeze (using setTime). + prevTime := nowFn() + newTime := prevTime. + Truncate(indexBlockSize). 
+ Add(2 * indexBlockSize) + setTime(newTime) + log.Info("progressing time to before next block edge", + zap.Stringer("prevTime", prevTime), + zap.Stringer("newTime", newTime)) + + start := time.Now() + log.Info("writing test data") + + t0 := xtime.ToUnixNano(newTime.Add(-1 * (bufferPast / 2))) + t1 := xtime.ToUnixNano(newTime) + writesPeriodIter := GenerateTestIndexWrite(i, numWrites, numTags, t0, t1) + writesPeriodIter.Write(t, testNsID, session) + log.Info("test data written", zap.Duration("took", time.Since(start))) + + log.Info("waiting till data is indexed") + indexed := xclock.WaitUntil(func() bool { + indexedPeriod := writesPeriodIter.NumIndexed(t, testNsID, session) + return indexedPeriod == len(writesPeriodIter) + }, 15*time.Second) + require.True(t, indexed, + fmt.Sprintf("unexpected data indexed: actual=%d, expected=%d", + writesPeriodIter.NumIndexedWithOptions(t, testNsID, session, NumIndexedOptions{Logger: log}), + len(writesPeriodIter))) + log.Info("verified data is indexed", zap.Duration("took", time.Since(start))) + + newTime = prevTime. + Truncate(indexBlockSize). + Add(2 * indexBlockSize). + Add(bufferPast). + Add(-100 * time.Millisecond) + setTime(newTime) + log.Info("progressing time to before next flush", + zap.Stringer("prevTime", prevTime), + zap.Stringer("newTime", newTime)) + + log.Info("waiting till warm flush occurs") + + // Resume time progressing by wall clock. + setProgressTime() + + // Start checks to ensure metrics are visible the whole time. 
+ checkFailed := atomic.NewUint64(0) + checkIndexable := func() { + numGCSeriesBefore := counterValue(t, scope, metricGCSeries) + indexedPeriod := writesPeriodIter.NumIndexed(t, testNsID, session) + numGCSeriesAfter := counterValue(t, scope, metricGCSeries) + if len(writesPeriodIter) != indexedPeriod { + assert.Equal(t, len(writesPeriodIter), indexedPeriod, + fmt.Sprintf("some metrics not indexed/visible: actual=%d, expected=%d, numGCBefore=%d, numGCAfter=%d", + writesPeriodIter.NumIndexedWithOptions(t, testNsID, session, NumIndexedOptions{Logger: log}), + len(writesPeriodIter), + numGCSeriesBefore, + numGCSeriesAfter)) + checkFailed.Inc() + } + } + + ticker := time.NewTicker(10 * time.Millisecond) + stopTickCh := make(chan struct{}) + closedTickCh := make(chan struct{}) + go func() { + defer func() { + ticker.Stop() + close(closedTickCh) + }() + + for { + select { + case <-ticker.C: + checkIndexable() + case <-stopTickCh: + return + } + } + }() + + start = time.Now() + warmFlushed := xclock.WaitUntil(func() bool { + return counterValue(t, scope, metricFlushIndex)-prevWarmFlushes > 0 + }, defaultTimeout) + counter := counterValue(t, scope, metricFlushIndex) + require.True(t, warmFlushed, + fmt.Sprintf("warm flush stats: current=%d, previous=%d", counter, prevWarmFlushes)) + log.Info("verified data has been warm flushed", zap.Duration("took", time.Since(start))) + prevWarmFlushes = counter + + start = time.Now() + log.Info("waiting for GC of series") + + expectedNumGCSeries := prevNumGCSeries + numWrites - minCompactSize + gcSeries := xclock.WaitUntil(func() bool { + // Run background compaction path to check that this path correctly + // identifies these series as "empty" post the warm flush above. + // Note: typically this path gets called from just WriteBatch calls but + // for this test we just explicitly invoke the background compact path.
+ for _, ns := range testSetup.DB().Namespaces() { + idx, err := ns.Index() + require.NoError(t, err) + + idx.BackgroundCompact() + } + + numGCSeries := counterValue(t, scope, metricGCSeries) + return numGCSeries >= expectedNumGCSeries + }, defaultTimeout) + numGCSeries := counterValue(t, scope, metricGCSeries) + require.True(t, gcSeries, + fmt.Sprintf("unexpected num gc series: actual=%d, expected=%d", + numGCSeries, expectedNumGCSeries)) + require.True(t, numGCSeries >= expectedNumGCSeries) + log.Info("verified series have been GC'd", zap.Duration("took", time.Since(start))) + prevNumGCSeries = numGCSeries + + require.Equal(t, 0, logs.Len(), "errors found in logs during flush/indexing") + + // Keep running indexable check for a few seconds, then progress next iter. + time.Sleep(5 * time.Second) + close(stopTickCh) + <-closedTickCh + + // Ensure check did not fail. + require.True(t, checkFailed.Load() == 0, + fmt.Sprintf("check indexable errors: %d", checkFailed.Load())) + } + + log.Info("checks passed") +} + +func counterValue(t *testing.T, r tally.TestScope, key string) int { + v, ok := r.Snapshot().Counters()[key] + require.True(t, ok) + return int(v.Value()) +} diff --git a/src/dbnode/integration/index_block_rotation_test.go b/src/dbnode/integration/index_block_rotation_test.go index 4c76f8544e..a18370ee7c 100644 --- a/src/dbnode/integration/index_block_rotation_test.go +++ b/src/dbnode/integration/index_block_rotation_test.go @@ -134,12 +134,20 @@ func TestIndexBlockRotation(t *testing.T) { log.Info("querying period0 results after expiry") // await for results to be empty. // in practice we've seen it take 11s, so make it 30s to be safe. 
+ time.Sleep(time.Second * 4) timeout := time.Second * 30 - empty := xclock.WaitUntil(func() bool { + noData := xclock.WaitUntil(func() bool { period0Results, _, err = session.FetchTagged(ContextWithDefaultTimeout(), md.ID(), query, index.QueryOptions{StartInclusive: t0, EndExclusive: t1}) require.NoError(t, err) - return period0Results.Len() == 0 + require.True(t, period0Results.Len() == 50, "results still indexed") + + for _, i := range period0Results.Iters() { + if i.Next() { + return false + } + } + return true }, timeout) - require.True(t, empty, "results not empty after %s", timeout) + require.True(t, noData, "data still present after %s", timeout) } diff --git a/src/dbnode/integration/index_helpers.go b/src/dbnode/integration/index_helpers.go index 2077e3512c..f1d0e407b9 100644 --- a/src/dbnode/integration/index_helpers.go +++ b/src/dbnode/integration/index_helpers.go @@ -29,14 +29,16 @@ import ( "testing" "time" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + "github.com/m3db/m3/src/dbnode/client" "github.com/m3db/m3/src/dbnode/encoding" "github.com/m3db/m3/src/dbnode/storage/index" "github.com/m3db/m3/src/m3ninx/idx" + "github.com/m3db/m3/src/query/storage/m3/consolidators" "github.com/m3db/m3/src/x/ident" xtime "github.com/m3db/m3/src/x/time" - - "github.com/stretchr/testify/require" ) // TestIndexWrites holds index writes for testing. @@ -161,6 +163,21 @@ func (w TestIndexWrites) Write(t *testing.T, ns ident.ID, s client.Session) { // NumIndexed gets number of indexed series. func (w TestIndexWrites) NumIndexed(t *testing.T, ns ident.ID, s client.Session) int { + return w.NumIndexedWithOptions(t, ns, s, NumIndexedOptions{}) +} + +// NumIndexedOptions are the options used when performing a num indexed check. +type NumIndexedOptions struct { + Logger *zap.Logger +} + +// NumIndexedWithOptions gets the number of indexed series using the given options.
+func (w TestIndexWrites) NumIndexedWithOptions( + t *testing.T, + ns ident.ID, + s client.Session, + opts NumIndexedOptions, +) int { numFound := 0 for i := 0; i < len(w); i++ { wi := w[i] @@ -173,19 +190,42 @@ func (w TestIndexWrites) NumIndexed(t *testing.T, ns ident.ID, s client.Session) SeriesLimit: 10, }) if err != nil { + if l := opts.Logger; l != nil { + l.Error("fetch tagged IDs error", zap.Error(err)) + } continue } if !iter.Next() { + if l := opts.Logger; l != nil { + l.Warn("missing result", + zap.String("queryID", wi.ID.String()), + zap.ByteString("queryTags", consolidators.MustIdentTagIteratorToTags(wi.Tags, nil).ID())) + } continue } cuNs, cuID, cuTag := iter.Current() if ns.String() != cuNs.String() { + if l := opts.Logger; l != nil { + l.Warn("namespace mismatch", + zap.String("queryNamespace", ns.String()), + zap.String("resultNamespace", cuNs.String())) + } continue } if wi.ID.String() != cuID.String() { + if l := opts.Logger; l != nil { + l.Warn("id mismatch", + zap.String("queryID", wi.ID.String()), + zap.String("resultID", cuID.String())) + } continue } if !ident.NewTagIterMatcher(wi.Tags).Matches(cuTag) { + if l := opts.Logger; l != nil { + l.Warn("tag mismatch", + zap.ByteString("queryTags", consolidators.MustIdentTagIteratorToTags(wi.Tags, nil).ID()), + zap.ByteString("resultTags", consolidators.MustIdentTagIteratorToTags(cuTag, nil).ID())) + } continue } numFound++ diff --git a/src/dbnode/integration/integration.go b/src/dbnode/integration/integration.go index 244bf89cbc..41f76ac022 100644 --- a/src/dbnode/integration/integration.go +++ b/src/dbnode/integration/integration.go @@ -374,6 +374,30 @@ func NewDefaultBootstrappableTestSetups( // nolint:gocyclo } } +func writeTestDataToDiskWithIndex( + metadata namespace.Metadata, + s TestSetup, + seriesMaps generate.SeriesBlocksByStart, +) error { + if err := writeTestDataToDisk(metadata, s, seriesMaps, 0); err != nil { + return err + } + for blockStart, series := range seriesMaps { + docs := 
generate.ToDocMetadata(series) + if err := writeTestIndexDataToDisk( + metadata, + s.StorageOpts(), + idxpersist.DefaultIndexVolumeType, + blockStart, + s.ShardSet().AllIDs(), + docs, + ); err != nil { + return err + } + } + return nil +} + func writeTestDataToDisk( metadata namespace.Metadata, setup TestSetup, diff --git a/src/dbnode/integration/peers_bootstrap_high_concurrency_test.go b/src/dbnode/integration/peers_bootstrap_high_concurrency_test.go index 15b19eccc9..9c7e1f5c60 100644 --- a/src/dbnode/integration/peers_bootstrap_high_concurrency_test.go +++ b/src/dbnode/integration/peers_bootstrap_high_concurrency_test.go @@ -32,6 +32,7 @@ import ( "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/storage/index" "github.com/m3db/m3/src/m3ninx/idx" + idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/ident" xtest "github.com/m3db/m3/src/x/test" xtime "github.com/m3db/m3/src/x/time" @@ -130,7 +131,7 @@ func testPeersBootstrapHighConcurrency( }, } numPoints := 10 - seriesMaps := generate.BlocksByStart(blockConfigs( + blockConfigs := blockConfigs( generateTaggedBlockConfigs(generateTaggedBlockConfig{ series: numSeries, numPoints: numPoints, @@ -155,10 +156,23 @@ func testPeersBootstrapHighConcurrency( commonTags: commonTags, blockStart: now, }), - )) + ) + seriesMaps := generate.BlocksByStart(blockConfigs) err = writeTestDataToDisk(namesp, setups[0], seriesMaps, 0) require.NoError(t, err) + for blockStart, series := range seriesMaps { + docs := generate.ToDocMetadata(series) + require.NoError(t, writeTestIndexDataToDisk( + namesp, + setups[0].StorageOpts(), + idxpersist.DefaultIndexVolumeType, + blockStart, + setups[0].ShardSet().AllIDs(), + docs, + )) + } + // Start the first server with filesystem bootstrapper require.NoError(t, setups[0].StartServer()) diff --git a/src/dbnode/integration/peers_bootstrap_index_aggregate_test.go b/src/dbnode/integration/peers_bootstrap_index_aggregate_test.go index 
c5b693f7c5..67e5f6e5c3 100644 --- a/src/dbnode/integration/peers_bootstrap_index_aggregate_test.go +++ b/src/dbnode/integration/peers_bootstrap_index_aggregate_test.go @@ -31,6 +31,7 @@ import ( "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/storage/index" "github.com/m3db/m3/src/m3ninx/idx" + idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/ident" xtest "github.com/m3db/m3/src/x/test" @@ -122,6 +123,18 @@ func TestPeersBootstrapIndexAggregateQuery(t *testing.T) { }) require.NoError(t, writeTestDataToDisk(ns1, setups[0], seriesMaps, 0)) + for blockStart, series := range seriesMaps { + docs := generate.ToDocMetadata(series) + require.NoError(t, writeTestIndexDataToDisk( + ns1, + setups[0].StorageOpts(), + idxpersist.DefaultIndexVolumeType, + blockStart, + setups[0].ShardSet().AllIDs(), + docs, + )) + } + // Start the first server with filesystem bootstrapper require.NoError(t, setups[0].StartServer()) diff --git a/src/dbnode/integration/peers_bootstrap_index_test.go b/src/dbnode/integration/peers_bootstrap_index_test.go index 0a02a0943c..ba562924e9 100644 --- a/src/dbnode/integration/peers_bootstrap_index_test.go +++ b/src/dbnode/integration/peers_bootstrap_index_test.go @@ -34,6 +34,7 @@ import ( "github.com/m3db/m3/src/dbnode/storage/index" "github.com/m3db/m3/src/m3ninx/generated/proto/fswriter" "github.com/m3db/m3/src/m3ninx/idx" + idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/ident" xtest "github.com/m3db/m3/src/x/test" xtime "github.com/m3db/m3/src/x/time" @@ -138,6 +139,18 @@ func TestPeersBootstrapIndexWithIndexingEnabled(t *testing.T) { }) require.NoError(t, writeTestDataToDisk(ns1, setups[0], seriesMaps, 0)) + for blockStart, series := range seriesMaps { + docs := generate.ToDocMetadata(series) + require.NoError(t, writeTestIndexDataToDisk( + ns1, + setups[0].StorageOpts(), + idxpersist.DefaultIndexVolumeType, + blockStart, + setups[0].ShardSet().AllIDs(), + docs, + )) + } 
+ // Start the first server with filesystem bootstrapper require.NoError(t, setups[0].StartServer()) diff --git a/src/dbnode/integration/peers_bootstrap_partial_data_test.go b/src/dbnode/integration/peers_bootstrap_partial_data_test.go index 801d9dd14e..56e2610e49 100644 --- a/src/dbnode/integration/peers_bootstrap_partial_data_test.go +++ b/src/dbnode/integration/peers_bootstrap_partial_data_test.go @@ -85,7 +85,7 @@ func TestPeersBootstrapPartialData(t *testing.T) { {IDs: []string{"foo", "baz"}, NumPoints: 90, Start: now}, } seriesMaps := generate.BlocksByStart(inputData) - require.NoError(t, writeTestDataToDisk(namesp, setups[0], seriesMaps, 0)) + require.NoError(t, writeTestDataToDiskWithIndex(namesp, setups[0], seriesMaps)) // Write a subset of blocks to second node, simulating an incomplete peer bootstrap. partialBlockStarts := map[xtime.UnixNano]struct{}{ diff --git a/src/dbnode/integration/setup.go b/src/dbnode/integration/setup.go index 194f646041..ee30253df7 100644 --- a/src/dbnode/integration/setup.go +++ b/src/dbnode/integration/setup.go @@ -32,6 +32,12 @@ import ( "testing" "time" + "github.com/stretchr/testify/require" + "github.com/uber-go/tally" + "github.com/uber/tchannel-go" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "github.com/m3db/m3/src/cluster/services" "github.com/m3db/m3/src/cluster/shard" "github.com/m3db/m3/src/dbnode/client" @@ -60,14 +66,9 @@ import ( "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/x/clock" "github.com/m3db/m3/src/x/ident" + "github.com/m3db/m3/src/x/instrument" xsync "github.com/m3db/m3/src/x/sync" xtime "github.com/m3db/m3/src/x/time" - - "github.com/stretchr/testify/require" - "github.com/uber-go/tally" - "github.com/uber/tchannel-go" - "go.uber.org/zap" - "go.uber.org/zap/zapcore" ) var ( @@ -105,6 +106,7 @@ type testSetup struct { db cluster.Database storageOpts storage.Options + instrumentOpts instrument.Options serverStorageOpts server.StorageOptions fsOpts fs.Options blockLeaseManager 
block.LeaseManager @@ -275,8 +277,8 @@ func NewTestSetup( zap.Stringer("cache-policy", storageOpts.SeriesCachePolicy()), } logger = logger.With(fields...) - iOpts := storageOpts.InstrumentOptions() - storageOpts = storageOpts.SetInstrumentOptions(iOpts.SetLogger(logger)) + instrumentOpts := storageOpts.InstrumentOptions().SetLogger(logger) + storageOpts = storageOpts.SetInstrumentOptions(instrumentOpts) indexMode := index.InsertSync if opts.WriteNewSeriesAsync() { @@ -284,7 +286,7 @@ func NewTestSetup( } plCache, err := index.NewPostingsListCache(10, index.PostingsListCacheOptions{ - InstrumentOptions: iOpts, + InstrumentOptions: instrumentOpts, }) if err != nil { return nil, fmt.Errorf("unable to create postings list cache: %v", err) @@ -332,7 +334,8 @@ func NewTestSetup( } } - adminClient, verificationAdminClient, err := newClients(topoInit, opts, schemaReg, id, tchannelNodeAddr) + adminClient, verificationAdminClient, err := newClients(topoInit, opts, + schemaReg, id, tchannelNodeAddr, instrumentOpts) if err != nil { return nil, err } @@ -492,6 +495,7 @@ func NewTestSetup( scope: scope, storageOpts: storageOpts, blockLeaseManager: blockLeaseManager, + instrumentOpts: instrumentOpts, fsOpts: fsOpts, hostID: id, origin: newOrigin(id, tchannelNodeAddr), @@ -944,8 +948,9 @@ func (ts *testSetup) httpDebugAddr() string { func (ts *testSetup) MaybeResetClients() error { if ts.m3dbClient == nil { // Recreate the clients as their session was destroyed by StopServer() - adminClient, verificationAdminClient, err := newClients( - ts.topoInit, ts.opts, ts.schemaReg, ts.hostID, ts.tchannelNodeAddr()) + adminClient, verificationAdminClient, err := newClients(ts.topoInit, + ts.opts, ts.schemaReg, ts.hostID, ts.tchannelNodeAddr(), + ts.instrumentOpts) if err != nil { return err } @@ -1013,7 +1018,8 @@ func (ts *testSetup) InitializeBootstrappers(opts InitializeBootstrappersOptions SetIndexOptions(storageIdxOpts). SetPersistManager(persistMgr). 
SetIndexClaimsManager(storageOpts.IndexClaimsManager()). - SetCompactor(compactor) + SetCompactor(compactor). + SetInstrumentOptions(storageOpts.InstrumentOptions()) bs, err = bfs.NewFileSystemBootstrapperProvider(bfsOpts, bs) if err != nil { return err @@ -1049,8 +1055,8 @@ func newClients( topoInit topology.Initializer, opts TestOptions, schemaReg namespace.SchemaRegistry, - id, - tchannelNodeAddr string, + id, tchannelNodeAddr string, + instrumentOpts instrument.Options, ) (client.AdminClient, client.AdminClient, error) { var ( clientOpts = defaultClientOptions(topoInit).SetClusterConnectTimeout( @@ -1058,7 +1064,8 @@ func newClients( SetFetchRequestTimeout(opts.FetchRequestTimeout()). SetWriteConsistencyLevel(opts.WriteConsistencyLevel()). SetTopologyInitializer(topoInit). - SetUseV2BatchAPIs(true) + SetUseV2BatchAPIs(true). + SetInstrumentOptions(instrumentOpts) origin = newOrigin(id, tchannelNodeAddr) verificationOrigin = newOrigin(id+"-verification", tchannelNodeAddr) diff --git a/src/dbnode/storage/series/lookup/entry.go b/src/dbnode/storage/entry.go similarity index 73% rename from src/dbnode/storage/series/lookup/entry.go rename to src/dbnode/storage/entry.go index 2291f0f230..65c300078f 100644 --- a/src/dbnode/storage/series/lookup/entry.go +++ b/src/dbnode/storage/entry.go @@ -18,18 +18,21 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
-package lookup +package storage import ( "sync" "sync/atomic" "time" + xatomic "go.uber.org/atomic" + "github.com/m3db/m3/src/dbnode/storage/block" "github.com/m3db/m3/src/dbnode/storage/bootstrap" "github.com/m3db/m3/src/dbnode/storage/index" "github.com/m3db/m3/src/dbnode/storage/series" "github.com/m3db/m3/src/dbnode/ts/writes" + "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/x/clock" "github.com/m3db/m3/src/x/context" xtime "github.com/m3db/m3/src/x/time" @@ -58,8 +61,10 @@ type IndexWriter interface { // members to track lifecycle and minimize indexing overhead. // NB: users are expected to use `NewEntry` to construct these objects. type Entry struct { + Shard Shard Series series.DatabaseSeries Index uint64 + IndexGarbageCollected *xatomic.Bool indexWriter IndexWriter curReadWriters int32 reverseIndex entryIndexState @@ -67,8 +72,8 @@ type Entry struct { pendingIndexBatchSizeOne []writes.PendingIndexInsert } -// ensure Entry satisfies the `index.OnIndexSeries` interface. -var _ index.OnIndexSeries = &Entry{} +// ensure Entry satisfies the `doc.OnIndexSeries` interface. +var _ doc.OnIndexSeries = &Entry{} // ensure Entry satisfies the `bootstrap.SeriesRef` interface. var _ bootstrap.SeriesRef = &Entry{} @@ -78,6 +83,7 @@ var _ bootstrap.SeriesRefResolver = &Entry{} // NewEntryOptions supplies options for a new entry. 
type NewEntryOptions struct { + Shard Shard Series series.DatabaseSeries Index uint64 IndexWriter IndexWriter @@ -91,13 +97,15 @@ func NewEntry(opts NewEntryOptions) *Entry { nowFn = opts.NowFn } entry := &Entry{ + Shard: opts.Shard, Series: opts.Series, Index: opts.Index, + IndexGarbageCollected: xatomic.NewBool(false), indexWriter: opts.IndexWriter, nowFn: nowFn, pendingIndexBatchSizeOne: make([]writes.PendingIndexInsert, 1), + reverseIndex: newEntryIndexState(), } - entry.reverseIndex.states = entry.reverseIndex._staticAloc[:0] return entry } @@ -116,6 +124,14 @@ func (entry *Entry) DecrementReaderWriterCount() { atomic.AddInt32(&entry.curReadWriters, -1) } +// IndexedBlockCount returns the count of indexed block states. +func (entry *Entry) IndexedBlockCount() int { + entry.reverseIndex.RLock() + count := len(entry.reverseIndex.states) + entry.reverseIndex.RUnlock() + return count +} + // IndexedForBlockStart returns a bool to indicate if the Entry has been successfully // indexed for the given index blockstart. func (entry *Entry) IndexedForBlockStart(indexBlockStart xtime.UnixNano) bool { @@ -166,7 +182,10 @@ func (entry *Entry) NeedsIndexUpdate(indexBlockStartForWrite xtime.UnixNano) boo // NB(prateek): we retain the ref count on the entry while the indexing is pending, // the callback executed on the entry once the indexing is completed releases this // reference. -func (entry *Entry) OnIndexPrepare() { +func (entry *Entry) OnIndexPrepare(blockStartNanos xtime.UnixNano) { + entry.reverseIndex.Lock() + entry.reverseIndex.setAttemptWithWLock(blockStartNanos, true) + entry.reverseIndex.Unlock() entry.IncrementReaderWriterCount() } @@ -187,6 +206,73 @@ func (entry *Entry) OnIndexFinalize(blockStartNanos xtime.UnixNano) { entry.DecrementReaderWriterCount() } +// IfAlreadyIndexedMarkIndexSuccessAndFinalize marks the entry as successfully +// indexed if already indexed and returns true. Otherwise returns false. 
+func (entry *Entry) IfAlreadyIndexedMarkIndexSuccessAndFinalize( + blockStart xtime.UnixNano, +) bool { + successAlready := false + entry.reverseIndex.Lock() + for _, state := range entry.reverseIndex.states { + if state.success { + successAlready = true + break + } + } + if successAlready { + entry.reverseIndex.setSuccessWithWLock(blockStart) + entry.reverseIndex.setAttemptWithWLock(blockStart, false) + } + entry.reverseIndex.Unlock() + if successAlready { + // indicate the index has released held reference for provided write + entry.DecrementReaderWriterCount() + } + return successAlready +} + +// TryMarkIndexGarbageCollected checks if the entry is eligible to be garbage collected +// from the index. If so, it marks the entry as GCed and returns true. Otherwise returns false. +func (entry *Entry) TryMarkIndexGarbageCollected() bool { + return entry.checkNeedsIndexGarbageCollected(true) +} + +// NeedsIndexGarbageCollected checks if the entry is eligible to be garbage collected +// from the index. If so, it returns true without marking the entry as GCed. Otherwise returns false. +func (entry *Entry) NeedsIndexGarbageCollected() bool { + return entry.checkNeedsIndexGarbageCollected(false) +} + +func (entry *Entry) checkNeedsIndexGarbageCollected(mark bool) bool { + // Since series insertions + index insertions are done separately async, it is possible for + // a series to be in the index but not have data written yet, and so any series not in the + // lookup yet we cannot yet consider empty. + e, _, err := entry.Shard.TryRetrieveSeriesAndIncrementReaderWriterCount(entry.Series.ID()) + if err != nil || e == nil { + return false + } + defer e.DecrementReaderWriterCount() + + // Consider non-empty if the entry is still being held since this could indicate + // another thread holding a new series prior to writing to it. + if e.ReaderWriterCount() > 1 { + return false + } + + // Series must be empty to be GCed.
This happens when the data and index are flushed to disk and + // so the series no longer has in-mem data. + if !e.Series.IsEmpty() { + return false + } + + if mark { + // Mark as GCed from index so the entry can be safely cleaned up elsewhere. + entry.IndexGarbageCollected.Store(true) + } + + return true +} + // Write writes a new value. func (entry *Entry) Write( ctx context.Context, @@ -243,7 +329,7 @@ func (entry *Entry) maybeIndex(timestamp xtime.UnixNano) error { }, Document: entry.Series.Metadata(), } - entry.OnIndexPrepare() + entry.OnIndexPrepare(idx.BlockStartForWriteTime(timestamp)) return idx.WritePending(entry.pendingIndexBatchSizeOne) } @@ -271,96 +357,64 @@ func (entry *Entry) ReleaseRef() error { // have a write for the 12-2p block from the 2-4p block, or we'd drop the late write. type entryIndexState struct { sync.RWMutex - states []entryIndexBlockState - - // NB(prateek): we alloc an array (not slice) of size 3, as that is - // the most we will need (only 3 blocks should ever be written to - // simultaneously in the worst case). We allocate it like we're doing - // to ensure it's along side the rest of the struct in memory. But - // we only access it through `states`, to ensure that it can be - // grown/shrunk as needed. Do not acccess it directly. - _staticAloc [3]entryIndexBlockState + states map[xtime.UnixNano]entryIndexBlockState } // entryIndexBlockState is used to capture the state of indexing for a single shard // entry for a given index block start. It's used to prevent attempts at double indexing // for the same block start. 
type entryIndexBlockState struct { - blockStart xtime.UnixNano - attempt bool - success bool + attempt bool + success bool +} + +func newEntryIndexState() entryIndexState { + return entryIndexState{ + states: make(map[xtime.UnixNano]entryIndexBlockState, 4), + } } func (s *entryIndexState) indexedWithRLock(t xtime.UnixNano) bool { - for i := range s.states { - if s.states[i].blockStart.Equal(t) { - return s.states[i].success - } + v, ok := s.states[t] + if ok { + return v.success } return false } func (s *entryIndexState) indexedOrAttemptedWithRLock(t xtime.UnixNano) bool { - for i := range s.states { - if s.states[i].blockStart.Equal(t) { - return s.states[i].success || s.states[i].attempt - } + v, ok := s.states[t] + if ok { + return v.success || v.attempt } return false } func (s *entryIndexState) setSuccessWithWLock(t xtime.UnixNano) { - for i := range s.states { - if s.states[i].blockStart.Equal(t) { - s.states[i].success = true - return - } + if s.indexedWithRLock(t) { + return } // NB(r): If not inserted state yet that means we need to make an insertion, // this will happen if synchronously indexing and we haven't called // NeedIndexUpdate before we indexed the series. - s.insertBlockState(entryIndexBlockState{ - blockStart: t, - success: true, - }) + s.states[t] = entryIndexBlockState{ + success: true, + } } func (s *entryIndexState) setAttemptWithWLock(t xtime.UnixNano, attempt bool) { - // first check if we have the block start in the slice already - for i := range s.states { - if s.states[i].blockStart.Equal(t) { - s.states[i].attempt = attempt - return + v, ok := s.states[t] + if ok { + if v.success { + return // Attempt is not relevant if success. } - } - - s.insertBlockState(entryIndexBlockState{ - blockStart: t, - attempt: attempt, - }) -} - -func (s *entryIndexState) insertBlockState(newState entryIndexBlockState) { - // i.e. we don't have the block start in the slice - // if we have less than 3 elements, we can just insert an element to the slice. 
- if len(s.states) < 3 { - s.states = append(s.states, newState) + v.attempt = attempt + s.states[t] = v return } - // i.e. len(s.states) == 3, in this case, we update the entry with the lowest block start - // as we know only 3 writes can be active at any point. Think of this as a lazy compaction. - var ( - minIdx = -1 - minBlockStart = xtime.UnixNano(maxInt64) - ) - for idx, blockState := range s.states { - if blockState.blockStart < minBlockStart { - minIdx = idx - minBlockStart = blockState.blockStart - } + s.states[t] = entryIndexBlockState{ + attempt: attempt, } - - s.states[minIdx] = newState } diff --git a/src/dbnode/storage/series/lookup/entry_blackbox_test.go b/src/dbnode/storage/entry_blackbox_test.go similarity index 90% rename from src/dbnode/storage/series/lookup/entry_blackbox_test.go rename to src/dbnode/storage/entry_blackbox_test.go index 54ffdd8333..97447e4fff 100644 --- a/src/dbnode/storage/series/lookup/entry_blackbox_test.go +++ b/src/dbnode/storage/entry_blackbox_test.go @@ -18,14 +18,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
-package lookup_test +package storage import ( "sync" "testing" "time" - "github.com/m3db/m3/src/dbnode/storage/series/lookup" xtime "github.com/m3db/m3/src/x/time" "github.com/fortytw2/leaktest" @@ -43,7 +42,7 @@ func newTime(n int) xtime.UnixNano { } func TestEntryReaderWriterCount(t *testing.T) { - e := lookup.NewEntry(lookup.NewEntryOptions{}) + e := NewEntry(NewEntryOptions{}) require.Equal(t, int32(0), e.ReaderWriterCount()) e.IncrementReaderWriterCount() @@ -54,12 +53,12 @@ func TestEntryReaderWriterCount(t *testing.T) { } func TestEntryIndexSuccessPath(t *testing.T) { - e := lookup.NewEntry(lookup.NewEntryOptions{}) + e := NewEntry(NewEntryOptions{}) t0 := newTime(0) require.False(t, e.IndexedForBlockStart(t0)) require.True(t, e.NeedsIndexUpdate(t0)) - e.OnIndexPrepare() + e.OnIndexPrepare(t0) e.OnIndexSuccess(t0) e.OnIndexFinalize(t0) @@ -69,12 +68,12 @@ func TestEntryIndexSuccessPath(t *testing.T) { } func TestEntryIndexFailPath(t *testing.T) { - e := lookup.NewEntry(lookup.NewEntryOptions{}) + e := NewEntry(NewEntryOptions{}) t0 := newTime(0) require.False(t, e.IndexedForBlockStart(t0)) require.True(t, e.NeedsIndexUpdate(t0)) - e.OnIndexPrepare() + e.OnIndexPrepare(t0) e.OnIndexFinalize(t0) require.False(t, e.IndexedForBlockStart(t0)) @@ -85,7 +84,7 @@ func TestEntryIndexFailPath(t *testing.T) { func TestEntryMultipleGoroutinesRaceIndexUpdate(t *testing.T) { defer leaktest.CheckTimeout(t, time.Second)() - e := lookup.NewEntry(lookup.NewEntryOptions{}) + e := NewEntry(NewEntryOptions{}) t0 := newTime(0) require.False(t, e.IndexedForBlockStart(t0)) diff --git a/src/dbnode/storage/series/lookup/entry_whitebox_test.go b/src/dbnode/storage/entry_whitebox_test.go similarity index 86% rename from src/dbnode/storage/series/lookup/entry_whitebox_test.go rename to src/dbnode/storage/entry_whitebox_test.go index cac980b835..34a552cfb0 100644 --- a/src/dbnode/storage/series/lookup/entry_whitebox_test.go +++ b/src/dbnode/storage/entry_whitebox_test.go @@ -18,7 +18,7 
@@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -package lookup +package storage import ( "testing" @@ -35,23 +35,12 @@ import ( "github.com/stretchr/testify/require" ) -var ( - initTime = time.Date(2018, time.May, 12, 15, 55, 0, 0, time.UTC) - testBlockSize = 24 * time.Hour -) - -func newTime(n int) xtime.UnixNano { - t := initTime.Truncate(testBlockSize).Add(time.Duration(n) * testBlockSize) - return xtime.ToUnixNano(t) -} - func TestEntryIndexAttemptRotatesSlice(t *testing.T) { e := NewEntry(NewEntryOptions{}) - require.Equal(t, 3, cap(e.reverseIndex.states)) for i := 0; i < 10; i++ { ti := newTime(i) require.True(t, e.NeedsIndexUpdate(ti)) - require.Equal(t, 3, cap(e.reverseIndex.states)) + require.Equal(t, i+1, e.IndexedBlockCount()) } // ensure only the latest ones are held on to @@ -66,7 +55,9 @@ func TestEntryIndexSeriesRef(t *testing.T) { now := time.Now() blockStart := newTime(0) mockIndexWriter := NewMockIndexWriter(ctrl) - mockIndexWriter.EXPECT().BlockStartForWriteTime(blockStart).Return(blockStart) + mockIndexWriter.EXPECT().BlockStartForWriteTime(blockStart). + Return(blockStart). + Times(2) mockSeries := series.NewMockDatabaseSeries(ctrl) mockSeries.EXPECT().Metadata().Return(doc.Metadata{}) diff --git a/src/dbnode/storage/flush.go b/src/dbnode/storage/flush.go index 242c676d83..7e206ffa71 100644 --- a/src/dbnode/storage/flush.go +++ b/src/dbnode/storage/flush.go @@ -144,7 +144,7 @@ func (m *flushManager) Flush(startTime xtime.UnixNano) error { // will attempt to snapshot blocks w/ unflushed data which would be wasteful if // the block is already flushable. 
multiErr := xerrors.NewMultiError() - if err = m.dataWarmFlush(namespaces, startTime); err != nil { + if err := m.dataWarmFlush(namespaces, startTime); err != nil { multiErr = multiErr.Add(err) } @@ -159,7 +159,7 @@ func (m *flushManager) Flush(startTime xtime.UnixNano) error { multiErr = multiErr.Add(fmt.Errorf("error rotating commitlog in mediator tick: %v", err)) } - if err = m.indexFlush(namespaces); err != nil { + if err := m.indexFlush(namespaces); err != nil { multiErr = multiErr.Add(err) } @@ -187,8 +187,7 @@ func (m *flushManager) dataWarmFlush( multiErr = multiErr.Add(err) continue } - err = m.flushNamespaceWithTimes(ns, flushTimes, flushPersist) - if err != nil { + if err := m.flushNamespaceWithTimes(ns, flushTimes, flushPersist); err != nil { multiErr = multiErr.Add(err) } } @@ -272,7 +271,10 @@ func (m *flushManager) indexFlush( if !indexEnabled { continue } - multiErr = multiErr.Add(ns.FlushIndex(indexFlush)) + + if err := ns.FlushIndex(indexFlush); err != nil { + multiErr = multiErr.Add(err) + } } multiErr = multiErr.Add(indexFlush.DoneIndex()) diff --git a/src/dbnode/storage/flush_test.go b/src/dbnode/storage/flush_test.go index 667c1f05ff..eff5efae85 100644 --- a/src/dbnode/storage/flush_test.go +++ b/src/dbnode/storage/flush_test.go @@ -315,12 +315,16 @@ func TestFlushManagerSkipNamespaceIndexingDisabled(t *testing.T) { defer ctrl.Finish() nsOpts := defaultTestNs1Opts.SetIndexOptions(namespace.NewIndexOptions().SetEnabled(false)) + s1 := NewMockdatabaseShard(ctrl) + s2 := NewMockdatabaseShard(ctrl) ns := NewMockdatabaseNamespace(ctrl) ns.EXPECT().Options().Return(nsOpts).AnyTimes() ns.EXPECT().ID().Return(defaultTestNs1ID).AnyTimes() ns.EXPECT().NeedsFlush(gomock.Any(), gomock.Any()).Return(true, nil).AnyTimes() ns.EXPECT().WarmFlush(gomock.Any(), gomock.Any()).Return(nil).AnyTimes() ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + s1.EXPECT().ID().Return(uint32(1)).AnyTimes() + 
s2.EXPECT().ID().Return(uint32(2)).AnyTimes() var ( mockFlushPersist = persist.NewMockFlushPreparer(ctrl) @@ -358,13 +362,23 @@ func TestFlushManagerNamespaceIndexingEnabled(t *testing.T) { defer ctrl.Finish() nsOpts := defaultTestNs1Opts.SetIndexOptions(namespace.NewIndexOptions().SetEnabled(true)) + s1 := NewMockdatabaseShard(ctrl) + s2 := NewMockdatabaseShard(ctrl) ns := NewMockdatabaseNamespace(ctrl) ns.EXPECT().Options().Return(nsOpts).AnyTimes() ns.EXPECT().ID().Return(defaultTestNs1ID).AnyTimes() ns.EXPECT().NeedsFlush(gomock.Any(), gomock.Any()).Return(true, nil).AnyTimes() - ns.EXPECT().WarmFlush(gomock.Any(), gomock.Any()).Return(nil).AnyTimes() - ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() - ns.EXPECT().FlushIndex(gomock.Any()).Return(nil) + s1.EXPECT().ID().Return(uint32(1)).AnyTimes() + s2.EXPECT().ID().Return(uint32(2)).AnyTimes() + + // Validate that the flush state is marked as successful only AFTER all prerequisite steps have been run. + // Order is important to avoid any edge case where data is GCed from memory without all flushing operations + // being completed. + gomock.InOrder( + ns.EXPECT().WarmFlush(gomock.Any(), gomock.Any()).Return(nil).AnyTimes(), + ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes(), + ns.EXPECT().FlushIndex(gomock.Any()).Return(nil), + ) var ( mockFlushPersist = persist.NewMockFlushPreparer(ctrl) diff --git a/src/dbnode/storage/fs.go b/src/dbnode/storage/fs.go index c6f99366df..7c247b0a65 100644 --- a/src/dbnode/storage/fs.go +++ b/src/dbnode/storage/fs.go @@ -39,12 +39,17 @@ const ( fileOpFailed ) +type warmStatus struct { + DataFlushed fileOpStatus + IndexFlushed fileOpStatus +} + type fileOpState struct { // WarmStatus is the status of data persistence for WarmWrites only. // Each block will only be warm-flushed once, so not keeping track of a // version here is okay.
This is used in the buffer Tick to determine when // a warm bucket is evictable from memory. - WarmStatus fileOpStatus + WarmStatus warmStatus // ColdVersionRetrievable keeps track of data persistence for ColdWrites only. // Each block can be cold-flushed multiple times, so this tracks which // version of the flush completed successfully. This is ultimately used in diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 5fb4060bf5..d1780adfea 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -137,6 +137,8 @@ type nsIndex struct { doNotIndexWithFields []doc.Field shardSet sharding.ShardSet + + activeBlock index.Block } type nsIndexState struct { @@ -160,7 +162,9 @@ type nsIndexState struct { // NB: `blockStartsDescOrder` contains the keys from the map `blocksByTime` in reverse // chronological order. This is used at query time to enforce determinism about results // returned. - blockStartsDescOrder []xtime.UnixNano + // NB(r): Reference to this slice can be safely taken for iteration purposes + // for Query(..) since it is rebuilt each time and immutable once built. + blocksDescOrderImmutable []blockAndBlockStart // shardsFilterID is set every time the shards change to correctly // only return IDs that this node owns. 
@@ -173,6 +177,11 @@ type nsIndexState struct { shardsAssigned map[uint32]struct{} } +type blockAndBlockStart struct { + block index.Block + blockStart xtime.UnixNano +} + // NB: nsIndexRuntimeOptions does not contain its own mutex as some of the variables // are needed for each index write which already at least acquires read lock from // nsIndex mutex, so to keep the lock acquisitions to a minimum these are protected @@ -377,6 +386,15 @@ func newNamespaceIndexWithOptions( shardSet: shardSet, } + activeBlock, err := idx.newBlockFn(xtime.UnixNano(0), idx.nsMetadata, + index.BlockOptions{ActiveBlock: true}, idx.namespaceRuntimeOptsMgr, + idx.opts.IndexOptions()) + if err != nil { + return nil, idx.unableToAllocBlockInvariantError(err) + } + + idx.activeBlock = activeBlock + // Assign shard set upfront. idx.AssignShardSet(shardSet) @@ -532,13 +550,8 @@ func (i *nsIndex) reportStats() error { // iterate known blocks in a defined order of time (newest first) // for debug log ordering - for _, start := range i.state.blockStartsDescOrder { - block, ok := i.state.blocksByTime[start] - if !ok { - return i.missingBlockInvariantError(start) - } - - err := block.Stats(reporter) + for _, b := range i.state.blocksDescOrderImmutable { + err := b.block.Stats(reporter) if err == index.ErrUnableReportStatsBlockClosed { // Closed blocks are temporarily in the list still continue @@ -547,6 +560,10 @@ func (i *nsIndex) reportStats() error { return err } } + // Active block should always be open. + if err := i.activeBlock.Stats(reporter); err != nil { + return err + } // Update level stats. 
for _, elem := range []struct { @@ -576,7 +593,11 @@ func (i *nsIndex) BlockStartForWriteTime(writeTime xtime.UnixNano) xtime.UnixNan } func (i *nsIndex) BlockForBlockStart(blockStart xtime.UnixNano) (index.Block, error) { - return i.ensureBlockPresent(blockStart) + result, err := i.ensureBlockPresent(blockStart) + if err != nil { + return nil, err + } + return result.block, nil } // NB(prateek): including the call chains leading to this point: @@ -603,6 +624,12 @@ func (i *nsIndex) BlockForBlockStart(blockStart xtime.UnixNano) (index.Block, er func (i *nsIndex) WriteBatch( batch *index.WriteBatch, ) error { + // Filter anything with a pending index out before acquiring lock. + batch.MarkUnmarkedIfAlreadyIndexedSuccessAndFinalize() + if !batch.PendingAny() { + return nil + } + i.state.RLock() if !i.isOpenWithRLock() { i.state.RUnlock() @@ -645,6 +672,21 @@ func (i *nsIndex) WriteBatch( func (i *nsIndex) WritePending( pending []writes.PendingIndexInsert, ) error { + // Filter anything with a pending index out before acquiring lock. + incoming := pending + pending = pending[:0] + for j := range incoming { + t := i.BlockStartForWriteTime(incoming[j].Entry.Timestamp) + if incoming[j].Entry.OnIndexSeries.IfAlreadyIndexedMarkIndexSuccessAndFinalize(t) { + continue + } + // Continue to add this element. 
+ pending = append(pending, incoming[j]) + } + if len(pending) == 0 { + return nil + } + i.state.RLock() if !i.isOpenWithRLock() { i.state.RUnlock() @@ -755,7 +797,8 @@ func (i *nsIndex) writeBatches( if entry.OnIndexSeries.NeedsIndexUpdate(forwardEntryTimestamp) { forwardIndexEntry := entry forwardIndexEntry.Timestamp = forwardEntryTimestamp - forwardIndexEntry.OnIndexSeries.OnIndexPrepare() + t := i.BlockStartForWriteTime(forwardEntryTimestamp) + forwardIndexEntry.OnIndexSeries.OnIndexPrepare(t) forwardIndexBatch.Append(forwardIndexEntry, d) } } else { @@ -791,30 +834,14 @@ func (i *nsIndex) writeBatchForBlockStart( pending := batch.PendingEntries() numPending := len(pending) - // NB(r): Notice we acquire each lock only to take a reference to the - // block we release it so we don't block the tick, etc when we insert - // batches since writing batches can take significant time when foreground - // compaction occurs. - block, err := i.ensureBlockPresent(blockStart) - if err != nil { - batch.MarkUnmarkedEntriesError(err) - i.logger.Error("unable to write to index, dropping inserts", - zap.Time("blockStart", blockStart.ToTime()), - zap.Int("numWrites", batch.Len()), - zap.Error(err), - ) - i.metrics.asyncInsertErrors.Inc(int64(numPending)) - return - } - // Track attempted write. // Note: attemptTotal should = attemptSkip + attemptWrite. i.metrics.asyncInsertAttemptWrite.Inc(int64(numPending)) // i.e. we have the block and the inserts, perform the writes. - result, err := block.WriteBatch(batch) + result, err := i.activeBlock.WriteBatch(batch) - // record the end to end indexing latency + // Record the end to end indexing latency. now := i.nowFn() for idx := range pending { took := now.Sub(pending[idx].EnqueuedAt) @@ -827,6 +854,14 @@ func (i *nsIndex) writeBatchForBlockStart( i.metrics.asyncInsertSuccess.Inc(n) } + // Record mutable segments count foreground/background if latest block. 
+ if stats := result.MutableSegmentsStats; !stats.Empty() { + i.metrics.latestBlockNumSegmentsForeground.Update(float64(stats.Foreground.NumSegments)) + i.metrics.latestBlockNumDocsForeground.Update(float64(stats.Foreground.NumDocs)) + i.metrics.latestBlockNumSegmentsBackground.Update(float64(stats.Background.NumSegments)) + i.metrics.latestBlockNumDocsBackground.Update(float64(stats.Background.NumDocs)) + } + // Allow for duplicate write errors since due to re-indexing races // we may try to re-index a series more than once. if err := i.sanitizeAllowDuplicatesWriteError(err); err != nil { @@ -863,12 +898,12 @@ func (i *nsIndex) Bootstrap( var multiErr xerrors.MultiError for blockStart, blockResults := range bootstrapResults { - block, err := i.ensureBlockPresentWithRLock(blockStart) + blockResult, err := i.ensureBlockPresentWithRLock(blockStart) if err != nil { // should never happen multiErr = multiErr.Add(i.unableToAllocBlockInvariantError(err)) continue } - if err := block.AddResults(blockResults); err != nil { + if err := blockResult.block.AddResults(blockResults); err != nil { multiErr = multiErr.Add(err) } } @@ -883,37 +918,24 @@ func (i *nsIndex) Bootstrapped() bool { return result } -func (i *nsIndex) Tick(c context.Cancellable, startTime xtime.UnixNano) (namespaceIndexTickResult, error) { - var ( - result = namespaceIndexTickResult{} - earliestBlockStartToRetain = retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, startTime) - ) - - i.state.Lock() - defer func() { - i.updateBlockStartsWithLock() - i.state.Unlock() - }() +func (i *nsIndex) Tick( + c context.Cancellable, + startTime xtime.UnixNano, +) (namespaceIndexTickResult, error) { + var result namespaceIndexTickResult - result.NumBlocks = int64(len(i.state.blocksByTime)) + // First collect blocks and acquire lock to remove those that need removing + // but then release lock so can Tick and do other expensive tasks + // such as notify of sealed blocks. 
+ tickingBlocks, multiErr := i.tickingBlocks(startTime) - var multiErr xerrors.MultiError - for blockStart, block := range i.state.blocksByTime { + result.NumBlocks = int64(tickingBlocks.totalBlocks) + for _, block := range tickingBlocks.tickingBlocks { if c.IsCancelled() { multiErr = multiErr.Add(errDbIndexTerminatingTickCancellation) return result, multiErr.FinalError() } - // drop any blocks past the retention period - if blockStart.Before(earliestBlockStartToRetain) { - multiErr = multiErr.Add(block.Close()) - delete(i.state.blocksByTime, blockStart) - result.NumBlocksEvicted++ - result.NumBlocks-- - continue - } - - // tick any blocks we're going to retain blockTickResult, tickErr := block.Tick(c) multiErr = multiErr.Add(tickErr) result.NumSegments += blockTickResult.NumSegments @@ -921,15 +943,65 @@ func (i *nsIndex) Tick(c context.Cancellable, startTime xtime.UnixNano) (namespa result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable result.NumTotalDocs += blockTickResult.NumDocs result.FreeMmap += blockTickResult.FreeMmap + } + + blockTickResult, tickErr := tickingBlocks.activeBlock.Tick(c) + multiErr = multiErr.Add(tickErr) + result.NumSegments += blockTickResult.NumSegments + result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped + result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable + result.NumTotalDocs += blockTickResult.NumDocs + result.FreeMmap += blockTickResult.FreeMmap + + i.metrics.tick.Inc(1) + + return result, multiErr.FinalError() +} + +type tickingBlocksResult struct { + totalBlocks int + activeBlock index.Block + tickingBlocks []index.Block +} + +func (i *nsIndex) tickingBlocks( + startTime xtime.UnixNano, +) (tickingBlocksResult, xerrors.MultiError) { + multiErr := xerrors.NewMultiError() + earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod( + i.retentionPeriod, i.blockSize, startTime) + + i.state.Lock() + activeBlock := i.activeBlock + tickingBlocks := make([]index.Block, 0, 
len(i.state.blocksByTime)) + defer func() { + i.updateBlockStartsWithLock() + i.state.Unlock() + }() + + for blockStart, block := range i.state.blocksByTime { + // Drop any blocks past the retention period. + if blockStart.Before(earliestBlockStartToRetain) { + multiErr = multiErr.Add(block.Close()) + delete(i.state.blocksByTime, blockStart) + continue + } + + // Tick any blocks we're going to retain, but don't tick inline here + // we'll do this out of the block. + tickingBlocks = append(tickingBlocks, block) - // seal any blocks that are sealable + // Seal any blocks that are sealable while holding lock (seal is fast). if !blockStart.After(i.lastSealableBlockStart(startTime)) && !block.IsSealed() { multiErr = multiErr.Add(block.Seal()) - result.NumBlocksSealed++ } } - return result, multiErr.FinalError() + return tickingBlocksResult{ + totalBlocks: len(i.state.blocksByTime), + activeBlock: activeBlock, + tickingBlocks: tickingBlocks, + }, multiErr } func (i *nsIndex) WarmFlush( @@ -1002,6 +1074,12 @@ func (i *nsIndex) WarmFlush( zap.Time("blockStart", block.StartTime().ToTime()), ) } + + for _, t := range i.blockStartsFromIndexBlockStart(block.StartTime()) { + for _, s := range shards { + s.MarkWarmIndexFlushStateSuccessOrError(t, err) + } + } } i.metrics.blocksEvictedMutableSegments.Inc(int64(evicted)) return nil @@ -1019,7 +1097,9 @@ func (i *nsIndex) ColdFlush(shards []databaseShard) (OnColdFlushDone, error) { } // We only rotate cold mutable segments in phase I of cold flushing. for _, block := range flushable { - block.RotateColdMutableSegments() + if err := block.RotateColdMutableSegments(); err != nil { + return nil, err + } } // We can't immediately evict cold mutable segments so we return a callback to do so // when cold flush finishes. @@ -1032,6 +1112,45 @@ func (i *nsIndex) ColdFlush(shards []databaseShard) (OnColdFlushDone, error) { }, nil } +// WarmFlushBlockStarts returns all index blockStarts which have been flushed to disk. 
+func (i *nsIndex) WarmFlushBlockStarts() []xtime.UnixNano { + flushed := make([]xtime.UnixNano, 0) + infoFiles := i.readInfoFilesAsMap() + + for blockStart := range infoFiles { + if i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) { + flushed = append(flushed, blockStart) + } + } + return flushed +} + +// BackgroundCompact background compacts eligible segments. +func (i *nsIndex) BackgroundCompact() { + if i.activeBlock != nil { + i.activeBlock.BackgroundCompact() + } + for _, b := range i.state.blocksByTime { + b.BackgroundCompact() + } +} + +func (i *nsIndex) readInfoFilesAsMap() map[xtime.UnixNano][]fs.ReadIndexInfoFileResult { + fsOpts := i.opts.CommitLogOptions().FilesystemOptions() + infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{ + FilePathPrefix: fsOpts.FilePathPrefix(), + Namespace: i.nsMetadata.ID(), + ReaderBufferSize: fsOpts.InfoReaderBufferSize(), + }) + result := make(map[xtime.UnixNano][]fs.ReadIndexInfoFileResult) + for _, infoFile := range infoFiles { + t := xtime.UnixNano(infoFile.Info.BlockStart) + files := result[t] + result[t] = append(files, infoFile) + } + return result +} + func (i *nsIndex) flushableBlocks( shards []databaseShard, flushType series.WriteType, @@ -1043,12 +1162,7 @@ func (i *nsIndex) flushableBlocks( } // NB(bodu): We read index info files once here to avoid re-reading all of them // for each block. - fsOpts := i.opts.CommitLogOptions().FilesystemOptions() - infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{ - FilePathPrefix: fsOpts.FilePathPrefix(), - Namespace: i.nsMetadata.ID(), - ReaderBufferSize: fsOpts.InfoReaderBufferSize(), - }) + infoFiles := i.readInfoFilesAsMap() flushable := make([]index.Block, 0, len(i.state.blocksByTime)) now := xtime.ToUnixNano(i.nowFn()) @@ -1056,12 +1170,13 @@ func (i *nsIndex) flushableBlocks( currentBlockStart := now.Truncate(i.blockSize) // Check for flushable blocks by iterating through all block starts w/in retention. 
for blockStart := earliestBlockStartToRetain; blockStart.Before(currentBlockStart); blockStart = blockStart.Add(i.blockSize) { - block, err := i.ensureBlockPresentWithRLock(blockStart) + blockResult, err := i.ensureBlockPresentWithRLock(blockStart) if err != nil { return nil, err } - canFlush, err := i.canFlushBlockWithRLock(infoFiles, blockStart, block, shards, flushType) + canFlush, err := i.canFlushBlockWithRLock(infoFiles, blockStart, + blockResult.block, shards, flushType) if err != nil { return nil, err } @@ -1069,13 +1184,13 @@ func (i *nsIndex) flushableBlocks( continue } - flushable = append(flushable, block) + flushable = append(flushable, blockResult.block) } return flushable, nil } func (i *nsIndex) canFlushBlockWithRLock( - infoFiles []fs.ReadIndexInfoFileResult, + infoFiles map[xtime.UnixNano][]fs.ReadIndexInfoFileResult, blockStart xtime.UnixNano, block index.Block, shards []databaseShard, @@ -1105,15 +1220,15 @@ func (i *nsIndex) canFlushBlockWithRLock( Debug("skipping index cold flush due to shard not bootstrapped yet") continue } - start := blockStart - end := blockStart.Add(i.blockSize) - dataBlockSize := i.nsMetadata.Options().RetentionOptions().BlockSize() - for t := start; t.Before(end); t = t.Add(dataBlockSize) { + + for _, t := range i.blockStartsFromIndexBlockStart(blockStart) { flushState, err := shard.FlushState(t) if err != nil { return false, err } - if flushState.WarmStatus != fileOpSuccess { + + // Skip if the data flushing failed. Data flushing precedes index flushing. 
+ if flushState.WarmStatus.DataFlushed != fileOpSuccess { return false, nil } } @@ -1122,24 +1237,42 @@ func (i *nsIndex) canFlushBlockWithRLock( return true, nil } +// blockStartsFromIndexBlockStart returns the possibly many block starts that exist within +// a given index block (since index block size >= data block size). +func (i *nsIndex) blockStartsFromIndexBlockStart(blockStart xtime.UnixNano) []xtime.UnixNano { + start := blockStart + end := blockStart.Add(i.blockSize) + dataBlockSize := i.nsMetadata.Options().RetentionOptions().BlockSize() + blockStarts := make([]xtime.UnixNano, 0) + for t := start; t.Before(end); t = t.Add(dataBlockSize) { + blockStarts = append(blockStarts, t) + } + return blockStarts +} + func (i *nsIndex) hasIndexWarmFlushedToDisk( - infoFiles []fs.ReadIndexInfoFileResult, + infoFiles map[xtime.UnixNano][]fs.ReadIndexInfoFileResult, blockStart xtime.UnixNano, ) bool { - var hasIndexWarmFlushedToDisk bool // NB(bodu): We consider the block to have been warm flushed if there are any // filesets on disk. This is consistent with the "has warm flushed" check in the db shard. // Shard block starts are marked as having warm flushed if an info file is successfully read from disk.
- for _, f := range infoFiles { + f, ok := infoFiles[blockStart] + if !ok { + return false + } + + for _, fileInfo := range f { indexVolumeType := idxpersist.DefaultIndexVolumeType - if f.Info.IndexVolumeType != nil { - indexVolumeType = idxpersist.IndexVolumeType(f.Info.IndexVolumeType.Value) + if fileInfo.Info.IndexVolumeType != nil { + indexVolumeType = idxpersist.IndexVolumeType(fileInfo.Info.IndexVolumeType.Value) } - if f.ID.BlockStart == blockStart && indexVolumeType == idxpersist.DefaultIndexVolumeType { - hasIndexWarmFlushedToDisk = true + match := fileInfo.ID.BlockStart == blockStart && indexVolumeType == idxpersist.DefaultIndexVolumeType + if match { + return true } } - return hasIndexWarmFlushedToDisk + return false } func (i *nsIndex) flushBlock( @@ -1352,18 +1485,21 @@ func (i *nsIndex) Query( query index.Query, opts index.QueryOptions, ) (index.QueryResult, error) { - logFields := []opentracinglog.Field{ - opentracinglog.String("query", query.String()), - opentracinglog.String("namespace", i.nsMetadata.ID().String()), - opentracinglog.Int("seriesLimit", opts.SeriesLimit), - opentracinglog.Int("docsLimit", opts.DocsLimit), - xopentracing.Time("queryStart", opts.StartInclusive.ToTime()), - xopentracing.Time("queryEnd", opts.EndExclusive.ToTime()), - } - - ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxQuery) - sp.LogFields(logFields...) + var logFields []opentracinglog.Field + ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQuery) defer sp.Finish() + if sampled { + // Only allocate metadata such as query string if sampling trace. 
+ logFields = []opentracinglog.Field{ + opentracinglog.String("query", query.String()), + opentracinglog.String("namespace", i.nsMetadata.ID().String()), + opentracinglog.Int("seriesLimit", opts.SeriesLimit), + opentracinglog.Int("docsLimit", opts.DocsLimit), + xopentracing.Time("queryStart", opts.StartInclusive.ToTime()), + xopentracing.Time("queryEnd", opts.EndExclusive.ToTime()), + } + sp.LogFields(logFields...) + } // Get results and set the namespace ID and size limit. results := i.resultsPool.Get() @@ -1378,6 +1514,7 @@ func (i *nsIndex) Query( sp.LogFields(opentracinglog.Error(err)) return index.QueryResult{}, err } + return index.QueryResult{ Results: results, Exhaustive: queryRes.exhaustive, @@ -1499,9 +1636,13 @@ func (i *nsIndex) query( newBlockIterFn newBlockIterFn, logFields []opentracinglog.Field, ) (queryResult, error) { - ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxQueryHelper) + ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQueryHelper) sp.LogFields(logFields...) defer sp.Finish() + if sampled { + // Only log fields if sampled. + sp.LogFields(logFields...) + } queryRes, err := i.queryWithSpan(ctx, query, results, opts, execBlockFn, newBlockIterFn, sp, logFields) @@ -1582,21 +1723,20 @@ func (i *nsIndex) queryWithSpan( // Enact overrides for query options opts = i.overriddenOptsForQueryWithRLock(opts) - // Retrieve blocks to query, then we can release lock + // Retrieve blocks to query, then we can release lock. // NB(r): Important not to block ticking, and other tasks by // holding the RLock during a query. - blocks, err := i.blocksForQueryWithRLock(xtime.NewRanges(xtime.Range{ + qryRange := xtime.NewRanges(xtime.Range{ Start: opts.StartInclusive, End: opts.EndExclusive, - })) + }) + // NB(r): Safe to take ref to i.state.blocksDescOrderImmutable since it's + // immutable and we only create an iterator over it. 
+ blocks := newBlocksIterStackAlloc(i.activeBlock, i.state.blocksDescOrderImmutable, qryRange) // Can now release the lock and execute the query without holding the lock. i.state.RUnlock() - if err != nil { - return queryResult{}, err - } - var ( // State contains concurrent mutable state for async execution below. state = &asyncQueryExecState{} @@ -1607,8 +1747,9 @@ func (i *nsIndex) queryWithSpan( return queryResult{}, err } - blockIters := make([]*blockIter, 0, len(blocks)) - for _, block := range blocks { + var blockIters []*blockIter + for b, ok := blocks.Next(); ok; b, ok = b.Next() { + block := b.Current() iter, err := newBlockIterFn(ctx, block, query, results) if err != nil { return queryResult{}, err @@ -1927,64 +2068,42 @@ func (i *nsIndex) overriddenOptsForQueryWithRLock( return opts } -func (i *nsIndex) blocksForQueryWithRLock(queryRange xtime.Ranges) ([]index.Block, error) { - // Chunk the query request into bounds based on applicable blocks and - // execute the requests to each of them; and merge results. - blocks := make([]index.Block, 0, len(i.state.blockStartsDescOrder)) - - // Iterate known blocks in a defined order of time (newest first) to enforce - // some determinism about the results returned. - for _, start := range i.state.blockStartsDescOrder { - // Terminate if queryRange doesn't need any more data - if queryRange.IsEmpty() { - break - } - - block, ok := i.state.blocksByTime[start] - if !ok { - // This is an invariant, should never occur if state tracking is correct. - return nil, i.missingBlockInvariantError(start) - } - - // Ensure the block has data requested by the query. - blockRange := xtime.Range{Start: block.StartTime(), End: block.EndTime()} - if !queryRange.Overlaps(blockRange) { - continue - } - - // Remove this range from the query range. 
- queryRange.RemoveRange(blockRange) - - blocks = append(blocks, block) - } - - return blocks, nil +type blockPresentResult struct { + block index.Block + latest bool } -func (i *nsIndex) ensureBlockPresent(blockStart xtime.UnixNano) (index.Block, error) { +func (i *nsIndex) ensureBlockPresent(blockStart xtime.UnixNano) (blockPresentResult, error) { i.state.RLock() defer i.state.RUnlock() if !i.isOpenWithRLock() { - return nil, errDbIndexUnableToWriteClosed + return blockPresentResult{}, errDbIndexUnableToWriteClosed } return i.ensureBlockPresentWithRLock(blockStart) } +func (i *nsIndex) isLatestBlockWithRLock(blockStart xtime.UnixNano) bool { + return i.state.latestBlock != nil && i.state.latestBlock.StartTime().Equal(blockStart) +} + // ensureBlockPresentWithRLock guarantees an index.Block exists for the specified // blockStart, allocating one if it does not. It returns the desired block, or // error if it's unable to do so. -func (i *nsIndex) ensureBlockPresentWithRLock(blockStart xtime.UnixNano) (index.Block, error) { +func (i *nsIndex) ensureBlockPresentWithRLock(blockStart xtime.UnixNano) (blockPresentResult, error) { // check if the current latest block matches the required block, this // is the usual path and can short circuit the rest of the logic in this // function in most cases. - if i.state.latestBlock != nil && i.state.latestBlock.StartTime().Equal(blockStart) { - return i.state.latestBlock, nil + if i.isLatestBlockWithRLock(blockStart) { + return blockPresentResult{ + block: i.state.latestBlock, + latest: true, + }, nil } // check if exists in the map (this can happen if the latestBlock has not // been rotated yet). if block, ok := i.state.blocksByTime[blockStart]; ok { - return block, nil + return blockPresentResult{block: block}, nil } // i.e. block start does not exist, so we have to alloc. @@ -2002,21 +2121,24 @@ func (i *nsIndex) ensureBlockPresentWithRLock(blockStart xtime.UnixNano) (index. 
// re-check if exists in the map (another routine did the alloc) if block, ok := i.state.blocksByTime[blockStart]; ok { - return block, nil + return blockPresentResult{ + block: block, + latest: i.isLatestBlockWithRLock(blockStart), + }, nil } // ok now we know for sure we have to alloc block, err := i.newBlockFn(blockStart, i.nsMetadata, index.BlockOptions{}, i.namespaceRuntimeOptsMgr, i.opts.IndexOptions()) if err != nil { // unable to allocate the block, should never happen. - return nil, i.unableToAllocBlockInvariantError(err) + return blockPresentResult{}, i.unableToAllocBlockInvariantError(err) } // NB(bodu): Use same time barrier as `Tick` to make sealing of cold index blocks consistent. // We need to seal cold blocks write away for cold writes. if !blockStart.After(i.lastSealableBlockStart(xtime.ToUnixNano(i.nowFn()))) { if err := block.Seal(); err != nil { - return nil, err + return blockPresentResult{}, err } } @@ -2025,7 +2147,11 @@ func (i *nsIndex) ensureBlockPresentWithRLock(blockStart xtime.UnixNano) (index. // update ordered blockStarts slice, and latestBlock i.updateBlockStartsWithLock() - return block, nil + + return blockPresentResult{ + block: block, + latest: i.isLatestBlockWithRLock(blockStart), + }, nil } func (i *nsIndex) lastSealableBlockStart(t xtime.UnixNano) xtime.UnixNano { @@ -2039,19 +2165,29 @@ func (i *nsIndex) updateBlockStartsWithLock() { latestBlock index.Block ) - blockStarts := make([]xtime.UnixNano, 0, len(i.state.blocksByTime)) + blocks := make([]blockAndBlockStart, 0, len(i.state.blocksByTime)+1) for ts, block := range i.state.blocksByTime { if ts >= latestBlockStart { + latestBlockStart = ts latestBlock = block } - blockStarts = append(blockStarts, ts) + blocks = append(blocks, blockAndBlockStart{ + block: block, + blockStart: ts, + }) } // order in desc order (i.e. 
reverse chronological) - sort.Slice(blockStarts, func(i, j int) bool { - return blockStarts[i] > blockStarts[j] + sort.Slice(blocks, func(i, j int) bool { + return blocks[i].blockStart > blocks[j].blockStart }) - i.state.blockStartsDescOrder = blockStarts + + // NB(r): Important not to modify this once set since we take reference + // to this slice with an RLock, release with RUnlock and then loop over it + // during query time so it must not be altered and stay immutable. + // This is done to avoid allocating a copy of the slice at query time for + // each query. + i.state.blocksDescOrderImmutable = blocks // rotate latestBlock i.state.latestBlock = latestBlock @@ -2269,7 +2405,7 @@ func (i *nsIndex) CleanupDuplicateFileSets(activeShards []uint32) error { func (i *nsIndex) DebugMemorySegments(opts DebugMemorySegmentsOptions) error { i.state.RLock() - defer i.state.RLock() + defer i.state.RUnlock() if i.state.closed { return errDbIndexAlreadyClosed } @@ -2342,14 +2478,16 @@ func (i *nsIndex) Close() error { var multiErr xerrors.MultiError multiErr = multiErr.Add(i.state.insertQueue.Stop()) - blocks := make([]index.Block, 0, len(i.state.blocksByTime)) + blocks := make([]index.Block, 0, len(i.state.blocksByTime)+1) for _, block := range i.state.blocksByTime { blocks = append(blocks, block) } + blocks = append(blocks, i.activeBlock) + i.activeBlock = nil i.state.latestBlock = nil i.state.blocksByTime = nil - i.state.blockStartsDescOrder = nil + i.state.blocksDescOrderImmutable = nil if i.runtimeOptsListener != nil { i.runtimeOptsListener.Close() @@ -2377,14 +2515,6 @@ func (i *nsIndex) Close() error { return multiErr.FinalError() } -func (i *nsIndex) missingBlockInvariantError(t xtime.UnixNano) error { - err := fmt.Errorf("index query did not find block %d despite seeing it in slice", t) - instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { - l.Error(err.Error()) - }) - return err -} - func (i *nsIndex) 
unableToAllocBlockInvariantError(err error) error { ierr := fmt.Errorf("index unable to allocate block: %v", err) instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { @@ -2394,26 +2524,32 @@ func (i *nsIndex) unableToAllocBlockInvariantError(err error) error { } type nsIndexMetrics struct { + tick tally.Counter + asyncInsertAttemptTotal tally.Counter asyncInsertAttemptSkip tally.Counter asyncInsertAttemptWrite tally.Counter - asyncInsertSuccess tally.Counter - asyncInsertErrors tally.Counter - insertAfterClose tally.Counter - queryAfterClose tally.Counter - forwardIndexHits tally.Counter - forwardIndexMisses tally.Counter - forwardIndexCounter tally.Counter - insertEndToEndLatency tally.Timer - blocksEvictedMutableSegments tally.Counter - blockMetrics nsIndexBlocksMetrics - indexingConcurrencyMin tally.Gauge - indexingConcurrencyMax tally.Gauge - indexingConcurrencyAvg tally.Gauge - flushIndexingConcurrency tally.Gauge - flushDocsNew tally.Counter - flushDocsCached tally.Counter + asyncInsertSuccess tally.Counter + asyncInsertErrors tally.Counter + insertAfterClose tally.Counter + queryAfterClose tally.Counter + forwardIndexHits tally.Counter + forwardIndexMisses tally.Counter + forwardIndexCounter tally.Counter + insertEndToEndLatency tally.Timer + blocksEvictedMutableSegments tally.Counter + blockMetrics nsIndexBlocksMetrics + indexingConcurrencyMin tally.Gauge + indexingConcurrencyMax tally.Gauge + indexingConcurrencyAvg tally.Gauge + flushIndexingConcurrency tally.Gauge + flushDocsNew tally.Counter + flushDocsCached tally.Counter + latestBlockNumSegmentsForeground tally.Gauge + latestBlockNumDocsForeground tally.Gauge + latestBlockNumSegmentsBackground tally.Gauge + latestBlockNumDocsBackground tally.Gauge loadedDocsPerQuery tally.Histogram queryExhaustiveSuccess tally.Counter @@ -2438,6 +2574,7 @@ func newNamespaceIndexMetrics( scope := iopts.MetricsScope() blocksScope := scope.SubScope("blocks") m := nsIndexMetrics{ + 
tick: scope.Counter("index-tick"), asyncInsertAttemptTotal: scope.Tagged(map[string]string{ "stage": "process", }).Counter(indexAttemptName), @@ -2486,6 +2623,18 @@ func newNamespaceIndexMetrics( flushDocsCached: scope.Tagged(map[string]string{ "status": "cached", }).Counter("flush-docs"), + latestBlockNumSegmentsForeground: scope.Tagged(map[string]string{ + "segment_type": "foreground", + }).Gauge("latest-block-num-segments"), + latestBlockNumDocsForeground: scope.Tagged(map[string]string{ + "segment_type": "foreground", + }).Gauge("latest-block-num-docs"), + latestBlockNumSegmentsBackground: scope.Tagged(map[string]string{ + "segment_type": "background", + }).Gauge("latest-block-num-segments"), + latestBlockNumDocsBackground: scope.Tagged(map[string]string{ + "segment_type": "background", + }).Gauge("latest-block-num-docs"), loadedDocsPerQuery: scope.Histogram( "loaded-docs-per-query", tally.MustMakeExponentialValueBuckets(10, 2, 16), @@ -2603,3 +2752,70 @@ func (shards dbShards) IDs() []uint32 { } return ids } + +// blocksIterStackAlloc is a stack allocated block iterator, ensuring no +// allocations per query. +type blocksIterStackAlloc struct { + activeBlock index.Block + blocks []blockAndBlockStart + queryRanges xtime.Ranges + idx int +} + +func newBlocksIterStackAlloc( + activeBlock index.Block, + blocks []blockAndBlockStart, + queryRanges xtime.Ranges, +) blocksIterStackAlloc { + return blocksIterStackAlloc{ + activeBlock: activeBlock, + blocks: blocks, + queryRanges: queryRanges, + idx: -2, + } +} + +func (i blocksIterStackAlloc) Next() (blocksIterStackAlloc, bool) { + iter := i + + for { + iter.idx++ + if iter.idx == -1 { + // This will return the active block. + return iter, true + } + + // No more ranges to query, perform this second so that + // the in memory block always returns results. 
+ if i.queryRanges.IsEmpty() { + return iter, false + } + + if iter.idx >= len(i.blocks) { + return iter, false + } + + block := i.blocks[iter.idx].block + + // Ensure the block has data requested by the query. + blockRange := xtime.Range{ + Start: block.StartTime(), + End: block.EndTime(), + } + if !i.queryRanges.Overlaps(blockRange) { + continue + } + + // Remove this range from the query range. + i.queryRanges.RemoveRange(blockRange) + + return iter, true + } +} + +func (i blocksIterStackAlloc) Current() index.Block { + if i.idx == -1 { + return i.activeBlock + } + return i.blocks[i.idx].block +} diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index 0103ffdbbf..2e0acc557c 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -206,6 +206,7 @@ type blockShardRangesSegments struct { type BlockOptions struct { ForegroundCompactorMmapDocsData bool BackgroundCompactorMmapDocsData bool + ActiveBlock bool } // NewBlockFn is a new block constructor. @@ -233,7 +234,18 @@ func NewBlock( iopts := opts.InstrumentOptions() scope := iopts.MetricsScope().SubScope("index").SubScope("block") iopts = iopts.SetMetricsScope(scope) + segs := newMutableSegments( + md, + blockStart, + opts, + blockOpts, + namespaceRuntimeOptsMgr, + iopts, + ) + + coldSegs := newMutableSegments( + md, blockStart, opts, blockOpts, @@ -242,15 +254,7 @@ func NewBlock( ) // NB(bodu): The length of coldMutableSegments is always at least 1. 
- coldSegs := []*mutableSegments{ - newMutableSegments( - blockStart, - opts, - blockOpts, - namespaceRuntimeOptsMgr, - iopts, - ), - } + coldMutableSegments := []*mutableSegments{coldSegs} b := &block{ state: blockStateOpen, blockStart: blockStart, @@ -258,7 +262,7 @@ func NewBlock( blockSize: blockSize, blockOpts: blockOpts, mutableSegments: segs, - coldMutableSegments: coldSegs, + coldMutableSegments: coldMutableSegments, shardRangesSegmentsByVolumeType: make(shardRangesSegmentsByVolumeType), opts: opts, iopts: iopts, @@ -284,29 +288,41 @@ func (b *block) EndTime() xtime.UnixNano { return b.blockEnd } +// BackgroundCompact background compacts eligible segments. +func (b *block) BackgroundCompact() { + b.mutableSegments.BackgroundCompact() +} + func (b *block) WriteBatch(inserts *WriteBatch) (WriteBatchResult, error) { b.RLock() if !b.writesAcceptedWithRLock() { b.RUnlock() - return b.writeBatchResult(inserts, b.writeBatchErrorInvalidState(b.state)) + return b.writeBatchResult(inserts, MutableSegmentsStats{}, + b.writeBatchErrorInvalidState(b.state)) } if b.state == blockStateSealed { coldBlock := b.coldMutableSegments[len(b.coldMutableSegments)-1] b.RUnlock() - return b.writeBatchResult(inserts, coldBlock.WriteBatch(inserts)) + _, err := coldBlock.WriteBatch(inserts) + // Don't pass stats back from insertion into a cold block, + // we only care about warm mutable segments stats. 
+ return b.writeBatchResult(inserts, MutableSegmentsStats{}, err) } b.RUnlock() - return b.writeBatchResult(inserts, b.mutableSegments.WriteBatch(inserts)) + stats, err := b.mutableSegments.WriteBatch(inserts) + return b.writeBatchResult(inserts, stats, err) } func (b *block) writeBatchResult( inserts *WriteBatch, + stats MutableSegmentsStats, err error, ) (WriteBatchResult, error) { if err == nil { inserts.MarkUnmarkedEntriesSuccess() return WriteBatchResult{ - NumSuccess: int64(inserts.Len()), + NumSuccess: int64(inserts.Len()), + MutableSegmentsStats: stats, }, nil } @@ -314,7 +330,10 @@ func (b *block) writeBatchResult( if !ok { // NB: marking all the inserts as failure, cause we don't know which ones failed. inserts.MarkUnmarkedEntriesError(err) - return WriteBatchResult{NumError: int64(inserts.Len())}, err + return WriteBatchResult{ + NumError: int64(inserts.Len()), + MutableSegmentsStats: stats, + }, err } numErr := len(partialErr.Errs()) @@ -326,8 +345,9 @@ func (b *block) writeBatchResult( // Mark all non-error inserts success, so we don't repeatedly index them. inserts.MarkUnmarkedEntriesSuccess() return WriteBatchResult{ - NumSuccess: int64(inserts.Len() - numErr), - NumError: int64(numErr), + NumSuccess: int64(inserts.Len() - numErr), + NumError: int64(numErr), + MutableSegmentsStats: stats, }, partialErr } @@ -506,7 +526,27 @@ func (b *block) queryWithSpan( } } - batch = append(batch, iter.Current()) + // Ensure that the block contains any of the relevant time segments for the query range. 
+ doc := iter.Current() + if md, ok := doc.Metadata(); ok && md.OnIndexSeries != nil { + var ( + inBlock bool + currentBlock = opts.StartInclusive.Truncate(b.blockSize) + ) + for !inBlock { + inBlock = md.OnIndexSeries.IndexedForBlockStart(currentBlock) + currentBlock = currentBlock.Add(b.blockSize) + if !currentBlock.Before(opts.EndExclusive) { + break + } + } + + if !inBlock { + continue + } + } + + batch = append(batch, doc) if len(batch) < batchSize { continue } @@ -1083,6 +1123,12 @@ func (b *block) Stats(reporter BlockStatsReporter) error { return nil } +func (b *block) IsOpen() bool { + b.RLock() + defer b.RUnlock() + return b.state == blockStateOpen +} + func (b *block) IsSealedWithRLock() bool { return b.state == blockStateSealed } @@ -1171,16 +1217,19 @@ func (b *block) EvictColdMutableSegments() error { return nil } -func (b *block) RotateColdMutableSegments() { +func (b *block) RotateColdMutableSegments() error { b.Lock() defer b.Unlock() - b.coldMutableSegments = append(b.coldMutableSegments, newMutableSegments( + coldSegs := newMutableSegments( + b.nsMD, b.blockStart, b.opts, b.blockOpts, b.namespaceRuntimeOptsMgr, b.iopts, - )) + ) + b.coldMutableSegments = append(b.coldMutableSegments, coldSegs) + return nil } func (b *block) MemorySegmentsData(ctx context.Context) ([]fst.SegmentData, error) { diff --git a/src/dbnode/storage/index/block_bench_test.go b/src/dbnode/storage/index/block_bench_test.go index 391bcbe6c2..6ad7ec22f2 100644 --- a/src/dbnode/storage/index/block_bench_test.go +++ b/src/dbnode/storage/index/block_bench_test.go @@ -115,11 +115,19 @@ func BenchmarkBlockWrite(b *testing.B) { // useless to use in benchmarks type mockOnIndexSeries struct{} -var _ OnIndexSeries = mockOnIndexSeries{} +var _ doc.OnIndexSeries = mockOnIndexSeries{} -func (m mockOnIndexSeries) OnIndexSuccess(blockStart xtime.UnixNano) {} -func (m mockOnIndexSeries) OnIndexFinalize(blockStart xtime.UnixNano) {} -func (m mockOnIndexSeries) OnIndexPrepare() {} -func (m 
mockOnIndexSeries) NeedsIndexUpdate(indexBlockStartForWrite xtime.UnixNano) bool { +func (m mockOnIndexSeries) OnIndexSuccess(_ xtime.UnixNano) {} +func (m mockOnIndexSeries) OnIndexFinalize(_ xtime.UnixNano) {} +func (m mockOnIndexSeries) OnIndexPrepare(_ xtime.UnixNano) {} +func (m mockOnIndexSeries) NeedsIndexUpdate(_ xtime.UnixNano) bool { return false } +func (m mockOnIndexSeries) DecrementReaderWriterCount() {} +func (m mockOnIndexSeries) IfAlreadyIndexedMarkIndexSuccessAndFinalize(_ xtime.UnixNano) bool { + return false +} +func (m mockOnIndexSeries) IndexedForBlockStart(_ xtime.UnixNano) bool { return false } +func (m mockOnIndexSeries) IndexedOrAttemptedAny() bool { return false } +func (m mockOnIndexSeries) TryMarkIndexGarbageCollected() bool { return false } +func (m mockOnIndexSeries) NeedsIndexGarbageCollected() bool { return false } diff --git a/src/dbnode/storage/index/block_test.go b/src/dbnode/storage/index/block_test.go index 3804163656..d92ebd040d 100644 --- a/src/dbnode/storage/index/block_test.go +++ b/src/dbnode/storage/index/block_test.go @@ -111,7 +111,7 @@ func TestBlockWriteAfterClose(t *testing.T) { require.NoError(t, err) require.NoError(t, b.Close()) - lifecycle := NewMockOnIndexSeries(ctrl) + lifecycle := doc.NewMockOnIndexSeries(ctrl) lifecycle.EXPECT().OnIndexFinalize(blockStart) batch := NewWriteBatch(WriteBatchOptions{ @@ -160,7 +160,7 @@ func TestBlockWriteAfterSeal(t *testing.T) { require.NoError(t, err) require.NoError(t, b.Seal()) - lifecycle := NewMockOnIndexSeries(ctrl) + lifecycle := doc.NewMockOnIndexSeries(ctrl) lifecycle.EXPECT().OnIndexFinalize(blockStart) batch := NewWriteBatch(WriteBatchOptions{ @@ -214,11 +214,11 @@ func TestBlockWrite(t *testing.T) { b, ok := blk.(*block) require.True(t, ok) - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) - h2 := NewMockOnIndexSeries(ctrl) + h2 := doc.NewMockOnIndexSeries(ctrl) 
h2.EXPECT().OnIndexFinalize(blockStart) h2.EXPECT().OnIndexSuccess(blockStart) @@ -260,11 +260,11 @@ func TestBlockWriteActualSegmentPartialFailure(t *testing.T) { b, ok := blk.(*block) require.True(t, ok) - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) - h2 := NewMockOnIndexSeries(ctrl) + h2 := doc.NewMockOnIndexSeries(ctrl) h2.EXPECT().OnIndexFinalize(blockStart) batch := NewWriteBatch(WriteBatchOptions{ @@ -321,11 +321,11 @@ func TestBlockWritePartialFailure(t *testing.T) { b, ok := blk.(*block) require.True(t, ok) - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) - h2 := NewMockOnIndexSeries(ctrl) + h2 := doc.NewMockOnIndexSeries(ctrl) h2.EXPECT().OnIndexFinalize(blockStart) batch := NewWriteBatch(WriteBatchOptions{ @@ -1236,7 +1236,7 @@ func TestBlockNeedsMutableSegmentsEvicted(t *testing.T) { require.False(t, b.NeedsMutableSegmentsEvicted()) // perform write and ensure it says it needs eviction - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(start) h1.EXPECT().OnIndexSuccess(start) batch := NewWriteBatch(WriteBatchOptions{ @@ -1372,11 +1372,11 @@ func TestBlockE2EInsertQuery(t *testing.T) { b, ok := blk.(*block) require.True(t, ok) - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) - h2 := NewMockOnIndexSeries(ctrl) + h2 := doc.NewMockOnIndexSeries(ctrl) h2.EXPECT().OnIndexFinalize(blockStart) h2.EXPECT().OnIndexSuccess(blockStart) @@ -1456,13 +1456,19 @@ func TestBlockE2EInsertQueryLimit(t *testing.T) { b, ok := blk.(*block) require.True(t, ok) - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) + 
h1.EXPECT().IndexedForBlockStart(blockStart). + Return(true). + AnyTimes() - h2 := NewMockOnIndexSeries(ctrl) + h2 := doc.NewMockOnIndexSeries(ctrl) h2.EXPECT().OnIndexFinalize(blockStart) h2.EXPECT().OnIndexSuccess(blockStart) + h1.EXPECT().IndexedForBlockStart(blockStart). + Return(true). + AnyTimes() batch := NewWriteBatch(WriteBatchOptions{ IndexBlockSize: blockSize, @@ -1490,7 +1496,11 @@ func TestBlockE2EInsertQueryLimit(t *testing.T) { ctx := context.NewBackground() queryIter, err := b.QueryIter(ctx, Query{q}) require.NoError(t, err) - err = b.QueryWithIter(ctx, QueryOptions{SeriesLimit: limit}, queryIter, results, time.Now().Add(time.Minute), + err = b.QueryWithIter(ctx, QueryOptions{ + SeriesLimit: limit, + StartInclusive: blockStart, + EndExclusive: blockStart, + }, queryIter, results, time.Now().Add(time.Minute), emptyLogFields) require.NoError(t, err) require.Equal(t, 1, results.Size()) @@ -1538,13 +1548,19 @@ func TestBlockE2EInsertAddResultsQuery(t *testing.T) { b, ok := blk.(*block) require.True(t, ok) - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) + h1.EXPECT().IndexedForBlockStart(blockStart). + Return(true). + AnyTimes() - h2 := NewMockOnIndexSeries(ctrl) + h2 := doc.NewMockOnIndexSeries(ctrl) h2.EXPECT().OnIndexFinalize(blockStart) h2.EXPECT().OnIndexSuccess(blockStart) + h2.EXPECT().IndexedForBlockStart(blockStart). + Return(true). 
+ AnyTimes() batch := NewWriteBatch(WriteBatchOptions{ IndexBlockSize: blockSize, @@ -1582,7 +1598,10 @@ func TestBlockE2EInsertAddResultsQuery(t *testing.T) { results := NewQueryResults(nil, QueryResultsOptions{}, testOpts) queryIter, err := b.QueryIter(ctx, Query{q}) require.NoError(t, err) - err = b.QueryWithIter(ctx, QueryOptions{}, queryIter, results, time.Now().Add(time.Minute), emptyLogFields) + err = b.QueryWithIter(ctx, QueryOptions{ + StartInclusive: blockStart, + EndExclusive: blockStart, + }, queryIter, results, time.Now().Add(time.Minute), emptyLogFields) require.NoError(t, err) require.Equal(t, 2, results.Size()) @@ -1631,9 +1650,12 @@ func TestBlockE2EInsertAddResultsMergeQuery(t *testing.T) { b, ok := blk.(*block) require.True(t, ok) - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) + h1.EXPECT().IndexedForBlockStart(blockStart). + Return(true). + AnyTimes() batch := NewWriteBatch(WriteBatchOptions{ IndexBlockSize: blockSize, @@ -1667,7 +1689,10 @@ func TestBlockE2EInsertAddResultsMergeQuery(t *testing.T) { results := NewQueryResults(nil, QueryResultsOptions{}, testOpts) queryIter, err := b.QueryIter(ctx, Query{q}) require.NoError(t, err) - err = b.QueryWithIter(ctx, QueryOptions{}, queryIter, results, time.Now().Add(time.Minute), emptyLogFields) + err = b.QueryWithIter(ctx, QueryOptions{ + StartInclusive: blockStart, + EndExclusive: blockStart, + }, queryIter, results, time.Now().Add(time.Minute), emptyLogFields) require.NoError(t, err) require.Equal(t, 2, results.Size()) @@ -1725,12 +1750,15 @@ func TestBlockWriteBackgroundCompact(t *testing.T) { b, ok := blk.(*block) require.True(t, ok) + // Testing compaction only, so mark GC as already running so the test is limited only to compaction. 
+ b.mutableSegments.compact.compactingBackgroundGarbageCollect = true + // First write - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) - h2 := NewMockOnIndexSeries(ctrl) + h2 := doc.NewMockOnIndexSeries(ctrl) h2.EXPECT().OnIndexFinalize(blockStart) h2.EXPECT().OnIndexSuccess(blockStart) @@ -1759,7 +1787,7 @@ func TestBlockWriteBackgroundCompact(t *testing.T) { b.Unlock() // Second write - h1 = NewMockOnIndexSeries(ctrl) + h1 = doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) @@ -1782,13 +1810,13 @@ func TestBlockWriteBackgroundCompact(t *testing.T) { {Segment: b.mutableSegments.foregroundSegments[0].Segment()}, }) require.Equal(t, 2, len(b.mutableSegments.backgroundSegments)) - require.True(t, b.mutableSegments.compact.compactingBackground) + require.True(t, b.mutableSegments.compact.compactingBackgroundStandard) b.mutableSegments.Unlock() // Wait for compaction to finish for { b.mutableSegments.RLock() - compacting := b.mutableSegments.compact.compactingBackground + compacting := b.mutableSegments.compact.compactingBackgroundStandard b.mutableSegments.RUnlock() if !compacting { break @@ -2156,15 +2184,15 @@ func TestBlockE2EInsertAggregate(t *testing.T) { b, ok := blk.(*block) require.True(t, ok) - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) - h2 := NewMockOnIndexSeries(ctrl) + h2 := doc.NewMockOnIndexSeries(ctrl) h2.EXPECT().OnIndexFinalize(blockStart) h2.EXPECT().OnIndexSuccess(blockStart) - h3 := NewMockOnIndexSeries(ctrl) + h3 := doc.NewMockOnIndexSeries(ctrl) h3.EXPECT().OnIndexFinalize(blockStart) h3.EXPECT().OnIndexSuccess(blockStart) diff --git a/src/dbnode/storage/index/compaction/compactor.go b/src/dbnode/storage/index/compaction/compactor.go index b69d8908f6..dd8512aa16 100644 --- 
a/src/dbnode/storage/index/compaction/compactor.go +++ b/src/dbnode/storage/index/compaction/compactor.go @@ -26,6 +26,8 @@ import ( "io" "sync" + "github.com/uber-go/tally" + "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" @@ -37,7 +39,10 @@ import ( ) var ( - errCompactorBuilderEmpty = errors.New("builder has no documents") + // ErrCompactorBuilderEmpty is returned when the compaction + // would result in an empty segment. + ErrCompactorBuilderEmpty = errors.New("builder has no documents") + errCompactorBuilderNil = errors.New("builder is nil") errCompactorClosed = errors.New("compactor is closed") ) @@ -96,6 +101,12 @@ func NewCompactor( }, nil } +// CompactResult is the result of a call to compact. +type CompactResult struct { + Compacted fst.Segment + SegmentMetadatas []segment.SegmentsBuilderSegmentMetadata +} + // Compact will take a set of segments and compact them into an immutable // FST segment, if there is a single mutable segment it can directly be // converted into an FST segment, otherwise an intermediary mutable segment @@ -105,21 +116,37 @@ func NewCompactor( // time. 
func (c *Compactor) Compact( segs []segment.Segment, + filter segment.DocumentsFilter, + filterCounter tally.Counter, reporterOptions mmap.ReporterOptions, -) (segment.Segment, error) { +) (CompactResult, error) { c.Lock() defer c.Unlock() if c.closed { - return nil, errCompactorClosed + return CompactResult{}, errCompactorClosed } c.builder.Reset() + c.builder.SetFilter(filter, filterCounter) if err := c.builder.AddSegments(segs); err != nil { - return nil, err + return CompactResult{}, err + } + + metas, err := c.builder.SegmentMetadatas() + if err != nil { + return CompactResult{}, err + } + + compacted, err := c.compactFromBuilderWithLock(c.builder, reporterOptions) + if err != nil { + return CompactResult{}, err } - return c.compactFromBuilderWithLock(c.builder, reporterOptions) + return CompactResult{ + Compacted: compacted, + SegmentMetadatas: metas, + }, nil } // CompactUsingBuilder compacts segments together using a provided segment builder. @@ -127,7 +154,7 @@ func (c *Compactor) CompactUsingBuilder( builder segment.DocumentsBuilder, segs []segment.Segment, reporterOptions mmap.ReporterOptions, -) (segment.Segment, error) { +) (fst.Segment, error) { // NB(r): Ensure only single compaction happens at a time since the buffers are // reused between runs. 
c.Lock() @@ -138,7 +165,7 @@ func (c *Compactor) CompactUsingBuilder( } if builder == nil { - return nil, errCompactorBuilderEmpty + return nil, errCompactorBuilderNil } if len(segs) == 0 { @@ -231,7 +258,7 @@ func (c *Compactor) CompactUsingBuilder( func (c *Compactor) compactFromBuilderWithLock( builder segment.Builder, reporterOptions mmap.ReporterOptions, -) (segment.Segment, error) { +) (fst.Segment, error) { defer func() { // Release resources regardless of result, // otherwise old compacted segments are held onto @@ -243,7 +270,7 @@ func (c *Compactor) compactFromBuilderWithLock( // runs, we need to copy the docs slice allDocs := builder.Docs() if len(allDocs) == 0 { - return nil, errCompactorBuilderEmpty + return nil, ErrCompactorBuilderEmpty } err := c.writer.Reset(builder) diff --git a/src/dbnode/storage/index/compaction/compactor_test.go b/src/dbnode/storage/index/compaction/compactor_test.go index e0cb38249c..78cc2afe4d 100644 --- a/src/dbnode/storage/index/compaction/compactor_test.go +++ b/src/dbnode/storage/index/compaction/compactor_test.go @@ -94,12 +94,12 @@ func TestCompactorSingleMutableSegment(t *testing.T) { testBuilderSegmentOptions, testFSTSegmentOptions, CompactorOptions{}) require.NoError(t, err) - compacted, err := compactor.Compact([]segment.Segment{ + result, err := compactor.Compact([]segment.Segment{ mustSeal(t, seg), - }, mmap.ReporterOptions{}) + }, nil, nil, mmap.ReporterOptions{}) require.NoError(t, err) - assertContents(t, compacted, testDocuments) + assertContents(t, result.Compacted, testDocuments) require.NoError(t, compactor.Close()) } @@ -120,12 +120,12 @@ func TestCompactorSingleMutableSegmentWithMmapDocsData(t *testing.T) { }) require.NoError(t, err) - compacted, err := compactor.Compact([]segment.Segment{ + result, err := compactor.Compact([]segment.Segment{ mustSeal(t, seg), - }, mmap.ReporterOptions{}) + }, nil, nil, mmap.ReporterOptions{}) require.NoError(t, err) - assertContents(t, compacted, testDocuments) + 
assertContents(t, result.Compacted, testDocuments) require.NoError(t, compactor.Close()) } @@ -147,13 +147,13 @@ func TestCompactorManySegments(t *testing.T) { testBuilderSegmentOptions, testFSTSegmentOptions, CompactorOptions{}) require.NoError(t, err) - compacted, err := compactor.Compact([]segment.Segment{ + result, err := compactor.Compact([]segment.Segment{ mustSeal(t, seg1), mustSeal(t, seg2), - }, mmap.ReporterOptions{}) + }, nil, nil, mmap.ReporterOptions{}) require.NoError(t, err) - assertContents(t, compacted, testDocuments) + assertContents(t, result.Compacted, testDocuments) require.NoError(t, compactor.Close()) } @@ -178,13 +178,13 @@ func TestCompactorCompactDuplicateIDsNoError(t *testing.T) { testBuilderSegmentOptions, testFSTSegmentOptions, CompactorOptions{}) require.NoError(t, err) - compacted, err := compactor.Compact([]segment.Segment{ + result, err := compactor.Compact([]segment.Segment{ mustSeal(t, seg1), mustSeal(t, seg2), - }, mmap.ReporterOptions{}) + }, nil, nil, mmap.ReporterOptions{}) require.NoError(t, err) - assertContents(t, compacted, testDocuments) + assertContents(t, result.Compacted, testDocuments) require.NoError(t, compactor.Close()) } diff --git a/src/dbnode/storage/index/compaction/plan.go b/src/dbnode/storage/index/compaction/plan.go index e236f62fc6..4334e2b67e 100644 --- a/src/dbnode/storage/index/compaction/plan.go +++ b/src/dbnode/storage/index/compaction/plan.go @@ -35,19 +35,11 @@ var ( var ( // DefaultLevels are the default Level(s) used for compaction options. - DefaultLevels = []Level{ // i.e. tiers for compaction [0, 262K), [262K, 1M), [1M, 4M) - Level{ + DefaultLevels = []Level{ // i.e. tiers for compaction [0, 262K) + { MinSizeInclusive: 0, MaxSizeExclusive: 1 << 18, }, - Level{ - MinSizeInclusive: 1 << 18, - MaxSizeExclusive: 1 << 20, - }, - Level{ - MinSizeInclusive: 1 << 20, - MaxSizeExclusive: 1 << 22, - }, } // DefaultOptions are the default compaction PlannerOptions. 
diff --git a/src/dbnode/storage/index/for_each_test.go b/src/dbnode/storage/index/for_each_test.go index b2dc532922..94e042ab0f 100644 --- a/src/dbnode/storage/index/for_each_test.go +++ b/src/dbnode/storage/index/for_each_test.go @@ -53,7 +53,7 @@ func TestWriteBatchForEachUnmarkedBatchByBlockStart(t *testing.T) { for _, n := range []int64{2, 0, 1} { batch.Append(WriteBatchEntry{ Timestamp: tn(n), - OnIndexSeries: NewMockOnIndexSeries(ctrl), + OnIndexSeries: doc.NewMockOnIndexSeries(ctrl), }, d(n)) } @@ -109,7 +109,7 @@ func TestWriteBatchForEachUnmarkedBatchByBlockStartMore(t *testing.T) { } { batch.Append(WriteBatchEntry{ Timestamp: tn(v.nTime), - OnIndexSeries: NewMockOnIndexSeries(ctrl), + OnIndexSeries: doc.NewMockOnIndexSeries(ctrl), }, d(v.nDoc)) } diff --git a/src/dbnode/storage/index/index_mock.go b/src/dbnode/storage/index/index_mock.go index 32771d4b9d..cc3e682f0f 100644 --- a/src/dbnode/storage/index/index_mock.go +++ b/src/dbnode/storage/index/index_mock.go @@ -790,79 +790,6 @@ func (mr *MockAggregateValuesPoolMockRecorder) Put(value interface{}) *gomock.Ca return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Put", reflect.TypeOf((*MockAggregateValuesPool)(nil).Put), value) } -// MockOnIndexSeries is a mock of OnIndexSeries interface. -type MockOnIndexSeries struct { - ctrl *gomock.Controller - recorder *MockOnIndexSeriesMockRecorder -} - -// MockOnIndexSeriesMockRecorder is the mock recorder for MockOnIndexSeries. -type MockOnIndexSeriesMockRecorder struct { - mock *MockOnIndexSeries -} - -// NewMockOnIndexSeries creates a new mock instance. -func NewMockOnIndexSeries(ctrl *gomock.Controller) *MockOnIndexSeries { - mock := &MockOnIndexSeries{ctrl: ctrl} - mock.recorder = &MockOnIndexSeriesMockRecorder{mock} - return mock -} - -// EXPECT returns an object that allows the caller to indicate expected use. -func (m *MockOnIndexSeries) EXPECT() *MockOnIndexSeriesMockRecorder { - return m.recorder -} - -// NeedsIndexUpdate mocks base method. 
-func (m *MockOnIndexSeries) NeedsIndexUpdate(indexBlockStartForWrite time0.UnixNano) bool { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "NeedsIndexUpdate", indexBlockStartForWrite) - ret0, _ := ret[0].(bool) - return ret0 -} - -// NeedsIndexUpdate indicates an expected call of NeedsIndexUpdate. -func (mr *MockOnIndexSeriesMockRecorder) NeedsIndexUpdate(indexBlockStartForWrite interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NeedsIndexUpdate", reflect.TypeOf((*MockOnIndexSeries)(nil).NeedsIndexUpdate), indexBlockStartForWrite) -} - -// OnIndexFinalize mocks base method. -func (m *MockOnIndexSeries) OnIndexFinalize(blockStart time0.UnixNano) { - m.ctrl.T.Helper() - m.ctrl.Call(m, "OnIndexFinalize", blockStart) -} - -// OnIndexFinalize indicates an expected call of OnIndexFinalize. -func (mr *MockOnIndexSeriesMockRecorder) OnIndexFinalize(blockStart interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnIndexFinalize", reflect.TypeOf((*MockOnIndexSeries)(nil).OnIndexFinalize), blockStart) -} - -// OnIndexPrepare mocks base method. -func (m *MockOnIndexSeries) OnIndexPrepare() { - m.ctrl.T.Helper() - m.ctrl.Call(m, "OnIndexPrepare") -} - -// OnIndexPrepare indicates an expected call of OnIndexPrepare. -func (mr *MockOnIndexSeriesMockRecorder) OnIndexPrepare() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnIndexPrepare", reflect.TypeOf((*MockOnIndexSeries)(nil).OnIndexPrepare)) -} - -// OnIndexSuccess mocks base method. -func (m *MockOnIndexSeries) OnIndexSuccess(blockStart time0.UnixNano) { - m.ctrl.T.Helper() - m.ctrl.Call(m, "OnIndexSuccess", blockStart) -} - -// OnIndexSuccess indicates an expected call of OnIndexSuccess. 
-func (mr *MockOnIndexSeriesMockRecorder) OnIndexSuccess(blockStart interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnIndexSuccess", reflect.TypeOf((*MockOnIndexSeries)(nil).OnIndexSuccess), blockStart) -} - // MockBlock is a mock of Block interface. type MockBlock struct { ctrl *gomock.Controller @@ -929,6 +856,18 @@ func (mr *MockBlockMockRecorder) AggregateWithIter(ctx, iter, opts, results, dea return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AggregateWithIter", reflect.TypeOf((*MockBlock)(nil).AggregateWithIter), ctx, iter, opts, results, deadline, logFields) } +// BackgroundCompact mocks base method. +func (m *MockBlock) BackgroundCompact() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "BackgroundCompact") +} + +// BackgroundCompact indicates an expected call of BackgroundCompact. +func (mr *MockBlockMockRecorder) BackgroundCompact() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "BackgroundCompact", reflect.TypeOf((*MockBlock)(nil).BackgroundCompact)) +} + // Close mocks base method. func (m *MockBlock) Close() error { m.ctrl.T.Helper() @@ -985,6 +924,20 @@ func (mr *MockBlockMockRecorder) EvictMutableSegments() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EvictMutableSegments", reflect.TypeOf((*MockBlock)(nil).EvictMutableSegments)) } +// IsOpen mocks base method. +func (m *MockBlock) IsOpen() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsOpen") + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsOpen indicates an expected call of IsOpen. +func (mr *MockBlockMockRecorder) IsOpen() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsOpen", reflect.TypeOf((*MockBlock)(nil).IsOpen)) +} + // IsSealed mocks base method. 
func (m *MockBlock) IsSealed() bool { m.ctrl.T.Helper() @@ -1072,9 +1025,11 @@ func (mr *MockBlockMockRecorder) QueryWithIter(ctx, opts, iter, results, deadlin } // RotateColdMutableSegments mocks base method. -func (m *MockBlock) RotateColdMutableSegments() { +func (m *MockBlock) RotateColdMutableSegments() error { m.ctrl.T.Helper() - m.ctrl.Call(m, "RotateColdMutableSegments") + ret := m.ctrl.Call(m, "RotateColdMutableSegments") + ret0, _ := ret[0].(error) + return ret0 } // RotateColdMutableSegments indicates an expected call of RotateColdMutableSegments. diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index f93f030ff9..fe6707681a 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -28,21 +28,23 @@ import ( "sync" "time" + "github.com/uber-go/tally" + "go.uber.org/zap" + "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/storage/index/compaction" "github.com/m3db/m3/src/dbnode/storage/index/segments" + "github.com/m3db/m3/src/m3ninx/doc" m3ninxindex "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/index/segment/builder" "github.com/m3db/m3/src/m3ninx/index/segment/fst" + "github.com/m3db/m3/src/m3ninx/x" "github.com/m3db/m3/src/x/context" "github.com/m3db/m3/src/x/instrument" "github.com/m3db/m3/src/x/mmap" xresource "github.com/m3db/m3/src/x/resource" xtime "github.com/m3db/m3/src/x/time" - - "github.com/uber-go/tally" - "go.uber.org/zap" ) var ( @@ -51,6 +53,9 @@ var ( errForegroundCompactorNoPlan = errors.New("index foreground compactor failed to generate a plan") errForegroundCompactorBadPlanFirstTask = errors.New("index foreground compactor generated plan without mutable segment in first task") errForegroundCompactorBadPlanSecondaryTask = errors.New("index foreground compactor generated plan with mutable segment a secondary task") + + 
numBackgroundCompactorsStandard = 1 + numBackgroundCompactorsGarbageCollect = 1 ) type mutableSegmentsState uint @@ -71,37 +76,52 @@ type mutableSegments struct { compact mutableSegmentsCompact blockStart xtime.UnixNano + blockSize time.Duration blockOpts BlockOptions opts Options iopts instrument.Options optsListener xresource.SimpleCloser writeIndexingConcurrency int + seriesActiveFn segment.DocumentsFilter + metrics mutableSegmentsMetrics logger *zap.Logger } type mutableSegmentsMetrics struct { - foregroundCompactionPlanRunLatency tally.Timer - foregroundCompactionTaskRunLatency tally.Timer - backgroundCompactionPlanRunLatency tally.Timer - backgroundCompactionTaskRunLatency tally.Timer + foregroundCompactionPlanRunLatency tally.Timer + foregroundCompactionTaskRunLatency tally.Timer + backgroundCompactionPlanRunLatency tally.Timer + backgroundCompactionTaskRunLatency tally.Timer + activeBlockIndexNew tally.Counter + activeBlockGarbageCollectSegment tally.Counter + activeBlockGarbageCollectSeries tally.Counter + activeBlockGarbageCollectEmptySegment tally.Counter } func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { foregroundScope := s.Tagged(map[string]string{"compaction-type": "foreground"}) backgroundScope := s.Tagged(map[string]string{"compaction-type": "background"}) + activeBlockScope := s.SubScope("active-block") return mutableSegmentsMetrics{ foregroundCompactionPlanRunLatency: foregroundScope.Timer("compaction-plan-run-latency"), foregroundCompactionTaskRunLatency: foregroundScope.Timer("compaction-task-run-latency"), backgroundCompactionPlanRunLatency: backgroundScope.Timer("compaction-plan-run-latency"), backgroundCompactionTaskRunLatency: backgroundScope.Timer("compaction-task-run-latency"), + activeBlockIndexNew: activeBlockScope.Tagged(map[string]string{ + "result_type": "new", + }).Counter("index-result"), + activeBlockGarbageCollectSegment: activeBlockScope.Counter("gc-segment"), + activeBlockGarbageCollectSeries: 
activeBlockScope.Counter("gc-series"), + activeBlockGarbageCollectEmptySegment: activeBlockScope.Counter("gc-empty-segment"), } } -// NewBlock returns a new Block, representing a complete reverse index for the -// duration of time specified. It is backed by one or more segments. +// newMutableSegments returns a new Block, representing a complete reverse index +// for the duration of time specified. It is backed by one or more segments. func newMutableSegments( + md namespace.Metadata, blockStart xtime.UnixNano, opts Options, blockOpts BlockOptions, @@ -110,12 +130,15 @@ func newMutableSegments( ) *mutableSegments { m := &mutableSegments{ blockStart: blockStart, + blockSize: md.Options().IndexOptions().BlockSize(), opts: opts, blockOpts: blockOpts, + compact: mutableSegmentsCompact{opts: opts, blockOpts: blockOpts}, iopts: iopts, metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), logger: iopts.Logger(), } + m.seriesActiveFn = segment.DocumentsFilterFn(m.seriesActive) m.optsListener = namespaceRuntimeOptsMgr.RegisterListener(m) return m } @@ -142,27 +165,40 @@ func (m *mutableSegments) SetNamespaceRuntimeOptions(opts namespace.RuntimeOptio builder.SetSortConcurrency(m.writeIndexingConcurrency) } -func (m *mutableSegments) WriteBatch(inserts *WriteBatch) error { +func (m *mutableSegments) seriesActive(d doc.Metadata) bool { + // Filter out any documents that only were indexed for + // sealed blocks. 
+ if d.OnIndexSeries == nil { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("unexpected nil for document index entry for background compact") + }) + return true + } + + return !d.OnIndexSeries.TryMarkIndexGarbageCollected() +} + +func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, error) { m.Lock() if m.state == mutableSegmentsStateClosed { - return errMutableSegmentsAlreadyClosed + m.Unlock() + return MutableSegmentsStats{}, errMutableSegmentsAlreadyClosed } if m.compact.compactingForeground { m.Unlock() - return errUnableToWriteBlockConcurrent + return MutableSegmentsStats{}, errUnableToWriteBlockConcurrent } // Lazily allocate the segment builder and compactors. - err := m.compact.allocLazyBuilderAndCompactorsWithLock(m.writeIndexingConcurrency, - m.blockOpts, m.opts) + err := m.compact.allocLazyBuilderAndCompactorsWithLock(m.writeIndexingConcurrency) if err != nil { m.Unlock() - return err + return MutableSegmentsStats{}, err } m.compact.compactingForeground = true - builder := m.compact.segmentBuilder + segmentBuilder := m.compact.segmentBuilder m.Unlock() defer func() { @@ -172,26 +208,37 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) error { m.Unlock() }() - builder.Reset() - insertResultErr := builder.InsertBatch(m3ninxindex.Batch{ - Docs: inserts.PendingDocs(), + docs := inserts.PendingDocs() + entries := inserts.PendingEntries() + + // Set the doc ref for later recall. + for i := range entries { + docs[i].OnIndexSeries = entries[i].OnIndexSeries + } + + segmentBuilder.Reset() + insertResultErr := segmentBuilder.InsertBatch(m3ninxindex.Batch{ + Docs: docs, AllowPartialUpdates: true, }) - if len(builder.Docs()) == 0 { + n := len(segmentBuilder.Docs()) + if n == 0 { // No inserts, no need to compact. 
- return insertResultErr + return MutableSegmentsStats{}, insertResultErr } // We inserted some documents, need to compact immediately into a // foreground segment from the segment builder before we can serve reads // from an FST segment. - err = m.foregroundCompactWithBuilder(builder) + result, err := m.foregroundCompactWithBuilder(segmentBuilder) if err != nil { - return err + return MutableSegmentsStats{}, err } + m.metrics.activeBlockIndexNew.Inc(int64(n)) + // Return result from the original insertion since compaction was successful. - return insertResultErr + return result, insertResultErr } func (m *mutableSegments) AddReaders(readers []segment.Reader) ([]segment.Reader, error) { @@ -275,14 +322,18 @@ func (m *mutableSegments) NumSegmentsAndDocs() (int64, int64) { m.RLock() defer m.RUnlock() + foregroundNumSegments, foregroundNumDocs := numSegmentsAndDocs(m.foregroundSegments) + backgroundNumSegments, backgroundNumDocs := numSegmentsAndDocs(m.backgroundSegments) + numSegments := foregroundNumSegments + backgroundNumSegments + numDocs := foregroundNumDocs + backgroundNumDocs + return numSegments, numDocs +} + +func numSegmentsAndDocs(segs []*readableSeg) (int64, int64) { var ( numSegments, numDocs int64 ) - for _, seg := range m.foregroundSegments { - numSegments++ - numDocs += seg.Segment().Size() - } - for _, seg := range m.backgroundSegments { + for _, seg := range segs { numSegments++ numDocs += seg.Segment().Size() } @@ -326,13 +377,30 @@ func (m *mutableSegments) Close() { } func (m *mutableSegments) maybeBackgroundCompactWithLock() { - if m.compact.compactingBackground { + if m.compact.compactingBackgroundStandard { return } + m.backgroundCompactWithLock() +} + +// BackgroundCompact background compacts eligible segments. +func (m *mutableSegments) BackgroundCompact() { + m.Lock() + defer m.Unlock() + + m.backgroundCompactWithLock() +} + +func (m *mutableSegments) backgroundCompactWithLock() { // Create a logical plan. 
segs := make([]compaction.Segment, 0, len(m.backgroundSegments)) for _, seg := range m.backgroundSegments { + if seg.garbageCollecting { + // Do not try to compact something that we are background + // garbage collecting documents from (that have been phased out). + continue + } segs = append(segs, compaction.Segment{ Age: seg.Age(), Size: seg.Segment().Size(), @@ -349,20 +417,143 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { return } - if len(plan.Tasks) == 0 { - return + var ( + gcRequired = false + gcPlan = &compaction.Plan{} + gcAlreadyRunning = m.compact.compactingBackgroundGarbageCollect + ) + if !gcAlreadyRunning { + gcRequired = true + + for _, seg := range m.backgroundSegments { + alreadyHasTask := false + for _, task := range plan.Tasks { + for _, taskSegment := range task.Segments { + if taskSegment.Segment == seg.Segment() { + alreadyHasTask = true + break + } + } + } + if alreadyHasTask { + // Skip needing to check if segment needs filtering. + continue + } + + // Ensure that segment has some series that need to be GC'd. + hasAnyInactiveSeries, err := m.segmentAnyInactiveSeries(seg.Segment()) + if err != nil { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("error detecting needs background gc segment", zap.Error(err)) + }) + continue + } + if !hasAnyInactiveSeries { + // Skip background GC since all series are still active and no + // series need to be removed. + continue + } + + // The active block starts are outdated, need to compact + // and remove any old data from the segment. 
+ var task compaction.Task + if len(gcPlan.Tasks) > 0 { + task = gcPlan.Tasks[0] + } + + task.Segments = append(task.Segments, compaction.Segment{ + Age: seg.Age(), + Size: seg.Segment().Size(), + Type: segments.FSTType, + Segment: seg.Segment(), + }) + + if len(gcPlan.Tasks) == 0 { + gcPlan.Tasks = make([]compaction.Task, 1) + } + gcPlan.Tasks[0] = task + + // Mark as not-compactable for standard compactions + // since this will be async compacted into a smaller + // segment. + seg.garbageCollecting = true + } } - // Kick off compaction. - m.compact.compactingBackground = true - go func() { - m.backgroundCompactWithPlan(plan) + if len(plan.Tasks) != 0 { + // Kick off compaction. + m.compact.compactingBackgroundStandard = true + go func() { + m.backgroundCompactWithPlan(plan, m.compact.backgroundCompactors, gcRequired) + + m.Lock() + m.compact.compactingBackgroundStandard = false + m.cleanupBackgroundCompactWithLock() + m.Unlock() + }() + } - m.Lock() - m.compact.compactingBackground = false - m.cleanupBackgroundCompactWithLock() - m.Unlock() + if len(gcPlan.Tasks) != 0 { + // Run GC tasks separately so the standard compaction loop is not blocked. 
+ m.compact.compactingBackgroundGarbageCollect = true + go func() { + compactors, err := m.compact.allocBackgroundCompactorsGarbageCollect() + if err != nil { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("error background gc segments", zap.Error(err)) + }) + } else { + m.backgroundCompactWithPlan(gcPlan, compactors, gcRequired) + m.closeCompactors(compactors) + } + + m.Lock() + m.compact.compactingBackgroundGarbageCollect = false + m.cleanupBackgroundCompactWithLock() + m.Unlock() + }() + } +} + +func (m *mutableSegments) segmentAnyInactiveSeries(seg segment.Segment) (bool, error) { + reader, err := seg.Reader() + if err != nil { + return false, err + } + + defer func() { + _ = reader.Close() }() + + docs, err := reader.AllDocs() + if err != nil { + return false, err + } + + docsCloser := x.NewSafeCloser(docs) + defer func() { + // In case of early return cleanup + _ = docsCloser.Close() + }() + + var result bool + for docs.Next() { + d := docs.Current() + indexEntry := d.OnIndexSeries + if indexEntry == nil { + return false, fmt.Errorf("document has no index entry: %s", d.ID) + } + if indexEntry.NeedsIndexGarbageCollected() { + result = true + break + } + } + + if err := docs.Err(); err != nil { + return false, err + } + + return result, docsCloser.Close() } func (m *mutableSegments) shouldEvictCompactedSegmentsWithLock() bool { @@ -387,16 +578,26 @@ func (m *mutableSegments) cleanupBackgroundCompactWithLock() { m.backgroundSegments = nil // Free compactor resources. 
- if m.compact.backgroundCompactor == nil { + if m.compact.backgroundCompactors == nil { return } - if err := m.compact.backgroundCompactor.Close(); err != nil { + m.closeCompactors(m.compact.backgroundCompactors) + m.compact.backgroundCompactors = nil +} + +func (m *mutableSegments) closeCompactors(compactors chan *compaction.Compactor) { + close(compactors) + for compactor := range compactors { + err := compactor.Close() + if err == nil { + continue + } + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { l.Error("error closing index block background compactor", zap.Error(err)) }) } - m.compact.backgroundCompactor = nil } func (m *mutableSegments) closeCompactedSegmentsWithLock(segments []*readableSeg) { @@ -410,7 +611,11 @@ func (m *mutableSegments) closeCompactedSegmentsWithLock(segments []*readableSeg } } -func (m *mutableSegments) backgroundCompactWithPlan(plan *compaction.Plan) { +func (m *mutableSegments) backgroundCompactWithPlan( + plan *compaction.Plan, + compactors chan *compaction.Compactor, + gcRequired bool, +) { sw := m.metrics.backgroundCompactionPlanRunLatency.Start() defer sw.Stop() @@ -435,20 +640,38 @@ func (m *mutableSegments) backgroundCompactWithPlan(plan *compaction.Plan) { } } + var wg sync.WaitGroup for i, task := range plan.Tasks { - err := m.backgroundCompactWithTask(task, log, - logger.With(zap.Int("task", i))) - if err != nil { - instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { - l.Error("error compacting segments", zap.Error(err)) - }) - return - } + i, task := i, task + wg.Add(1) + compactor := <-compactors + go func() { + defer func() { + compactors <- compactor + wg.Done() + }() + err := m.backgroundCompactWithTask(task, compactor, gcRequired, + log, logger.With(zap.Int("task", i))) + if err != nil { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("error compacting segments", zap.Error(err)) + }) + } + }() } + + wg.Wait() +} + +func (m *mutableSegments) 
newReadThroughSegment(seg fst.Segment) *ReadThroughSegment { + readThroughOpts := m.opts.ReadThroughSegmentOptions() + return NewReadThroughSegment(seg, m.opts.PostingsListCache(), readThroughOpts) } func (m *mutableSegments) backgroundCompactWithTask( task compaction.Task, + compactor *compaction.Compactor, + gcRequired bool, log bool, logger *zap.Logger, ) error { @@ -461,13 +684,21 @@ func (m *mutableSegments) backgroundCompactWithTask( segments = append(segments, seg.Segment) } + var documentsFilter segment.DocumentsFilter + if gcRequired { + // Only actively filter out documents if GC is required. + documentsFilter = m.seriesActiveFn + } + start := time.Now() - compacted, err := m.compact.backgroundCompactor.Compact(segments, mmap.ReporterOptions{ - Context: mmap.Context{ - Name: mmapIndexBlockName, - }, - Reporter: m.opts.MmapReporter(), - }) + compactResult, err := compactor.Compact(segments, documentsFilter, + m.metrics.activeBlockGarbageCollectSeries, + mmap.ReporterOptions{ + Context: mmap.Context{ + Name: mmapIndexBlockName, + }, + Reporter: m.opts.MmapReporter(), + }) took := time.Since(start) m.metrics.backgroundCompactionTaskRunLatency.Record(took) @@ -475,19 +706,46 @@ func (m *mutableSegments) backgroundCompactWithTask( logger.Debug("done compaction task", zap.Duration("took", took)) } + // Check if result would have resulted in an empty segment. + empty := errors.Is(err, compaction.ErrCompactorBuilderEmpty) + if empty { + // Don't return the error since we need to remove the old segments + // by calling addCompactedSegmentFromSegmentsWithLock. + err = nil + } if err != nil { return err } - // Add a read through cache for repeated expensive queries against - // background compacted segments since they can live for quite some - // time and accrue a large set of documents. 
- if immSeg, ok := compacted.(segment.ImmutableSegment); ok { - var ( - plCache = m.opts.PostingsListCache() - readThroughOpts = m.opts.ReadThroughSegmentOptions() - ) - compacted = NewReadThroughSegment(immSeg, plCache, readThroughOpts) + var ( + compacted = compactResult.Compacted + // segMetas = compactResult.SegmentMetadatas + replaceSeg segment.Segment + ) + if empty { + m.metrics.activeBlockGarbageCollectEmptySegment.Inc(1) + } else { + m.metrics.activeBlockGarbageCollectSegment.Inc(1) + + // Add a read through cache for repeated expensive queries against + // background compacted segments since they can live for quite some + // time and accrue a large set of documents. + readThroughSeg := m.newReadThroughSegment(compacted) + replaceSeg = readThroughSeg + + // NB(r): Before replacing the old segments with the compacted segment + // we rebuild all the cached postings lists that the previous segment had + // to avoid latency spikes during segment rotation. + // Note: There were very obvious peaks of latency (p99 of <500ms spiking + // to 8 times that at first replace of large segments after a block + // rotation) without this optimization. + + // TODO: port populating cached searches + // if err := m.populateCachedSearches(readThroughSeg, segMetas); err != nil { + // instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + // l.Error("failed to populate cached searches", zap.Error(err)) + // }) + // } } // Rotate out the replaced frozen segments and add the compacted one. 
@@ -495,7 +753,7 @@ func (m *mutableSegments) backgroundCompactWithTask( defer m.Unlock() result := m.addCompactedSegmentFromSegmentsWithLock(m.backgroundSegments, - segments, compacted) + segments, replaceSeg) m.backgroundSegments = result return nil @@ -531,11 +789,17 @@ func (m *mutableSegments) addCompactedSegmentFromSegmentsWithLock( } } + if compacted == nil { + return result + } + // Return all the ones we kept plus the new compacted segment return append(result, newReadableSeg(compacted, m.opts)) } -func (m *mutableSegments) foregroundCompactWithBuilder(builder segment.DocumentsBuilder) error { +func (m *mutableSegments) foregroundCompactWithBuilder( + builder segment.DocumentsBuilder, +) (MutableSegmentsStats, error) { // We inserted some documents, need to compact immediately into a // foreground segment. m.Lock() @@ -560,18 +824,18 @@ func (m *mutableSegments) foregroundCompactWithBuilder(builder segment.Documents plan, err := compaction.NewPlan(segs, m.opts.ForegroundCompactionPlannerOptions()) if err != nil { - return err + return MutableSegmentsStats{}, err } // Check plan if len(plan.Tasks) == 0 { // Should always generate a task when a mutable builder is passed to planner - return errForegroundCompactorNoPlan + return MutableSegmentsStats{}, errForegroundCompactorNoPlan } if taskNumBuilders(plan.Tasks[0]) != 1 { // First task of plan must include the builder, so we can avoid resetting it // for the first task, but then safely reset it in consequent tasks - return errForegroundCompactorBadPlanFirstTask + return MutableSegmentsStats{}, errForegroundCompactorBadPlanFirstTask } // Move any unused segments to the background. @@ -605,11 +869,10 @@ func (m *mutableSegments) foregroundCompactWithBuilder(builder segment.Documents defer sw.Stop() // Run the first task, without resetting the builder. 
- if err := m.foregroundCompactWithTask( - builder, plan.Tasks[0], - log, logger.With(zap.Int("task", 0)), - ); err != nil { - return err + result, err := m.foregroundCompactWithTask(builder, plan.Tasks[0], + log, logger.With(zap.Int("task", 0))) + if err != nil { + return result, err } // Now run each consequent task, resetting the builder each time since @@ -619,19 +882,18 @@ func (m *mutableSegments) foregroundCompactWithBuilder(builder segment.Documents task := plan.Tasks[i] if taskNumBuilders(task) > 0 { // Only the first task should compact the builder - return errForegroundCompactorBadPlanSecondaryTask + return result, errForegroundCompactorBadPlanSecondaryTask } // Now use the builder after resetting it. builder.Reset() - if err := m.foregroundCompactWithTask( - builder, task, - log, logger.With(zap.Int("task", i)), - ); err != nil { - return err + result, err = m.foregroundCompactWithTask(builder, task, + log, logger.With(zap.Int("task", i))) + if err != nil { + return result, err } } - return nil + return result, nil } func (m *mutableSegments) maybeMoveForegroundSegmentsToBackgroundWithLock( @@ -640,7 +902,7 @@ func (m *mutableSegments) maybeMoveForegroundSegmentsToBackgroundWithLock( if len(segments) == 0 { return } - if m.compact.backgroundCompactor == nil { + if m.compact.backgroundCompactors == nil { // No longer performing background compaction due to evict/close. return } @@ -680,7 +942,7 @@ func (m *mutableSegments) foregroundCompactWithTask( task compaction.Task, log bool, logger *zap.Logger, -) error { +) (MutableSegmentsStats, error) { if log { logger.Debug("start compaction task") } @@ -708,18 +970,34 @@ func (m *mutableSegments) foregroundCompactWithTask( } if err != nil { - return err + return MutableSegmentsStats{}, err } + // Add a read through cache for repeated expensive queries against + // compacted segments since they can live for quite some time during + // block rotations while a burst of segments are created. 
+ segment := m.newReadThroughSegment(compacted) + // Rotate in the ones we just compacted. m.Lock() defer m.Unlock() result := m.addCompactedSegmentFromSegmentsWithLock(m.foregroundSegments, - segments, compacted) + segments, segment) m.foregroundSegments = result - return nil + foregroundNumSegments, foregroundNumDocs := numSegmentsAndDocs(m.foregroundSegments) + backgroundNumSegments, backgroundNumDocs := numSegmentsAndDocs(m.backgroundSegments) + return MutableSegmentsStats{ + Foreground: MutableSegmentsSegmentStats{ + NumSegments: foregroundNumSegments, + NumDocs: foregroundNumDocs, + }, + Background: MutableSegmentsSegmentStats{ + NumSegments: backgroundNumSegments, + NumDocs: backgroundNumDocs, + }, + }, nil } func (m *mutableSegments) cleanupForegroundCompactWithLock() { @@ -760,33 +1038,35 @@ func (m *mutableSegments) cleanupCompactWithLock() { if !m.compact.compactingForeground { m.cleanupForegroundCompactWithLock() } - if !m.compact.compactingBackground { + if !m.compact.compactingBackgroundStandard && !m.compact.compactingBackgroundGarbageCollect { m.cleanupBackgroundCompactWithLock() } } // mutableSegmentsCompact has several lazily allocated compaction components. 
type mutableSegmentsCompact struct { - segmentBuilder segment.CloseableDocumentsBuilder - foregroundCompactor *compaction.Compactor - backgroundCompactor *compaction.Compactor - compactingForeground bool - compactingBackground bool - numForeground int - numBackground int + opts Options + blockOpts BlockOptions + + segmentBuilder segment.CloseableDocumentsBuilder + foregroundCompactor *compaction.Compactor + backgroundCompactors chan *compaction.Compactor + compactingForeground bool + compactingBackgroundStandard bool + compactingBackgroundGarbageCollect bool + numForeground int + numBackground int } func (m *mutableSegmentsCompact) allocLazyBuilderAndCompactorsWithLock( concurrency int, - blockOpts BlockOptions, - opts Options, ) error { var ( err error - metadataPool = opts.MetadataArrayPool() + metadataPool = m.opts.MetadataArrayPool() ) if m.segmentBuilder == nil { - builderOpts := opts.SegmentBuilderOptions(). + builderOpts := m.opts.SegmentBuilderOptions(). SetConcurrency(concurrency) m.segmentBuilder, err = builder.NewBuilderFromDocuments(builderOpts) @@ -798,8 +1078,8 @@ func (m *mutableSegmentsCompact) allocLazyBuilderAndCompactorsWithLock( if m.foregroundCompactor == nil { m.foregroundCompactor, err = compaction.NewCompactor(metadataPool, MetadataArrayPoolCapacity, - opts.SegmentBuilderOptions(), - opts.FSTSegmentOptions(), + m.opts.SegmentBuilderOptions(), + m.opts.FSTSegmentOptions(), compaction.CompactorOptions{ FSTWriterOptions: &fst.WriterOptions{ // DisableRegistry is set to true to trade a larger FST size @@ -807,27 +1087,55 @@ func (m *mutableSegmentsCompact) allocLazyBuilderAndCompactorsWithLock( // to end latency for time to first index a metric. 
DisableRegistry: true, }, - MmapDocsData: blockOpts.ForegroundCompactorMmapDocsData, + MmapDocsData: m.blockOpts.ForegroundCompactorMmapDocsData, }) if err != nil { return err } } - if m.backgroundCompactor == nil { - m.backgroundCompactor, err = compaction.NewCompactor(metadataPool, + if m.backgroundCompactors == nil { + n := numBackgroundCompactorsStandard + m.backgroundCompactors = make(chan *compaction.Compactor, n) + for i := 0; i < n; i++ { + backgroundCompactor, err := compaction.NewCompactor(metadataPool, + MetadataArrayPoolCapacity, + m.opts.SegmentBuilderOptions(), + m.opts.FSTSegmentOptions(), + compaction.CompactorOptions{ + MmapDocsData: m.blockOpts.BackgroundCompactorMmapDocsData, + }) + if err != nil { + return err + } + m.backgroundCompactors <- backgroundCompactor + } + } + + return nil +} + +func (m *mutableSegmentsCompact) allocBackgroundCompactorsGarbageCollect() ( + chan *compaction.Compactor, + error, +) { + metadataPool := m.opts.MetadataArrayPool() + n := numBackgroundCompactorsGarbageCollect + compactors := make(chan *compaction.Compactor, n) + for i := 0; i < n; i++ { + backgroundCompactor, err := compaction.NewCompactor(metadataPool, MetadataArrayPoolCapacity, - opts.SegmentBuilderOptions(), - opts.FSTSegmentOptions(), + m.opts.SegmentBuilderOptions(), + m.opts.FSTSegmentOptions(), compaction.CompactorOptions{ - MmapDocsData: blockOpts.BackgroundCompactorMmapDocsData, + MmapDocsData: m.blockOpts.BackgroundCompactorMmapDocsData, }) if err != nil { - return err + return nil, err } + compactors <- backgroundCompactor } - - return nil + return compactors, nil } func taskNumBuilders(task compaction.Task) int { diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index 38d5a6e4c4..5d39f6b35e 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -75,7 +75,7 @@ func NewReadThroughSegment( seg segment.ImmutableSegment, 
cache *PostingsListCache, opts ReadThroughSegmentOptions, -) segment.Segment { +) *ReadThroughSegment { return &ReadThroughSegment{ segment: seg, opts: opts, diff --git a/src/dbnode/storage/index/read_through_segment_test.go b/src/dbnode/storage/index/read_through_segment_test.go index 3ef3a2c18e..fe604e9628 100644 --- a/src/dbnode/storage/index/read_through_segment_test.go +++ b/src/dbnode/storage/index/read_through_segment_test.go @@ -275,7 +275,7 @@ func TestClose(t *testing.T) { readThroughSeg := NewReadThroughSegment( segment, cache, defaultReadThroughSegmentOptions) - segmentUUID := readThroughSeg.(*ReadThroughSegment).uuid + segmentUUID := readThroughSeg.uuid // Store an entry for the segment in the cache so we can check if it // gets purged after. @@ -284,7 +284,7 @@ func TestClose(t *testing.T) { segment.EXPECT().Close().Return(nil) err = readThroughSeg.Close() require.NoError(t, err) - require.True(t, readThroughSeg.(*ReadThroughSegment).closed) + require.True(t, readThroughSeg.closed) // Make sure it does not allow double closes. 
err = readThroughSeg.Close() @@ -413,5 +413,5 @@ func TestCloseNoCache(t *testing.T) { seg.EXPECT().Close().Return(nil) err := readThrough.Close() require.NoError(t, err) - require.True(t, readThrough.(*ReadThroughSegment).closed) + require.True(t, readThrough.closed) } diff --git a/src/dbnode/storage/index/segments.go b/src/dbnode/storage/index/segments.go index ce3d8ae5b3..6f3134d1c9 100644 --- a/src/dbnode/storage/index/segments.go +++ b/src/dbnode/storage/index/segments.go @@ -28,12 +28,16 @@ import ( ) type readableSeg struct { - nowFn clock.NowFn - createdAt time.Time - segment segment.Segment + nowFn clock.NowFn + createdAt time.Time + segment segment.Segment + garbageCollecting bool } -func newReadableSeg(seg segment.Segment, opts Options) *readableSeg { +func newReadableSeg( + seg segment.Segment, + opts Options, +) *readableSeg { nowFn := opts.ClockOptions().NowFn() return &readableSeg{ nowFn: nowFn, diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index 7a2b924fa9..213483f5db 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -365,38 +365,6 @@ type AggregateResultsEntry struct { Terms []ident.ID } -// OnIndexSeries provides a set of callback hooks to allow the reverse index -// to do lifecycle management of any resources retained during indexing. -type OnIndexSeries interface { - // OnIndexSuccess is executed when an entry is successfully indexed. The - // provided value for `blockStart` is the blockStart for which the write - // was indexed. - OnIndexSuccess(blockStart xtime.UnixNano) - - // OnIndexFinalize is executed when the index no longer holds any references - // to the provided resources. It can be used to cleanup any resources held - // during the course of indexing. `blockStart` is the startTime of the index - // block for which the write was attempted. 
- OnIndexFinalize(blockStart xtime.UnixNano) - - // OnIndexPrepare prepares the Entry to be handed off to the indexing sub-system. - // NB(prateek): we retain the ref count on the entry while the indexing is pending, - // the callback executed on the entry once the indexing is completed releases this - // reference. - OnIndexPrepare() - - // NeedsIndexUpdate returns a bool to indicate if the Entry needs to be indexed - // for the provided blockStart. It only allows a single index attempt at a time - // for a single entry. - // NB(prateek): NeedsIndexUpdate is a CAS, i.e. when this method returns true, it - // also sets state on the entry to indicate that a write for the given blockStart - // is going to be sent to the index, and other go routines should not attempt the - // same write. Callers are expected to ensure they follow this guideline. - // Further, every call to NeedsIndexUpdate which returns true needs to have a corresponding - // OnIndexFinalze() call. This is required for correct lifecycle maintenance. - NeedsIndexUpdate(indexBlockStartForWrite xtime.UnixNano) bool -} - // Block represents a collection of segments. Each `Block` is a complete reverse // index for a period of time defined by [StartTime, EndTime). type Block interface { @@ -444,6 +412,9 @@ type Block interface { // Stats returns block stats. Stats(reporter BlockStatsReporter) error + // IsOpen returns true if open and not sealed yet. + IsOpen() bool + // Seal prevents the block from taking any more writes, but, it still permits // addition of segments via Bootstrap(). Seal() error @@ -473,11 +444,14 @@ type Block interface { // RotateColdMutableSegments rotates the currently active cold mutable segment out for a // new cold mutable segment to write to. - RotateColdMutableSegments() + RotateColdMutableSegments() error // MemorySegmentsData returns all in memory segments data. 
MemorySegmentsData(ctx context.Context) ([]fst.SegmentData, error) + // BackgroundCompact background compacts eligible segments. + BackgroundCompact() + // Close will release any held resources and close the Block. Close() error } @@ -553,8 +527,29 @@ const ( // WriteBatchResult returns statistics about the WriteBatch execution. type WriteBatchResult struct { - NumSuccess int64 - NumError int64 + NumSuccess int64 + NumError int64 + MutableSegmentsStats MutableSegmentsStats +} + +// MutableSegmentsStats contains metadata about +// an insertion into mutable segments. +type MutableSegmentsStats struct { + Foreground MutableSegmentsSegmentStats + Background MutableSegmentsSegmentStats +} + +// MutableSegmentsSegmentStats contains metadata about +// a set of mutable segments segment type. +type MutableSegmentsSegmentStats struct { + NumSegments int64 + NumDocs int64 +} + +// Empty returns whether stats is empty or not. +func (s MutableSegmentsStats) Empty() bool { + return s.Foreground == MutableSegmentsSegmentStats{} && + s.Background == MutableSegmentsSegmentStats{} } // BlockTickResult returns statistics about tick. @@ -728,6 +723,11 @@ func (b *WriteBatch) ForEachUnmarkedBatchByBlockStart( } } +// PendingAny returns whether there are any pending documents to be inserted. +func (b *WriteBatch) PendingAny() bool { + return len(b.PendingDocs()) > 0 +} + func (b *WriteBatch) numPending() int { numUnmarked := 0 for i := range b.entries { @@ -793,13 +793,32 @@ func (b *WriteBatch) SortByEnqueued() { // MarkUnmarkedEntriesSuccess marks all unmarked entries as success. func (b *WriteBatch) MarkUnmarkedEntriesSuccess() { + for idx := range b.entries { + b.MarkEntrySuccess(idx) + } +} + +// MarkEntrySuccess marks an entry as success. 
+func (b *WriteBatch) MarkEntrySuccess(idx int) { + if !b.entries[idx].result.Done { + blockStart := b.entries[idx].indexBlockStart(b.opts.IndexBlockSize) + b.entries[idx].OnIndexSeries.OnIndexSuccess(blockStart) + b.entries[idx].OnIndexSeries.OnIndexFinalize(blockStart) + b.entries[idx].result.Done = true + b.entries[idx].result.Err = nil + } +} + +// MarkUnmarkedIfAlreadyIndexedSuccessAndFinalize marks an entry as success. +func (b *WriteBatch) MarkUnmarkedIfAlreadyIndexedSuccessAndFinalize() { for idx := range b.entries { if !b.entries[idx].result.Done { blockStart := b.entries[idx].indexBlockStart(b.opts.IndexBlockSize) - b.entries[idx].OnIndexSeries.OnIndexSuccess(blockStart) - b.entries[idx].OnIndexSeries.OnIndexFinalize(blockStart) - b.entries[idx].result.Done = true - b.entries[idx].result.Err = nil + r := b.entries[idx].OnIndexSeries.IfAlreadyIndexedMarkIndexSuccessAndFinalize(blockStart) + if r { + b.entries[idx].result.Done = true + b.entries[idx].result.Err = nil + } } } } @@ -870,7 +889,7 @@ type WriteBatchEntry struct { Timestamp xtime.UnixNano // OnIndexSeries is a listener/callback for when this entry is marked done // it is set to nil when the entry is marked done - OnIndexSeries OnIndexSeries + OnIndexSeries doc.OnIndexSeries // EnqueuedAt is the timestamp that this entry was enqueued for indexing // so that we can calculate the latency it takes to index the entry EnqueuedAt time.Time diff --git a/src/dbnode/storage/index/write_batch_test.go b/src/dbnode/storage/index/write_batch_test.go index 1f7edef18a..323896e3f5 100644 --- a/src/dbnode/storage/index/write_batch_test.go +++ b/src/dbnode/storage/index/write_batch_test.go @@ -28,6 +28,7 @@ import ( "github.com/stretchr/testify/require" "github.com/golang/mock/gomock" + "github.com/m3db/m3/src/m3ninx/doc" xtime "github.com/m3db/m3/src/x/time" ) @@ -44,14 +45,14 @@ func TestWriteBatchSortByUnmarkedAndIndexBlockStart(t *testing.T) { Truncate(blockSize). 
Add(time.Minute) - h1 := NewMockOnIndexSeries(ctrl) + h1 := doc.NewMockOnIndexSeries(ctrl) h1.EXPECT().OnIndexFinalize(blockStart) h1.EXPECT().OnIndexSuccess(blockStart) - h2 := NewMockOnIndexSeries(ctrl) + h2 := doc.NewMockOnIndexSeries(ctrl) h2.EXPECT().OnIndexFinalize(blockStart) - h3 := NewMockOnIndexSeries(ctrl) + h3 := doc.NewMockOnIndexSeries(ctrl) h3.EXPECT().OnIndexFinalize(blockStart) h3.EXPECT().OnIndexSuccess(blockStart) diff --git a/src/dbnode/storage/index_block_test.go b/src/dbnode/storage/index_block_test.go index f8b970ec5b..6b6029b410 100644 --- a/src/dbnode/storage/index_block_test.go +++ b/src/dbnode/storage/index_block_test.go @@ -98,7 +98,7 @@ func testWriteBatchEntry( id ident.ID, tags ident.Tags, timestamp xtime.UnixNano, - fns index.OnIndexSeries, + fns doc.OnIndexSeries, ) (index.WriteBatchEntry, doc.Metadata) { d := doc.Metadata{ID: copyBytes(id.Bytes())} for _, tag := range tags.Values() { @@ -143,15 +143,22 @@ func TestNamespaceIndexNewBlockFn(t *testing.T) { mockBlock := index.NewMockBlock(ctrl) mockBlock.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() - mockBlock.EXPECT().Close().Return(nil) + mockBlock.EXPECT().StartTime().Return(now.Truncate(blockSize)).AnyTimes() + mockBlock.EXPECT().Close().Return(nil).AnyTimes() newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { - require.Equal(t, now.Truncate(blockSize), ts) + // If active block, the blockStart should be zero. + // Otherwise, it should match the actual time. 
+ if opts.ActiveBlock { + require.Equal(t, xtime.UnixNano(0), ts) + } else { + require.Equal(t, now.Truncate(blockSize), ts) + } return mockBlock, nil } md := testNamespaceMetadata(blockSize, 4*time.Hour) @@ -164,10 +171,10 @@ func TestNamespaceIndexNewBlockFn(t *testing.T) { require.NoError(t, index.Close()) }() - blocksSlice := index.(*nsIndex).state.blockStartsDescOrder + blocksSlice := index.(*nsIndex).state.blocksDescOrderImmutable require.Equal(t, 1, len(blocksSlice)) - require.Equal(t, now.Truncate(blockSize), blocksSlice[0]) + require.Equal(t, now.Truncate(blockSize), blocksSlice[0].blockStart) require.Equal(t, mockBlock, index.(*nsIndex).state.latestBlock) @@ -218,16 +225,22 @@ func TestNamespaceIndexWrite(t *testing.T) { mockBlock := index.NewMockBlock(ctrl) mockBlock.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() - mockBlock.EXPECT().Close().Return(nil) + mockBlock.EXPECT().Close().Return(nil).Times(2) // active and normal mockBlock.EXPECT().StartTime().Return(now.Truncate(blockSize)).AnyTimes() newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { - require.Equal(t, now.Truncate(blockSize), ts) + // If active block, the blockStart should be zero. + // Otherwise, it should match the actual time. + if opts.ActiveBlock { + require.Equal(t, xtime.UnixNano(0), ts) + } else { + require.Equal(t, now.Truncate(blockSize), ts) + } return mockBlock, nil } md := testNamespaceMetadata(blockSize, 4*time.Hour) @@ -243,22 +256,9 @@ func TestNamespaceIndexWrite(t *testing.T) { id := ident.StringID("foo") tag := ident.StringTag("name", "value") tags := ident.NewTags(tag) - lifecycle := index.NewMockOnIndexSeries(ctrl) - mockBlock.EXPECT(). - WriteBatch(gomock.Any()). - Return(index.WriteBatchResult{}, nil). 
- Do(func(batch *index.WriteBatch) { - docs := batch.PendingDocs() - require.Equal(t, 1, len(docs)) - require.Equal(t, doc.Metadata{ - ID: id.Bytes(), - Fields: doc.Fields{{Name: tag.Name.Bytes(), Value: tag.Value.Bytes()}}, - }, docs[0]) - entries := batch.PendingEntries() - require.Equal(t, 1, len(entries)) - require.True(t, entries[0].Timestamp.Equal(now)) - require.True(t, entries[0].OnIndexSeries == lifecycle) // Just ptr equality - }) + lifecycle := doc.NewMockOnIndexSeries(ctrl) + mockWriteBatch(t, &now, lifecycle, mockBlock, &tag) + lifecycle.EXPECT().IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()).Return(false) batch := index.NewWriteBatch(index.WriteBatchOptions{ IndexBlockSize: blockSize, }) @@ -283,21 +283,27 @@ func TestNamespaceIndexWriteCreatesBlock(t *testing.T) { opts := DefaultTestOptions() opts = opts.SetClockOptions(opts.ClockOptions().SetNowFn(nowFn)) + bActive := index.NewMockBlock(ctrl) + bActive.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + bActive.EXPECT().Close().Return(nil) + bActive.EXPECT().StartTime().Return(t0).AnyTimes() b0 := index.NewMockBlock(ctrl) b0.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0.EXPECT().Close().Return(nil) b0.EXPECT().StartTime().Return(t0).AnyTimes() b1 := index.NewMockBlock(ctrl) b1.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() - b1.EXPECT().Close().Return(nil) b1.EXPECT().StartTime().Return(t1).AnyTimes() newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { + if opts.ActiveBlock { + return bActive, nil + } if ts.Equal(t0) { return b0, nil } @@ -319,23 +325,11 @@ func TestNamespaceIndexWriteCreatesBlock(t *testing.T) { id := ident.StringID("foo") tag := ident.StringTag("name", "value") tags := ident.NewTags(tag) - lifecycle := index.NewMockOnIndexSeries(ctrl) - b1.EXPECT(). - WriteBatch(gomock.Any()). - Return(index.WriteBatchResult{}, nil). 
- Do(func(batch *index.WriteBatch) { - docs := batch.PendingDocs() - require.Equal(t, 1, len(docs)) - require.Equal(t, doc.Metadata{ - ID: id.Bytes(), - Fields: doc.Fields{{Name: tag.Name.Bytes(), Value: tag.Value.Bytes()}}, - }, docs[0]) - entries := batch.PendingEntries() - require.Equal(t, 1, len(entries)) - require.True(t, entries[0].Timestamp.Equal(now)) - require.True(t, entries[0].OnIndexSeries == lifecycle) // Just ptr equality - }) - + lifecycle := doc.NewMockOnIndexSeries(ctrl) + mockWriteBatch(t, &now, lifecycle, bActive, &tag) + lifecycle.EXPECT().IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()). + Return(false). + AnyTimes() nowLock.Lock() now = now.Add(blockSize) nowLock.Unlock() @@ -363,6 +357,9 @@ func TestNamespaceIndexBootstrap(t *testing.T) { opts := DefaultTestOptions() opts = opts.SetClockOptions(opts.ClockOptions().SetNowFn(nowFn)) + bActive := index.NewMockBlock(ctrl) + bActive.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + bActive.EXPECT().StartTime().Return(t0).AnyTimes() b0 := index.NewMockBlock(ctrl) b0.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0.EXPECT().StartTime().Return(t0).AnyTimes() @@ -372,10 +369,13 @@ func TestNamespaceIndexBootstrap(t *testing.T) { newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { + if opts.ActiveBlock { + return bActive, nil + } if ts.Equal(t0) { return b0, nil } @@ -426,16 +426,22 @@ func TestNamespaceIndexTickExpire(t *testing.T) { opts := DefaultTestOptions() opts = opts.SetClockOptions(opts.ClockOptions().SetNowFn(nowFn)) + bActive := index.NewMockBlock(ctrl) + bActive.EXPECT().StartTime().Return(t0).AnyTimes() + bActive.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0 := index.NewMockBlock(ctrl) b0.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0.EXPECT().StartTime().Return(t0).AnyTimes() newBlockFn := func( ts xtime.UnixNano, md 
namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { + if opts.ActiveBlock { + return bActive, nil + } if ts.Equal(t0) { return b0, nil } @@ -452,11 +458,16 @@ func TestNamespaceIndexTickExpire(t *testing.T) { nowLock.Unlock() c := context.NewCancellable() + + bActive.EXPECT().Tick(c).Return(index.BlockTickResult{}, nil) + b0.EXPECT().Close().Return(nil) + result, err := idx.Tick(c, xtime.ToUnixNano(nowFn())) require.NoError(t, err) require.Equal(t, namespaceIndexTickResult{ - NumBlocksEvicted: 1, + NumBlocks: 0, + NumBlocksEvicted: 0, }, result) } @@ -477,6 +488,10 @@ func TestNamespaceIndexTick(t *testing.T) { opts := DefaultTestOptions() opts = opts.SetClockOptions(opts.ClockOptions().SetNowFn(nowFn)) + bActive := index.NewMockBlock(ctrl) + bActive.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + bActive.EXPECT().Close().Return(nil) + bActive.EXPECT().StartTime().Return(t0).AnyTimes() b0 := index.NewMockBlock(ctrl) b0.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0.EXPECT().Close().Return(nil) @@ -484,10 +499,13 @@ func TestNamespaceIndexTick(t *testing.T) { newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { + if opts.ActiveBlock { + return bActive, nil + } if ts.Equal(t0) { return b0, nil } @@ -504,16 +522,28 @@ func TestNamespaceIndexTick(t *testing.T) { }() c := context.NewCancellable() - b0.EXPECT().Tick(c).Return(index.BlockTickResult{ - NumDocs: 10, - NumSegments: 2, - }, nil) + + bActive.EXPECT().Tick(c). + Return(index.BlockTickResult{ + NumDocs: 10, + NumSegments: 2, + }, nil). + AnyTimes() + bActive.EXPECT().IsSealed().Return(false).AnyTimes() + + b0.EXPECT().Tick(c). 
+ Return(index.BlockTickResult{ + NumDocs: 10, + NumSegments: 2, + }, nil) + b0.EXPECT().IsSealed().Return(false) + result, err := idx.Tick(c, xtime.ToUnixNano(nowFn())) require.NoError(t, err) require.Equal(t, namespaceIndexTickResult{ NumBlocks: 1, - NumSegments: 2, - NumTotalDocs: 10, + NumSegments: 4, + NumTotalDocs: 20, }, result) nowLock.Lock() @@ -524,28 +554,27 @@ func TestNamespaceIndexTick(t *testing.T) { NumDocs: 10, NumSegments: 2, }, nil) - b0.EXPECT().IsSealed().Return(false) - b0.EXPECT().Seal().Return(nil) + b0.EXPECT().IsSealed().Return(false).Times(1) + b0.EXPECT().Seal().Return(nil).AnyTimes() result, err = idx.Tick(c, xtime.ToUnixNano(nowFn())) require.NoError(t, err) require.Equal(t, namespaceIndexTickResult{ NumBlocks: 1, - NumBlocksSealed: 1, - NumSegments: 2, - NumTotalDocs: 10, + NumBlocksSealed: 0, + NumSegments: 4, + NumTotalDocs: 20, }, result) b0.EXPECT().Tick(c).Return(index.BlockTickResult{ NumDocs: 10, NumSegments: 2, }, nil) - b0.EXPECT().IsSealed().Return(true) result, err = idx.Tick(c, xtime.ToUnixNano(nowFn())) require.NoError(t, err) require.Equal(t, namespaceIndexTickResult{ NumBlocks: 1, - NumSegments: 2, - NumTotalDocs: 10, + NumSegments: 4, + NumTotalDocs: 20, }, result) } @@ -568,6 +597,11 @@ func TestNamespaceIndexBlockQuery(t *testing.T) { opts := DefaultTestOptions() opts = opts.SetClockOptions(opts.ClockOptions().SetNowFn(nowFn)) + bActive := index.NewMockBlock(ctrl) + bActive.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + bActive.EXPECT().Close().Return(nil) + bActive.EXPECT().StartTime().Return(t0).AnyTimes() + bActive.EXPECT().EndTime().Return(t0.Add(blockSize)).AnyTimes() b0 := index.NewMockBlock(ctrl) b0.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0.EXPECT().Close().Return(nil) @@ -581,10 +615,13 @@ func TestNamespaceIndexBlockQuery(t *testing.T) { newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io 
index.Options, ) (index.Block, error) { + if opts.ActiveBlock { + return bActive, nil + } if ts.Equal(t0) { return b0, nil } @@ -637,12 +674,19 @@ func TestNamespaceIndexBlockQuery(t *testing.T) { EndExclusive: now.Add(time.Minute), } + // Lock to prevent race given these blocks are processed concurrently. + var resultLock sync.Mutex + // create initial span from a mock tracer and get ctx mtr := mocktracer.New() sp := mtr.StartSpan("root") ctx.SetGoContext(opentracing.ContextWithSpan(stdlibctx.Background(), sp)) + mockIterActive := index.NewMockQueryIterator(ctrl) mockIter0 := index.NewMockQueryIterator(ctrl) + bActive.EXPECT().QueryIter(gomock.Any(), q).Return(mockIterActive, nil) + mockIterActive.EXPECT().Done().Return(true) + mockIterActive.EXPECT().Close().Return(nil) b0.EXPECT().QueryIter(gomock.Any(), q).Return(mockIter0, nil) mockIter0.EXPECT().Done().Return(true) mockIter0.EXPECT().Close().Return(nil) @@ -657,6 +701,9 @@ func TestNamespaceIndexBlockQuery(t *testing.T) { EndExclusive: t2.Add(time.Minute), RequireExhaustive: test.requireExhaustive, } + bActive.EXPECT().QueryIter(gomock.Any(), q).Return(mockIterActive, nil) + mockIterActive.EXPECT().Done().Return(true) + mockIterActive.EXPECT().Close().Return(nil) b0.EXPECT().QueryIter(gomock.Any(), q).Return(mockIter0, nil) mockIter0.EXPECT().Done().Return(true) mockIter0.EXPECT().Close().Return(nil) @@ -677,28 +724,13 @@ func TestNamespaceIndexBlockQuery(t *testing.T) { RequireExhaustive: test.requireExhaustive, SeriesLimit: 1, } - b0.EXPECT().QueryIter(gomock.Any(), q).Return(mockIter0, nil) - b0.EXPECT().QueryWithIter(gomock.Any(), qOpts, mockIter0, gomock.Any(), gomock.Any(), gomock.Any()). 
- DoAndReturn(func( - ctx context.Context, - opts index.QueryOptions, - iter index.QueryIterator, - r index.QueryResults, - deadline time.Time, - logFields []opentracinglog.Field, - ) error { - _, _, err = r.AddDocuments([]doc.Document{ - doc.NewDocumentFromMetadata(doc.Metadata{ID: []byte("A")}), - doc.NewDocumentFromMetadata(doc.Metadata{ID: []byte("B")}), - }) - require.NoError(t, err) - return nil - }) - gomock.InOrder( - mockIter0.EXPECT().Done().Return(false), - mockIter0.EXPECT().Done().Return(true), - mockIter0.EXPECT().Close().Return(nil), - ) + + docs := []doc.Document{ + doc.NewDocumentFromMetadata(doc.Metadata{ID: []byte("A")}), + doc.NewDocumentFromMetadata(doc.Metadata{ID: []byte("B")}), + } + mockQueryWithIter(t, mockIterActive, bActive, q, qOpts, &resultLock, docs) + mockQueryWithIter(t, mockIter0, b0, q, qOpts, &resultLock, docs) result, err = idx.Query(ctx, q, qOpts) if test.requireExhaustive { @@ -711,7 +743,7 @@ func TestNamespaceIndexBlockQuery(t *testing.T) { sp.Finish() spans := mtr.FinishedSpans() - require.Len(t, spans, 8) + require.Len(t, spans, 9) }) } } @@ -734,6 +766,11 @@ func TestLimits(t *testing.T) { opts := DefaultTestOptions() opts = opts.SetClockOptions(opts.ClockOptions().SetNowFn(nowFn)) + bActive := index.NewMockBlock(ctrl) + bActive.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + bActive.EXPECT().Close().Return(nil).AnyTimes() + bActive.EXPECT().StartTime().Return(t0).AnyTimes() + bActive.EXPECT().EndTime().Return(t0.Add(blockSize)).AnyTimes() b0 := index.NewMockBlock(ctrl) b0.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0.EXPECT().Close().Return(nil).AnyTimes() @@ -742,10 +779,13 @@ func TestLimits(t *testing.T) { newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { + if opts.ActiveBlock { + return bActive, nil + } if ts.Equal(t0) { return b0, nil } @@ -814,7 +854,7 @@ func 
TestLimits(t *testing.T) { docsLimit: 0, requireExhaustive: true, expectedErr: "query exceeded limit: require_exhaustive=true, " + - "series_limit=1, series_matched=1, docs_limit=0, docs_matched=2", + "series_limit=1, series_matched=1, docs_limit=0, docs_matched=4", expectedQueryLimitExceededError: true, }, { @@ -823,7 +863,7 @@ func TestLimits(t *testing.T) { docsLimit: 1, requireExhaustive: true, expectedErr: "query exceeded limit: require_exhaustive=true, " + - "series_limit=0, series_matched=1, docs_limit=1, docs_matched=2", + "series_limit=0, series_matched=1, docs_limit=1, docs_matched=4", expectedQueryLimitExceededError: true, }, { @@ -832,7 +872,7 @@ func TestLimits(t *testing.T) { docsLimit: 1, requireExhaustive: true, expectedErr: "query exceeded limit: require_exhaustive=true, " + - "series_limit=1, series_matched=1, docs_limit=1, docs_matched=2", + "series_limit=1, series_matched=1, docs_limit=1, docs_matched=4", expectedQueryLimitExceededError: true, }, } { @@ -848,37 +888,27 @@ func TestLimits(t *testing.T) { RequireExhaustive: test.requireExhaustive, } + // Lock to prevent race given these blocks are processed concurrently. + var resultLock sync.Mutex + // create initial span from a mock tracer and get ctx mtr := mocktracer.New() sp := mtr.StartSpan("root") ctx.SetGoContext(opentracing.ContextWithSpan(stdlibctx.Background(), sp)) + mockIterActive := index.NewMockQueryIterator(ctrl) mockIter := index.NewMockQueryIterator(ctrl) - b0.EXPECT().QueryIter(gomock.Any(), q).Return(mockIter, nil) - gomock.InOrder( - mockIter.EXPECT().Done().Return(false), - mockIter.EXPECT().Done().Return(true), - mockIter.EXPECT().Close().Return(err), - ) - b0.EXPECT().QueryWithIter(gomock.Any(), qOpts, mockIter, gomock.Any(), gomock.Any(), gomock.Any()). 
- DoAndReturn(func(ctx context.Context, - opts interface{}, - iter interface{}, - results index.DocumentResults, - deadline interface{}, - logFields interface{}) error { - _, _, err = results.AddDocuments([]doc.Document{ - // Results in size=1 and docs=2. - // Byte array represents ID encoded as bytes. - // 1 represents the ID length in bytes, 49 is the ID itself which is - // the ASCII value for A - doc.NewDocumentFromMetadata(doc.Metadata{ID: []byte("A")}), - doc.NewDocumentFromMetadata(doc.Metadata{ID: []byte("A")}), - }) - require.NoError(t, err) - return nil - }) + docs := []doc.Document{ + // Results in size=1 and docs=2. + // Byte array represents ID encoded as bytes. + // 1 represents the ID length in bytes, 49 is the ID itself which is + // the ASCII value for A + doc.NewDocumentFromMetadata(doc.Metadata{ID: []byte("A")}), + doc.NewDocumentFromMetadata(doc.Metadata{ID: []byte("A")}), + } + mockQueryWithIter(t, mockIterActive, bActive, q, qOpts, &resultLock, docs) + mockQueryWithIter(t, mockIter, b0, q, qOpts, &resultLock, docs) result, err := idx.Query(ctx, q, qOpts) if test.seriesLimit == 0 && test.docsLimit == 0 { @@ -918,6 +948,11 @@ func TestNamespaceIndexBlockQueryReleasingContext(t *testing.T) { opts := DefaultTestOptions() opts = opts.SetClockOptions(opts.ClockOptions().SetNowFn(nowFn)) + bActive := index.NewMockBlock(ctrl) + bActive.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + bActive.EXPECT().Close().Return(nil) + bActive.EXPECT().StartTime().Return(t0).AnyTimes() + bActive.EXPECT().EndTime().Return(t0.Add(blockSize)).AnyTimes() b0 := index.NewMockBlock(ctrl) b0.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0.EXPECT().Close().Return(nil) @@ -931,10 +966,13 @@ func TestNamespaceIndexBlockQueryReleasingContext(t *testing.T) { newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { + if 
opts.ActiveBlock { + return bActive, nil + } if ts.Equal(t0) { return b0, nil } @@ -987,14 +1025,20 @@ func TestNamespaceIndexBlockQueryReleasingContext(t *testing.T) { StartInclusive: t0, EndExclusive: now.Add(time.Minute), } + mockIterActive := index.NewMockQueryIterator(ctrl) mockIter := index.NewMockQueryIterator(ctrl) gomock.InOrder( mockPool.EXPECT().Get().Return(stubResult), + bActive.EXPECT().QueryIter(ctx, q).Return(mockIterActive, nil), b0.EXPECT().QueryIter(ctx, q).Return(mockIter, nil), - mockIter.EXPECT().Done().Return(true), - mockIter.EXPECT().Close().Return(nil), mockPool.EXPECT().Put(stubResult), ) + + mockIter.EXPECT().Done().Return(true) + mockIterActive.EXPECT().Done().Return(true) + mockIter.EXPECT().Close().Return(nil) + mockIterActive.EXPECT().Close().Return(nil) + _, err = idx.Query(ctx, q, qOpts) require.NoError(t, err) ctx.BlockingClose() @@ -1020,6 +1064,11 @@ func TestNamespaceIndexBlockAggregateQuery(t *testing.T) { opts := DefaultTestOptions() opts = opts.SetClockOptions(opts.ClockOptions().SetNowFn(nowFn)) + bActive := index.NewMockBlock(ctrl) + bActive.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + bActive.EXPECT().Close().Return(nil) + bActive.EXPECT().StartTime().Return(t0).AnyTimes() + bActive.EXPECT().EndTime().Return(t0.Add(blockSize)).AnyTimes() b0 := index.NewMockBlock(ctrl) b0.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0.EXPECT().Close().Return(nil) @@ -1033,10 +1082,13 @@ func TestNamespaceIndexBlockAggregateQuery(t *testing.T) { newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { + if opts.ActiveBlock { + return bActive, nil + } if ts.Equal(t0) { return b0, nil } @@ -1099,6 +1151,10 @@ func TestNamespaceIndexBlockAggregateQuery(t *testing.T) { } aggOpts := index.AggregationOptions{QueryOptions: qOpts} + mockIterActive := index.NewMockAggregateIterator(ctrl) + 
bActive.EXPECT().AggregateIter(gomock.Any(), gomock.Any()).Return(mockIterActive, nil) + mockIterActive.EXPECT().Done().Return(true) + mockIterActive.EXPECT().Close().Return(nil) mockIter0 := index.NewMockAggregateIterator(ctrl) b0.EXPECT().AggregateIter(gomock.Any(), gomock.Any()).Return(mockIter0, nil) mockIter0.EXPECT().Done().Return(true) @@ -1114,6 +1170,9 @@ func TestNamespaceIndexBlockAggregateQuery(t *testing.T) { RequireExhaustive: test.requireExhaustive, } aggOpts = index.AggregationOptions{QueryOptions: qOpts} + bActive.EXPECT().AggregateIter(gomock.Any(), gomock.Any()).Return(mockIterActive, nil) + mockIterActive.EXPECT().Done().Return(true) + mockIterActive.EXPECT().Close().Return(nil) b0.EXPECT().AggregateIter(gomock.Any(), gomock.Any()).Return(mockIter0, nil) mockIter0.EXPECT().Done().Return(true) mockIter0.EXPECT().Close().Return(nil) @@ -1133,6 +1192,32 @@ func TestNamespaceIndexBlockAggregateQuery(t *testing.T) { RequireExhaustive: test.requireExhaustive, DocsLimit: 1, } + bActive.EXPECT().AggregateIter(gomock.Any(), gomock.Any()).Return(mockIterActive, nil) + //nolint: dupl + bActive.EXPECT(). + AggregateWithIter(gomock.Any(), mockIter0, gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(func( + ctx context.Context, + iter index.AggregateIterator, + opts index.QueryOptions, + results index.AggregateResults, + deadline time.Time, + logFields []opentracinglog.Field, + ) error { + _, _ = results.AddFields([]index.AggregateResultsEntry{{ + Field: ident.StringID("A"), + Terms: []ident.ID{ident.StringID("foo")}, + }, { + Field: ident.StringID("B"), + Terms: []ident.ID{ident.StringID("bar")}, + }}) + return nil + }) + gomock.InOrder( + mockIterActive.EXPECT().Done().Return(false), + mockIterActive.EXPECT().Done().Return(true), + mockIterActive.EXPECT().Close().Return(nil), + ) b0.EXPECT().AggregateIter(gomock.Any(), gomock.Any()).Return(mockIter0, nil) //nolint: dupl b0.EXPECT(). 
@@ -1171,7 +1256,7 @@ func TestNamespaceIndexBlockAggregateQuery(t *testing.T) { sp.Finish() spans := mtr.FinishedSpans() - require.Len(t, spans, 8) + require.Len(t, spans, 9) }) } } @@ -1196,6 +1281,11 @@ func TestNamespaceIndexBlockAggregateQueryReleasingContext(t *testing.T) { opts = opts.SetClockOptions(opts.ClockOptions().SetNowFn(nowFn)) query := idx.NewTermQuery([]byte("a"), []byte("b")) + bActive := index.NewMockBlock(ctrl) + bActive.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + bActive.EXPECT().Close().Return(nil) + bActive.EXPECT().StartTime().Return(t0).AnyTimes() + bActive.EXPECT().EndTime().Return(t0.Add(blockSize)).AnyTimes() b0 := index.NewMockBlock(ctrl) b0.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0.EXPECT().Close().Return(nil) @@ -1209,10 +1299,13 @@ func TestNamespaceIndexBlockAggregateQueryReleasingContext(t *testing.T) { newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { + if opts.ActiveBlock { + return bActive, nil + } if ts.Equal(t0) { return b0, nil } @@ -1271,14 +1364,19 @@ func TestNamespaceIndexBlockAggregateQueryReleasingContext(t *testing.T) { } aggOpts := index.AggregationOptions{QueryOptions: qOpts} + mockIterActive := index.NewMockAggregateIterator(ctrl) mockIter := index.NewMockAggregateIterator(ctrl) gomock.InOrder( mockPool.EXPECT().Get().Return(stubResult), + bActive.EXPECT().AggregateIter(ctx, gomock.Any()).Return(mockIterActive, nil), b0.EXPECT().AggregateIter(ctx, gomock.Any()).Return(mockIter, nil), - mockIter.EXPECT().Done().Return(true), - mockIter.EXPECT().Close().Return(nil), mockPool.EXPECT().Put(stubResult), ) + mockIter.EXPECT().Done().Return(true) + mockIterActive.EXPECT().Done().Return(true) + mockIter.EXPECT().Close().Return(nil) + mockIterActive.EXPECT().Close().Return(nil) + _, err = idx.AggregateQuery(ctx, q, aggOpts) require.NoError(t, err) 
ctx.BlockingClose() @@ -1304,6 +1402,11 @@ func TestNamespaceIndexBlockAggregateQueryAggPath(t *testing.T) { opts := DefaultTestOptions() opts = opts.SetClockOptions(opts.ClockOptions().SetNowFn(nowFn)) + bActive := index.NewMockBlock(ctrl) + bActive.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + bActive.EXPECT().Close().Return(nil) + bActive.EXPECT().StartTime().Return(t0).AnyTimes() + bActive.EXPECT().EndTime().Return(t1).AnyTimes() b0 := index.NewMockBlock(ctrl) b0.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() b0.EXPECT().Close().Return(nil) @@ -1317,10 +1420,13 @@ func TestNamespaceIndexBlockAggregateQueryAggPath(t *testing.T) { newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { + if opts.ActiveBlock { + return bActive, nil + } if ts.Equal(t0) { return b0, nil } @@ -1378,6 +1484,10 @@ func TestNamespaceIndexBlockAggregateQueryAggPath(t *testing.T) { q := index.Query{ Query: query, } + mockIterActive := index.NewMockAggregateIterator(ctrl) + mockIterActive.EXPECT().Done().Return(true) + mockIterActive.EXPECT().Close().Return(nil) + bActive.EXPECT().AggregateIter(ctx, gomock.Any()).Return(mockIterActive, nil) mockIter0 := index.NewMockAggregateIterator(ctrl) mockIter0.EXPECT().Done().Return(true) mockIter0.EXPECT().Close().Return(nil) @@ -1394,6 +1504,10 @@ func TestNamespaceIndexBlockAggregateQueryAggPath(t *testing.T) { } aggOpts = index.AggregationOptions{QueryOptions: qOpts} + mockIterActive.EXPECT().Done().Return(true) + mockIterActive.EXPECT().Close().Return(nil) + bActive.EXPECT().AggregateIter(ctx, gomock.Any()).Return(mockIterActive, nil) + mockIter0.EXPECT().Done().Return(true) mockIter0.EXPECT().Close().Return(nil) b0.EXPECT().AggregateIter(ctx, gomock.Any()).Return(mockIter0, nil) @@ -1413,6 +1527,32 @@ func TestNamespaceIndexBlockAggregateQueryAggPath(t *testing.T) { RequireExhaustive: 
test.requireExhaustive, DocsLimit: 1, } + bActive.EXPECT().AggregateIter(gomock.Any(), gomock.Any()).Return(mockIterActive, nil) + //nolint: dupl + bActive.EXPECT(). + AggregateWithIter(gomock.Any(), mockIter0, gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(func( + ctx context.Context, + iter index.AggregateIterator, + opts index.QueryOptions, + results index.AggregateResults, + deadline time.Time, + logFields []opentracinglog.Field, + ) error { + _, _ = results.AddFields([]index.AggregateResultsEntry{{ + Field: ident.StringID("A"), + Terms: []ident.ID{ident.StringID("foo")}, + }, { + Field: ident.StringID("B"), + Terms: []ident.ID{ident.StringID("bar")}, + }}) + return nil + }) + gomock.InOrder( + mockIterActive.EXPECT().Done().Return(false), + mockIterActive.EXPECT().Done().Return(true), + mockIterActive.EXPECT().Close().Return(nil), + ) b0.EXPECT().AggregateIter(gomock.Any(), gomock.Any()).Return(mockIter0, nil) //nolint: dupl b0.EXPECT(). @@ -1452,3 +1592,57 @@ func TestNamespaceIndexBlockAggregateQueryAggPath(t *testing.T) { }) } } + +func mockWriteBatch(t *testing.T, + now *xtime.UnixNano, + lifecycle *doc.MockOnIndexSeries, + block *index.MockBlock, + tag *ident.Tag, +) { + block.EXPECT(). + WriteBatch(gomock.Any()). + Return(index.WriteBatchResult{}, nil). 
+ Do(func(batch *index.WriteBatch) { + docs := batch.PendingDocs() + require.Equal(t, 1, len(docs)) + require.Equal(t, doc.Metadata{ + ID: id.Bytes(), + Fields: doc.Fields{{Name: tag.Name.Bytes(), Value: tag.Value.Bytes()}}, + }, docs[0]) + entries := batch.PendingEntries() + require.Equal(t, 1, len(entries)) + require.True(t, entries[0].Timestamp.Equal(*now)) + require.True(t, entries[0].OnIndexSeries == lifecycle) // Just ptr equality + }) +} + +func mockQueryWithIter(t *testing.T, + iter *index.MockQueryIterator, + block *index.MockBlock, + q index.Query, + qOpts index.QueryOptions, + resultLock *sync.Mutex, + docsToAdd []doc.Document, +) { + block.EXPECT().QueryIter(gomock.Any(), q).Return(iter, nil) + block.EXPECT().QueryWithIter(gomock.Any(), qOpts, iter, gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(func( + ctx context.Context, + opts index.QueryOptions, + iter index.QueryIterator, + r index.QueryResults, + deadline time.Time, + logFields []opentracinglog.Field, + ) error { + resultLock.Lock() + defer resultLock.Unlock() + _, _, err := r.AddDocuments(docsToAdd) + require.NoError(t, err) + return nil + }) + gomock.InOrder( + iter.EXPECT().Done().Return(false), + iter.EXPECT().Done().Return(true), + iter.EXPECT().Close().Return(nil), + ) +} diff --git a/src/dbnode/storage/index_insert_queue_test.go b/src/dbnode/storage/index_insert_queue_test.go index b52aac8052..b69a3005c3 100644 --- a/src/dbnode/storage/index_insert_queue_test.go +++ b/src/dbnode/storage/index_insert_queue_test.go @@ -29,6 +29,7 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/storage/index" + "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/x/ident" xtest "github.com/m3db/m3/src/x/test" xtime "github.com/m3db/m3/src/x/time" @@ -89,7 +90,7 @@ func TestIndexInsertQueueCallback(t *testing.T) { q = newTestIndexInsertQueue(newTestNamespaceMetadata(t)) insertLock sync.Mutex insertedBatches []*index.WriteBatch - callback = 
index.NewMockOnIndexSeries(ctrl) + callback = doc.NewMockOnIndexSeries(ctrl) ) q.indexBatchFn = func(inserts *index.WriteBatch) { insertLock.Lock() @@ -150,7 +151,7 @@ func TestIndexInsertQueueBatchBackoff(t *testing.T) { } q.indexBatchBackoff = backoff - callback := index.NewMockOnIndexSeries(ctrl) + callback := doc.NewMockOnIndexSeries(ctrl) var slept time.Duration var numSleeps int diff --git a/src/dbnode/storage/index_query_concurrent_test.go b/src/dbnode/storage/index_query_concurrent_test.go index e0903f9aea..49048eabc6 100644 --- a/src/dbnode/storage/index_query_concurrent_test.go +++ b/src/dbnode/storage/index_query_concurrent_test.go @@ -153,6 +153,7 @@ func testNamespaceIndexHighConcurrentQueries( blockIdx = -1 ) for st := min; !st.After(max); st = st.Add(test.indexBlockSize) { + st := st blockIdx++ blockStarts = append(blockStarts, st) @@ -161,7 +162,7 @@ func testNamespaceIndexHighConcurrentQueries( var onIndexWg sync.WaitGroup onIndexWg.Add(idsPerBlock) - onIndexSeries := index.NewMockOnIndexSeries(ctrl) + onIndexSeries := doc.NewMockOnIndexSeries(ctrl) onIndexSeries.EXPECT(). OnIndexSuccess(gomock.Any()). Times(idsPerBlock). @@ -171,6 +172,15 @@ func testNamespaceIndexHighConcurrentQueries( onIndexSeries.EXPECT(). OnIndexFinalize(gomock.Any()). Times(idsPerBlock) + onIndexSeries.EXPECT(). + IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()). + Times(idsPerBlock) + onIndexSeries.EXPECT(). + IndexedForBlockStart(gomock.Any()). + DoAndReturn(func(ts xtime.UnixNano) bool { + return ts.Equal(st) + }). + AnyTimes() batch := index.NewWriteBatch(index.WriteBatchOptions{ InitialCapacity: idsPerBlock, @@ -211,66 +221,12 @@ func testNamespaceIndexHighConcurrentQueries( restoreNow() nsIdx.state.Lock() - for start, block := range nsIdx.state.blocksByTime { - block := block // Capture for lambda - mockBlock := index.NewMockBlock(ctrl) - - mockBlock.EXPECT(). - StartTime(). - DoAndReturn(func() xtime.UnixNano { return block.StartTime() }). 
- AnyTimes() - mockBlock.EXPECT(). - EndTime(). - DoAndReturn(func() xtime.UnixNano { return block.EndTime() }). - AnyTimes() - mockBlock.EXPECT().QueryIter(gomock.Any(), gomock.Any()).DoAndReturn(func( - ctx context.Context, query index.Query) (index.QueryIterator, error) { - return block.QueryIter(ctx, query) - }, - ).AnyTimes() - - if opts.blockErrors { - mockBlock.EXPECT(). - QueryWithIter(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). - DoAndReturn(func( - _ context.Context, - _ index.QueryOptions, - _ index.QueryIterator, - _ index.QueryResults, - _ time.Time, - _ []opentracinglog.Field, - ) error { - return errors.New("some-error") - }). - AnyTimes() - } else { - mockBlock.EXPECT(). - QueryWithIter(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). - DoAndReturn(func( - ctx context.Context, - opts index.QueryOptions, - iter index.QueryIterator, - r index.QueryResults, - deadline time.Time, - logFields []opentracinglog.Field, - ) error { - time.Sleep(timeoutValue + time.Second) - return block.QueryWithIter(ctx, opts, iter, r, deadline, logFields) - }). - AnyTimes() - } - mockBlock.EXPECT(). - Stats(gomock.Any()). - Return(nil). - AnyTimes() - mockBlock.EXPECT(). - Close(). - DoAndReturn(func() error { - return block.Close() - }) - nsIdx.state.blocksByTime[start] = mockBlock + for start, block := range nsIdx.state.blocksByTime { + nsIdx.state.blocksByTime[start] = newMockBlock(ctrl, opts, timeoutValue, block) } + nsIdx.activeBlock = newMockBlock(ctrl, opts, timeoutValue, nsIdx.activeBlock) + nsIdx.state.Unlock() } @@ -414,3 +370,66 @@ func testNamespaceIndexHighConcurrentQueries( logger.Info("finished with timeouts") } } + +func newMockBlock(ctrl *gomock.Controller, + opts testNamespaceIndexHighConcurrentQueriesOptions, + timeout time.Duration, + block index.Block, +) *index.MockBlock { + mockBlock := index.NewMockBlock(ctrl) + mockBlock.EXPECT(). + StartTime(). 
+ DoAndReturn(func() xtime.UnixNano { return block.StartTime() }). + AnyTimes() + mockBlock.EXPECT(). + EndTime(). + DoAndReturn(func() xtime.UnixNano { return block.EndTime() }). + AnyTimes() + mockBlock.EXPECT().QueryIter(gomock.Any(), gomock.Any()).DoAndReturn(func( + ctx context.Context, query index.Query) (index.QueryIterator, error) { + return block.QueryIter(ctx, query) + }, + ).AnyTimes() + + if opts.blockErrors { + mockBlock.EXPECT(). + QueryWithIter(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(func( + _ context.Context, + _ index.QueryOptions, + _ index.QueryIterator, + _ index.QueryResults, + _ time.Time, + _ []opentracinglog.Field, + ) error { + return errors.New("some-error") + }). + AnyTimes() + } else { + mockBlock.EXPECT(). + QueryWithIter(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(func( + ctx context.Context, + opts index.QueryOptions, + iter index.QueryIterator, + r index.QueryResults, + deadline time.Time, + logFields []opentracinglog.Field, + ) error { + time.Sleep(timeout + time.Second) + return block.QueryWithIter(ctx, opts, iter, r, deadline, logFields) + }). + AnyTimes() + } + + mockBlock.EXPECT(). + Stats(gomock.Any()). + Return(nil). + AnyTimes() + mockBlock.EXPECT(). + Close(). 
+ DoAndReturn(func() error { + return block.Close() + }) + return mockBlock +} diff --git a/src/dbnode/storage/index_queue_forward_write_test.go b/src/dbnode/storage/index_queue_forward_write_test.go index 320e7d9055..ea1138fdca 100644 --- a/src/dbnode/storage/index_queue_forward_write_test.go +++ b/src/dbnode/storage/index_queue_forward_write_test.go @@ -77,6 +77,7 @@ func generateOptionsNowAndBlockSize() (Options, xtime.UnixNano, time.Duration) { func setupForwardIndex( t *testing.T, ctrl *gomock.Controller, + expectAggregateQuery bool, ) (NamespaceIndex, xtime.UnixNano, time.Duration) { newFn := func( fn nsIndexInsertBatchFn, @@ -99,27 +100,34 @@ func setupForwardIndex( require.NoError(t, err) var ( - ts = idx.(*nsIndex).state.latestBlock.StartTime() - nextTs = ts.Add(blockSize) - next = ts.Truncate(blockSize).Add(blockSize) - id = ident.StringID("foo") - tags = ident.NewTags( + ts = idx.(*nsIndex).state.latestBlock.StartTime() + nextTS = ts.Add(blockSize) + current = ts.Truncate(blockSize) + next = current.Add(blockSize) + tags = ident.NewTags( ident.StringTag("name", "value"), ) - lifecycle = index.NewMockOnIndexSeries(ctrl) + lifecycle = doc.NewMockOnIndexSeries(ctrl) ) gomock.InOrder( + lifecycle.EXPECT().IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()).Return(false), + lifecycle.EXPECT().NeedsIndexUpdate(next).Return(true), - lifecycle.EXPECT().OnIndexPrepare(), + lifecycle.EXPECT().OnIndexPrepare(next), lifecycle.EXPECT().OnIndexSuccess(ts), lifecycle.EXPECT().OnIndexFinalize(ts), - lifecycle.EXPECT().OnIndexSuccess(nextTs), - lifecycle.EXPECT().OnIndexFinalize(nextTs), + lifecycle.EXPECT().OnIndexSuccess(nextTS), + lifecycle.EXPECT().OnIndexFinalize(nextTS), ) + if !expectAggregateQuery { + lifecycle.EXPECT().IndexedForBlockStart(ts).Return(true) + lifecycle.EXPECT().IndexedForBlockStart(next).Return(true) + } + entry, doc := testWriteBatchEntry(id, tags, now, lifecycle) batch := testWriteBatch(entry, doc, 
testWriteBatchBlockSizeOption(blockSize)) require.NoError(t, idx.WriteBatch(batch)) @@ -135,7 +143,7 @@ func TestNamespaceForwardIndexInsertQuery(t *testing.T) { ctx := context.NewBackground() defer ctx.Close() - idx, now, blockSize := setupForwardIndex(t, ctrl) + idx, now, blockSize := setupForwardIndex(t, ctrl, false) defer idx.Close() reQuery, err := m3ninxidx.NewRegexpQuery([]byte("name"), []byte("val.*")) @@ -177,7 +185,7 @@ func TestNamespaceForwardIndexAggregateQuery(t *testing.T) { ctx := context.NewBackground() defer ctx.Close() - idx, now, blockSize := setupForwardIndex(t, ctrl) + idx, now, blockSize := setupForwardIndex(t, ctrl, true) defer idx.Close() reQuery, err := m3ninxidx.NewRegexpQuery([]byte("name"), []byte("val.*")) @@ -221,7 +229,7 @@ func TestNamespaceForwardIndexWideQuery(t *testing.T) { ctx := context.NewBackground() defer ctx.Close() - idx, now, blockSize := setupForwardIndex(t, ctrl) + idx, now, blockSize := setupForwardIndex(t, ctrl, false) defer idx.Close() reQuery, err := m3ninxidx.NewRegexpQuery([]byte("name"), []byte("val.*")) @@ -274,14 +282,14 @@ func setupMockBlock( ts xtime.UnixNano, id ident.ID, tag ident.Tag, - lifecycle index.OnIndexSeries, + lifecycle doc.OnIndexSeries, ) { bl.EXPECT(). WriteBatch(gomock.Any()). Return(index.WriteBatchResult{}, nil). 
Do(func(batch *index.WriteBatch) { docs := batch.PendingDocs() - require.Equal(t, 1, len(docs)) + require.Equal(t, 1, len(docs), id.String()) require.Equal(t, doc.Metadata{ ID: id.Bytes(), Fields: doc.Fields{{Name: tag.Name.Bytes(), Value: tag.Value.Bytes()}}, @@ -290,38 +298,49 @@ func setupMockBlock( require.Equal(t, 1, len(entries)) require.True(t, entries[0].Timestamp.Equal(ts)) require.True(t, entries[0].OnIndexSeries == lifecycle) // Just ptr equality - }) + }).Times(1) } func createMockBlocks( ctrl *gomock.Controller, blockStart xtime.UnixNano, nextBlockStart xtime.UnixNano, -) (*index.MockBlock, *index.MockBlock, index.NewBlockFn) { - mockBlock := index.NewMockBlock(ctrl) - mockBlock.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() - mockBlock.EXPECT().Close().Return(nil) - mockBlock.EXPECT().StartTime().Return(blockStart).AnyTimes() +) (*index.MockBlock, index.NewBlockFn) { + activeBlock := index.NewMockBlock(ctrl) + activeBlock.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + activeBlock.EXPECT().Close().Return(nil) + activeBlock.EXPECT().StartTime().Return(blockStart).AnyTimes() + + block := index.NewMockBlock(ctrl) + block.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() + block.EXPECT().Close().Return(nil) + block.EXPECT().StartTime().Return(blockStart).AnyTimes() futureBlock := index.NewMockBlock(ctrl) futureBlock.EXPECT().Stats(gomock.Any()).Return(nil).AnyTimes() - futureBlock.EXPECT().Close().Return(nil) futureBlock.EXPECT().StartTime().Return(nextBlockStart).AnyTimes() - var madeBlock, madeFuture bool + var madeActive, madeBlock, madeFuture bool newBlockFn := func( ts xtime.UnixNano, md namespace.Metadata, - _ index.BlockOptions, + opts index.BlockOptions, _ namespace.RuntimeOptionsManager, io index.Options, ) (index.Block, error) { + if opts.ActiveBlock && ts.Equal(xtime.UnixNano(0)) { + if madeActive { + return activeBlock, errors.New("already created active block") + } + madeActive = true + return activeBlock, nil + } if 
ts.Equal(blockStart) { if madeBlock { - return mockBlock, errors.New("already created initial block") + return block, errors.New("already created initial block") } madeBlock = true - return mockBlock, nil + return block, nil } else if ts.Equal(nextBlockStart) { if madeFuture { return nil, errors.New("already created forward block") @@ -333,7 +352,7 @@ func createMockBlocks( ts, blockStart, nextBlockStart) } - return mockBlock, futureBlock, newBlockFn + return activeBlock, newBlockFn } func TestNamespaceIndexForwardWrite(t *testing.T) { @@ -343,7 +362,7 @@ func TestNamespaceIndexForwardWrite(t *testing.T) { opts, now, blockSize := generateOptionsNowAndBlockSize() blockStart := now.Truncate(blockSize) futureStart := blockStart.Add(blockSize) - mockBlock, futureBlock, newBlockFn := createMockBlocks(ctrl, blockStart, futureStart) + activeBlock, newBlockFn := createMockBlocks(ctrl, blockStart, futureStart) md := testNamespaceMetadata(blockSize, 4*time.Hour) idx, err := newNamespaceIndexWithNewBlockFn(md, @@ -358,7 +377,7 @@ func TestNamespaceIndexForwardWrite(t *testing.T) { id := ident.StringID("foo") tag := ident.StringTag("name", "value") tags := ident.NewTags(tag) - lifecycle := index.NewMockOnIndexSeries(ctrl) + lifecycle := doc.NewMockOnIndexSeries(ctrl) var ( ts = idx.(*nsIndex).state.latestBlock.StartTime() @@ -366,10 +385,11 @@ func TestNamespaceIndexForwardWrite(t *testing.T) { ) lifecycle.EXPECT().NeedsIndexUpdate(next).Return(true) - lifecycle.EXPECT().OnIndexPrepare() + lifecycle.EXPECT().OnIndexPrepare(next) + lifecycle.EXPECT().IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()).Return(false) - setupMockBlock(t, mockBlock, now, id, tag, lifecycle) - setupMockBlock(t, futureBlock, futureStart, id, tag, lifecycle) + setupMockBlock(t, activeBlock, now, id, tag, lifecycle) + setupMockBlock(t, activeBlock, futureStart, id, tag, lifecycle) batch := index.NewWriteBatch(index.WriteBatchOptions{ IndexBlockSize: blockSize, @@ -385,7 +405,7 @@ func 
TestNamespaceIndexForwardWriteCreatesBlock(t *testing.T) { opts, now, blockSize := generateOptionsNowAndBlockSize() blockStart := now.Truncate(blockSize) futureStart := blockStart.Add(blockSize) - mockBlock, futureBlock, newBlockFn := createMockBlocks(ctrl, blockStart, futureStart) + activeBlock, newBlockFn := createMockBlocks(ctrl, blockStart, futureStart) md := testNamespaceMetadata(blockSize, 4*time.Hour) idx, err := newNamespaceIndexWithNewBlockFn(md, @@ -400,7 +420,7 @@ func TestNamespaceIndexForwardWriteCreatesBlock(t *testing.T) { id := ident.StringID("foo") tag := ident.StringTag("name", "value") tags := ident.NewTags(tag) - lifecycle := index.NewMockOnIndexSeries(ctrl) + lifecycle := doc.NewMockOnIndexSeries(ctrl) var ( ts = idx.(*nsIndex).state.latestBlock.StartTime() @@ -408,10 +428,11 @@ func TestNamespaceIndexForwardWriteCreatesBlock(t *testing.T) { ) lifecycle.EXPECT().NeedsIndexUpdate(next).Return(true) - lifecycle.EXPECT().OnIndexPrepare() + lifecycle.EXPECT().OnIndexPrepare(next) + lifecycle.EXPECT().IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()).Return(false) - setupMockBlock(t, mockBlock, now, id, tag, lifecycle) - setupMockBlock(t, futureBlock, futureStart, id, tag, lifecycle) + setupMockBlock(t, activeBlock, now, id, tag, lifecycle) + setupMockBlock(t, activeBlock, futureStart, id, tag, lifecycle) entry, doc := testWriteBatchEntry(id, tags, now, lifecycle) batch := testWriteBatch(entry, doc, testWriteBatchBlockSizeOption(blockSize)) @@ -577,7 +598,7 @@ func testShardForwardWriteTaggedSyncRefCount( // ensure all entries have no references left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() require.NoError(t, err) require.Equal(t, int32(0), entry.ReaderWriterCount(), id) @@ -593,7 +614,7 @@ func testShardForwardWriteTaggedSyncRefCount( // // ensure all entries have no references 
left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() require.NoError(t, err) require.Equal(t, int32(0), entry.ReaderWriterCount(), id) @@ -637,7 +658,7 @@ func testShardForwardWriteTaggedAsyncRefCount( // ensure all entries have no references left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() require.NoError(t, err) require.Equal(t, int32(0), entry.ReaderWriterCount(), id) @@ -652,7 +673,7 @@ func testShardForwardWriteTaggedAsyncRefCount( // ensure all entries have no references left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() require.NoError(t, err) require.Equal(t, int32(0), entry.ReaderWriterCount(), id) diff --git a/src/dbnode/storage/index_queue_test.go b/src/dbnode/storage/index_queue_test.go index 9ec4ef9d8e..cd8790ae20 100644 --- a/src/dbnode/storage/index_queue_test.go +++ b/src/dbnode/storage/index_queue_test.go @@ -145,8 +145,11 @@ func TestNamespaceIndexWriteAfterClose(t *testing.T) { now := xtime.Now() - lifecycle := index.NewMockOnIndexSeries(ctrl) + lifecycle := doc.NewMockOnIndexSeries(ctrl) lifecycle.EXPECT().OnIndexFinalize(now.Truncate(idx.blockSize)) + lifecycle.EXPECT().IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()). + Return(false). 
+ AnyTimes() entry, document := testWriteBatchEntry(id, tags, now, lifecycle) assert.Error(t, idx.WriteBatch(testWriteBatch(entry, document, testWriteBatchBlockSizeOption(idx.blockSize)))) @@ -166,8 +169,9 @@ func TestNamespaceIndexWriteQueueError(t *testing.T) { ) n := xtime.Now() - lifecycle := index.NewMockOnIndexSeries(ctrl) + lifecycle := doc.NewMockOnIndexSeries(ctrl) lifecycle.EXPECT().OnIndexFinalize(n.Truncate(idx.blockSize)) + lifecycle.EXPECT().IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()).Return(false) q.EXPECT(). InsertBatch(gomock.Any()). Return(nil, fmt.Errorf("random err")) @@ -209,11 +213,14 @@ func TestNamespaceIndexInsertOlderThanRetentionPeriod(t *testing.T) { tags = ident.NewTags( ident.StringTag("name", "value"), ) - lifecycle = index.NewMockOnIndexSeries(ctrl) + lifecycle = doc.NewMockOnIndexSeries(ctrl) ) tooOld := now.Add(-1 * idx.bufferPast).Add(-1 * time.Second) lifecycle.EXPECT().OnIndexFinalize(tooOld.Truncate(idx.blockSize)) + lifecycle.EXPECT().IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()). + Return(false). + AnyTimes() entry, document := testWriteBatchEntry(id, tags, tooOld, lifecycle) batch := testWriteBatch(entry, document, testWriteBatchBlockSizeOption(idx.blockSize)) @@ -272,8 +279,11 @@ func TestNamespaceIndexInsertQueueInteraction(t *testing.T) { now := xtime.Now() var wg sync.WaitGroup - lifecycle := index.NewMockOnIndexSeries(ctrl) + lifecycle := doc.NewMockOnIndexSeries(ctrl) q.EXPECT().InsertBatch(gomock.Any()).Return(&wg, nil) + lifecycle.EXPECT().IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()). + Return(false). 
+ AnyTimes() assert.NoError(t, idx.WriteBatch(testWriteBatch(testWriteBatchEntry(id, tags, now, lifecycle)))) } @@ -281,6 +291,7 @@ func TestNamespaceIndexInsertQueueInteraction(t *testing.T) { func setupIndex(t *testing.T, ctrl *gomock.Controller, now xtime.UnixNano, + expectAggregateQuery bool, ) NamespaceIndex { newFn := func( fn nsIndexInsertBatchFn, @@ -308,11 +319,16 @@ func setupIndex(t *testing.T, tags = ident.NewTags( ident.StringTag("name", "value"), ) - lifecycleFns = index.NewMockOnIndexSeries(ctrl) + lifecycleFns = doc.NewMockOnIndexSeries(ctrl) ) lifecycleFns.EXPECT().OnIndexFinalize(ts) lifecycleFns.EXPECT().OnIndexSuccess(ts) + lifecycleFns.EXPECT().IfAlreadyIndexedMarkIndexSuccessAndFinalize(gomock.Any()).Return(false) + + if !expectAggregateQuery { + lifecycleFns.EXPECT().IndexedForBlockStart(ts).Return(true) + } entry, doc := testWriteBatchEntry(id, tags, now, lifecycleFns) batch := testWriteBatch(entry, doc, testWriteBatchBlockSizeOption(blockSize)) @@ -330,7 +346,7 @@ func TestNamespaceIndexInsertQuery(t *testing.T) { defer ctx.Close() now := xtime.Now() - idx := setupIndex(t, ctrl, now) + idx := setupIndex(t, ctrl, now, false) defer idx.Close() reQuery, err := m3ninxidx.NewRegexpQuery([]byte("name"), []byte("val.*")) @@ -366,7 +382,7 @@ func TestNamespaceIndexInsertAggregateQuery(t *testing.T) { defer ctx.Close() now := xtime.Now() - idx := setupIndex(t, ctrl, now) + idx := setupIndex(t, ctrl, now, true) defer idx.Close() reQuery, err := m3ninxidx.NewRegexpQuery([]byte("name"), []byte("val.*")) @@ -404,7 +420,7 @@ func TestNamespaceIndexInsertWideQuery(t *testing.T) { defer ctx.Close() now := xtime.Now() - idx := setupIndex(t, ctrl, now) + idx := setupIndex(t, ctrl, now, false) defer idx.Close() reQuery, err := m3ninxidx.NewRegexpQuery([]byte("name"), []byte("val.*")) @@ -451,7 +467,7 @@ func TestNamespaceIndexInsertWideQueryFilteredByShard(t *testing.T) { defer ctx.Close() now := xtime.Now() - idx := setupIndex(t, ctrl, now) + idx := 
setupIndex(t, ctrl, now, false) defer idx.Close() reQuery, err := m3ninxidx.NewRegexpQuery([]byte("name"), []byte("val.*")) diff --git a/src/dbnode/storage/index_test.go b/src/dbnode/storage/index_test.go index 91e4ea7339..9802a54d82 100644 --- a/src/dbnode/storage/index_test.go +++ b/src/dbnode/storage/index_test.go @@ -644,12 +644,15 @@ func TestNamespaceIndexFlushShardStateNotSuccess(t *testing.T) { mockShard := NewMockdatabaseShard(ctrl) mockShard.EXPECT().IsBootstrapped().Return(true).AnyTimes() mockShard.EXPECT().ID().Return(uint32(0)).AnyTimes() - mockShard.EXPECT().FlushState(gomock.Any()).Return(fileOpState{WarmStatus: fileOpFailed}, nil).AnyTimes() + mockShard.EXPECT().FlushState(gomock.Any()).Return(fileOpState{WarmStatus: warmStatus{ + IndexFlushed: fileOpFailed, + }}, nil).AnyTimes() shards := []databaseShard{mockShard} mockFlush := persist.NewMockIndexFlush(ctrl) - require.NoError(t, idx.WarmFlush(mockFlush, shards)) + err := idx.WarmFlush(mockFlush, shards) + require.NoError(t, err) } func TestNamespaceIndexQueryNoMatchingBlocks(t *testing.T) { @@ -789,7 +792,7 @@ func TestNamespaceIndexFlushSkipBootstrappingShards(t *testing.T) { mockBlock.EXPECT().StartTime().Return(blockTime).AnyTimes() mockBlock.EXPECT().EndTime().Return(blockTime.Add(test.indexBlockSize)).AnyTimes() mockBlock.EXPECT().NeedsColdMutableSegmentsEvicted().Return(true).AnyTimes() - mockBlock.EXPECT().RotateColdMutableSegments().Return().AnyTimes() + mockBlock.EXPECT().RotateColdMutableSegments().Return(nil).AnyTimes() mockBlock.EXPECT().EvictColdMutableSegments().Return(nil).AnyTimes() idx.state.blocksByTime[blockTime] = mockBlock @@ -811,7 +814,9 @@ func TestNamespaceIndexFlushSkipBootstrappingShards(t *testing.T) { mockShard.EXPECT().IsBootstrapped().Return(shardInfo.isBootstrapped).AnyTimes() mockShard.EXPECT().ID().Return(shardInfo.id).AnyTimes() if shardInfo.isBootstrapped { - mockShard.EXPECT().FlushState(gomock.Any()).Return(fileOpState{WarmStatus: fileOpSuccess}, 
nil).AnyTimes() + mockShard.EXPECT().FlushState(gomock.Any()).Return(fileOpState{WarmStatus: warmStatus{ + IndexFlushed: fileOpSuccess, + }}, nil).AnyTimes() } shards = append(shards, mockShard) } @@ -902,8 +907,14 @@ func verifyFlushForShards( for _, mockShard := range mockShards { mockShard.EXPECT().IsBootstrapped().Return(true) - mockShard.EXPECT().FlushState(blockStart).Return(fileOpState{WarmStatus: fileOpSuccess}, nil) - mockShard.EXPECT().FlushState(blockStart.Add(blockSize)).Return(fileOpState{WarmStatus: fileOpSuccess}, nil) + mockShard.EXPECT().FlushState(blockStart).Return(fileOpState{WarmStatus: warmStatus{ + // Index flushing requires data flush already happened. + DataFlushed: fileOpSuccess, + }}, nil) + mockShard.EXPECT().FlushState(blockStart.Add(blockSize)).Return(fileOpState{WarmStatus: warmStatus{ + // Index flushing requires data flush already happened. + DataFlushed: fileOpSuccess, + }}, nil) resultsTags1 := ident.NewTagsIterator(ident.NewTags()) resultsTags2 := ident.NewTagsIterator(ident.NewTags()) @@ -925,13 +936,19 @@ func verifyFlushForShards( mockShard.EXPECT().FetchBlocksMetadataV2(gomock.Any(), blockStart, blockStart.Add(idx.blockSize), gomock.Any(), gomock.Any(), block.FetchBlocksMetadataOptions{OnlyDisk: true}).Return(results, nil, nil) + + // For a given index block, which in this test is 2x the size of a block, we expect that + // we mark as flushed 2 blockStarts that fall within the index block. 
+ mockShard.EXPECT().MarkWarmIndexFlushStateSuccessOrError(blockStart, nil) + mockShard.EXPECT().MarkWarmIndexFlushStateSuccessOrError(blockStart.Add(blockSize), nil) } mockBlock.EXPECT().IsSealed().Return(true) mockBlock.EXPECT().AddResults(gomock.Any()).Return(nil) mockBlock.EXPECT().EvictMutableSegments().Return(nil) } - require.NoError(t, idx.WarmFlush(mockFlush, dbShards)) + err := idx.WarmFlush(mockFlush, dbShards) + require.NoError(t, err) require.Equal(t, numBlocks, persistClosedTimes) require.Equal(t, numBlocks, persistCalledTimes) require.Equal(t, expectedDocs, actualDocs) diff --git a/src/dbnode/storage/series/lookup/lookup_mock.go b/src/dbnode/storage/lookup_mock.go similarity index 95% rename from src/dbnode/storage/series/lookup/lookup_mock.go rename to src/dbnode/storage/lookup_mock.go index 77e76f9073..57f9dd3a25 100644 --- a/src/dbnode/storage/series/lookup/lookup_mock.go +++ b/src/dbnode/storage/lookup_mock.go @@ -1,5 +1,5 @@ // Code generated by MockGen. DO NOT EDIT. -// Source: github.com/m3db/m3/src/dbnode/storage/series/lookup (interfaces: IndexWriter) +// Source: github.com/m3db/m3/src/dbnode/storage (interfaces: IndexWriter) // Copyright (c) 2021 Uber Technologies, Inc. // @@ -21,8 +21,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -// Package lookup is a generated GoMock package. -package lookup +// Package storage is a generated GoMock package. 
+package storage import ( "reflect" diff --git a/src/dbnode/storage/namespace.go b/src/dbnode/storage/namespace.go index 0cd2a3b63b..3a57c4104f 100644 --- a/src/dbnode/storage/namespace.go +++ b/src/dbnode/storage/namespace.go @@ -1221,7 +1221,7 @@ func (n *dbNamespace) WarmFlush( return err } // skip flushing if the shard has already flushed data for the `blockStart` - if flushState.WarmStatus == fileOpSuccess { + if flushState.WarmStatus.DataFlushed == fileOpSuccess { continue } @@ -1499,7 +1499,7 @@ func (n *dbNamespace) needsFlushWithLock( if err != nil { return false, err } - if flushState.WarmStatus != fileOpSuccess { + if flushState.WarmStatus.DataFlushed != fileOpSuccess { return true, nil } } diff --git a/src/dbnode/storage/namespace_test.go b/src/dbnode/storage/namespace_test.go index 74abfcc679..435b7517c7 100644 --- a/src/dbnode/storage/namespace_test.go +++ b/src/dbnode/storage/namespace_test.go @@ -607,7 +607,8 @@ func testNamespaceBootstrapUnfulfilledShards( func TestNamespaceFlushNotBootstrapped(t *testing.T) { ns, closer := newTestNamespace(t) defer closer() - require.Equal(t, errNamespaceNotBootstrapped, ns.WarmFlush(xtime.Now(), nil)) + err := ns.WarmFlush(xtime.Now(), nil) + require.Equal(t, errNamespaceNotBootstrapped, err) require.Equal(t, errNamespaceNotBootstrapped, ns.ColdFlush(nil)) } @@ -617,7 +618,8 @@ func TestNamespaceFlushDontNeedFlush(t *testing.T) { defer close() ns.bootstrapState = Bootstrapped - require.NoError(t, ns.WarmFlush(xtime.Now(), nil)) + err := ns.WarmFlush(xtime.Now(), nil) + require.NoError(t, err) require.NoError(t, ns.ColdFlush(nil)) } @@ -627,7 +629,8 @@ func TestNamespaceSkipFlushIfReadOnly(t *testing.T) { ns.bootstrapState = Bootstrapped ns.SetReadOnly(true) - require.NoError(t, ns.WarmFlush(xtime.Now(), nil)) + err := ns.WarmFlush(xtime.Now(), nil) + require.NoError(t, err) require.NoError(t, ns.ColdFlush(nil)) } @@ -645,20 +648,21 @@ func TestNamespaceFlushSkipFlushed(t *testing.T) { blockStart := 
xtime.Now().Truncate(ns.Options().RetentionOptions().BlockSize()) states := []fileOpState{ - {WarmStatus: fileOpNotStarted}, - {WarmStatus: fileOpSuccess}, + {WarmStatus: warmStatus{DataFlushed: fileOpNotStarted}}, + {WarmStatus: warmStatus{DataFlushed: fileOpSuccess}}, } for i, s := range states { shard := NewMockdatabaseShard(ctrl) shard.EXPECT().IsBootstrapped().Return(true).AnyTimes() shard.EXPECT().FlushState(blockStart).Return(s, nil) - if s.WarmStatus != fileOpSuccess { + if s.WarmStatus.DataFlushed != fileOpSuccess { shard.EXPECT().WarmFlush(blockStart, gomock.Any(), gomock.Any()).Return(nil) } ns.shards[testShardIDs[i].ID()] = shard } - require.NoError(t, ns.WarmFlush(blockStart, nil)) + err := ns.WarmFlush(blockStart, nil) + require.NoError(t, err) } func TestNamespaceFlushSkipShardNotBootstrapped(t *testing.T) { @@ -679,7 +683,8 @@ func TestNamespaceFlushSkipShardNotBootstrapped(t *testing.T) { shard.EXPECT().IsBootstrapped().Return(false) ns.shards[testShardIDs[0].ID()] = shard - require.NoError(t, ns.WarmFlush(blockStart, nil)) + err := ns.WarmFlush(blockStart, nil) + require.NoError(t, err) require.NoError(t, ns.ColdFlush(nil)) } @@ -1007,11 +1012,15 @@ func setShardExpects(ns *dbNamespace, ctrl *gomock.Controller, cases []needsFlus for t, needFlush := range cs.needsFlush { if needFlush { shard.EXPECT().FlushState(t).Return(fileOpState{ - WarmStatus: fileOpNotStarted, + WarmStatus: warmStatus{ + DataFlushed: fileOpNotStarted, + }, }, nil).AnyTimes() } else { shard.EXPECT().FlushState(t).Return(fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, }, nil).AnyTimes() } } @@ -1140,7 +1149,9 @@ func TestNamespaceNeedsFlushAllSuccess(t *testing.T) { shard := NewMockdatabaseShard(ctrl) shard.EXPECT().ID().Return(s.ID()).AnyTimes() shard.EXPECT().FlushState(blockStart).Return(fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, }, nil).AnyTimes() 
ns.shards[s.ID()] = shard } @@ -1184,15 +1195,21 @@ func TestNamespaceNeedsFlushAnyFailed(t *testing.T) { switch shard.ID() { case shards[0].ID(): shard.EXPECT().FlushState(blockStart).Return(fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, }, nil).AnyTimes() case shards[1].ID(): shard.EXPECT().FlushState(blockStart).Return(fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, }, nil).AnyTimes() case shards[2].ID(): shard.EXPECT().FlushState(blockStart).Return(fileOpState{ - WarmStatus: fileOpFailed, + WarmStatus: warmStatus{ + DataFlushed: fileOpFailed, + }, NumFailures: 999, }, nil).AnyTimes() } @@ -1238,15 +1255,21 @@ func TestNamespaceNeedsFlushAnyNotStarted(t *testing.T) { switch shard.ID() { case shards[0].ID(): shard.EXPECT().FlushState(blockStart).Return(fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, }, nil).AnyTimes() case shards[1].ID(): shard.EXPECT().FlushState(blockStart).Return(fileOpState{ - WarmStatus: fileOpNotStarted, + WarmStatus: warmStatus{ + DataFlushed: fileOpNotStarted, + }, }, nil).AnyTimes() case shards[2].ID(): shard.EXPECT().FlushState(blockStart).Return(fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, }, nil).AnyTimes() } ns.shards[s.ID()] = shard diff --git a/src/dbnode/storage/series_resolver.go b/src/dbnode/storage/series_resolver.go index 8e3b2798c5..a425a2f180 100644 --- a/src/dbnode/storage/series_resolver.go +++ b/src/dbnode/storage/series_resolver.go @@ -25,12 +25,11 @@ import ( "sync" "github.com/m3db/m3/src/dbnode/storage/bootstrap" - "github.com/m3db/m3/src/dbnode/storage/series/lookup" "github.com/m3db/m3/src/x/ident" ) // retrieveWritableSeriesFn represents the function to retrieve series entry. 
-type retrieveWritableSeriesFn func(id ident.ID) (*lookup.Entry, error) +type retrieveWritableSeriesFn func(id ident.ID) (*Entry, error) type seriesResolver struct { sync.RWMutex @@ -41,7 +40,7 @@ type seriesResolver struct { resolved bool resolvedErr error - entry *lookup.Entry + entry *Entry } // NewSeriesResolver creates new series ref resolver. diff --git a/src/dbnode/storage/series_resolver_test.go b/src/dbnode/storage/series_resolver_test.go index b55e6581f6..99b9198b7a 100644 --- a/src/dbnode/storage/series_resolver_test.go +++ b/src/dbnode/storage/series_resolver_test.go @@ -27,14 +27,13 @@ import ( "github.com/stretchr/testify/require" - "github.com/m3db/m3/src/dbnode/storage/series/lookup" "github.com/m3db/m3/src/x/ident" ) func TestResolveError(t *testing.T) { wg := sync.WaitGroup{} id := ident.StringID("foo") - sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*lookup.Entry, error) { + sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*Entry, error) { return nil, fmt.Errorf("unable to resolve series") }) _, err := sut.SeriesRef() @@ -44,7 +43,7 @@ func TestResolveError(t *testing.T) { func TestResolveNilEntry(t *testing.T) { wg := sync.WaitGroup{} id := ident.StringID("foo") - sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*lookup.Entry, error) { + sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*Entry, error) { return nil, nil }) _, err := sut.SeriesRef() @@ -54,53 +53,53 @@ func TestResolveNilEntry(t *testing.T) { func TestResolve(t *testing.T) { wg := sync.WaitGroup{} id := ident.StringID("foo") - sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*lookup.Entry, error) { - return lookup.NewEntry(lookup.NewEntryOptions{ + sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*Entry, error) { + return NewEntry(NewEntryOptions{ Index: 11, }), nil }) seriesRef, err := sut.SeriesRef() require.NoError(t, err) - require.IsType(t, &lookup.Entry{}, seriesRef) - entry := seriesRef.(*lookup.Entry) + require.IsType(t, &Entry{}, seriesRef) + 
entry := seriesRef.(*Entry) require.Equal(t, uint64(11), entry.Index) } func TestSecondResolveWontWait(t *testing.T) { wg := sync.WaitGroup{} id := ident.StringID("foo") - sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*lookup.Entry, error) { - return lookup.NewEntry(lookup.NewEntryOptions{ + sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*Entry, error) { + return NewEntry(NewEntryOptions{ Index: 11, }), nil }) seriesRef, err := sut.SeriesRef() require.NoError(t, err) - require.IsType(t, &lookup.Entry{}, seriesRef) - entry := seriesRef.(*lookup.Entry) + require.IsType(t, &Entry{}, seriesRef) + entry := seriesRef.(*Entry) require.Equal(t, uint64(11), entry.Index) wg.Add(1) seriesRef2, err := sut.SeriesRef() require.NoError(t, err) - require.IsType(t, &lookup.Entry{}, seriesRef2) - entry2 := seriesRef2.(*lookup.Entry) + require.IsType(t, &Entry{}, seriesRef2) + entry2 := seriesRef2.(*Entry) require.Equal(t, entry, entry2) } func TestReleaseRef(t *testing.T) { wg := sync.WaitGroup{} id := ident.StringID("foo") - sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*lookup.Entry, error) { - entry := lookup.NewEntry(lookup.NewEntryOptions{}) + sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*Entry, error) { + entry := NewEntry(NewEntryOptions{}) entry.IncrementReaderWriterCount() return entry, nil }) seriesRef, err := sut.SeriesRef() require.NoError(t, err) - require.IsType(t, &lookup.Entry{}, seriesRef) + require.IsType(t, &Entry{}, seriesRef) - entry := seriesRef.(*lookup.Entry) + entry := seriesRef.(*Entry) require.Equal(t, int32(1), entry.ReaderWriterCount()) err = sut.ReleaseRef() require.NoError(t, err) @@ -110,7 +109,7 @@ func TestReleaseRef(t *testing.T) { func TestReleaseRefError(t *testing.T) { wg := sync.WaitGroup{} id := ident.StringID("foo") - sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*lookup.Entry, error) { + sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*Entry, error) { return nil, fmt.Errorf("unable to resolve series") 
}) err := sut.ReleaseRef() @@ -120,8 +119,8 @@ func TestReleaseRefError(t *testing.T) { func TestReleaseRefWithoutSeriesRef(t *testing.T) { wg := sync.WaitGroup{} id := ident.StringID("foo") - sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*lookup.Entry, error) { - entry := lookup.NewEntry(lookup.NewEntryOptions{}) + sut := NewSeriesResolver(&wg, id, func(id ident.ID) (*Entry, error) { + entry := NewEntry(NewEntryOptions{}) entry.IncrementReaderWriterCount() return entry, nil }) diff --git a/src/dbnode/storage/series_wired_list_interaction_test.go b/src/dbnode/storage/series_wired_list_interaction_test.go index 52a2b6ab04..ef242c7080 100644 --- a/src/dbnode/storage/series_wired_list_interaction_test.go +++ b/src/dbnode/storage/series_wired_list_interaction_test.go @@ -29,7 +29,6 @@ import ( "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/storage/block" "github.com/m3db/m3/src/dbnode/storage/series" - "github.com/m3db/m3/src/dbnode/storage/series/lookup" "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/x/clock" "github.com/m3db/m3/src/x/context" @@ -116,7 +115,7 @@ func TestSeriesWiredListConcurrentInteractions(t *testing.T) { require.NoError(t, err) shard.Lock() - shard.insertNewShardEntryWithLock(lookup.NewEntry(lookup.NewEntryOptions{ + shard.insertNewShardEntryWithLock(NewEntry(NewEntryOptions{ Series: seriesEntry, })) shard.Unlock() diff --git a/src/dbnode/storage/shard.go b/src/dbnode/storage/shard.go index c7d9c464c3..a41822d126 100644 --- a/src/dbnode/storage/shard.go +++ b/src/dbnode/storage/shard.go @@ -43,7 +43,6 @@ import ( "github.com/m3db/m3/src/dbnode/storage/index/convert" "github.com/m3db/m3/src/dbnode/storage/repair" "github.com/m3db/m3/src/dbnode/storage/series" - "github.com/m3db/m3/src/dbnode/storage/series/lookup" "github.com/m3db/m3/src/dbnode/tracepoint" "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/dbnode/ts/writes" @@ -152,6 +151,7 @@ type dbShard struct { ticking bool shard uint32 
coldWritesEnabled bool + indexEnabled bool } // NB(r): dbShardRuntimeOptions does not contain its own @@ -223,9 +223,9 @@ func newDatabaseShardMetrics(shardID uint32, scope tally.Scope) dbShardMetrics { } } -type dbShardEntryWorkFn func(entry *lookup.Entry) bool +type dbShardEntryWorkFn func(entry *Entry) bool -type dbShardEntryBatchWorkFn func(entries []*lookup.Entry) bool +type dbShardEntryBatchWorkFn func(entries []*Entry) bool type shardListElement *list.Element @@ -281,6 +281,7 @@ func newDatabaseShard( flushState: newShardFlushState(), tickWg: &sync.WaitGroup{}, coldWritesEnabled: namespaceMetadata.Options().ColdWritesEnabled(), + indexEnabled: namespaceMetadata.Options().IndexOptions().Enabled(), logger: opts.InstrumentOptions().Logger(), metrics: newDatabaseShardMetrics(shard, scope), tileAggregator: opts.TileAggregator(), @@ -376,7 +377,21 @@ func (s *dbShard) hasWarmFlushed(blockStart xtime.UnixNano) (bool, error) { if err != nil { return false, err } - return statusIsRetrievable(flushState.WarmStatus), nil + return s.warmStatusIsRetrievable(flushState.WarmStatus), nil +} + +func (s *dbShard) warmStatusIsRetrievable(status warmStatus) bool { + if !statusIsRetrievable(status.DataFlushed) { + return false + } + + // If the index is disabled, then we only are tracking data flushing. + // Otherwise, warm status requires both data and index flushed. 
+ if !s.indexEnabled { + return true + } + + return statusIsRetrievable(status.IndexFlushed) } func statusIsRetrievable(status fileOpStatus) bool { @@ -425,7 +440,7 @@ func (s *dbShard) blockStatesSnapshotWithRLock() series.ShardBlockStateSnapshot snapshot := make(map[xtime.UnixNano]series.BlockState, len(s.flushState.statesByTime)) for time, state := range s.flushState.statesByTime { snapshot[time] = series.BlockState{ - WarmRetrievable: statusIsRetrievable(state.WarmStatus), + WarmRetrievable: s.warmStatusIsRetrievable(state.WarmStatus), // Use ColdVersionRetrievable instead of ColdVersionFlushed since the snapshot // will be used to make eviction decisions and we don't want to evict data before // it is retrievable. @@ -446,7 +461,7 @@ func (s *dbShard) OnRetrieveBlock( nsCtx namespace.Context, ) { s.RLock() - entry, _, err := s.lookupEntryWithLock(id) + entry, err := s.lookupEntryWithLock(id) if entry != nil { entry.IncrementReaderWriterCount() defer entry.DecrementReaderWriterCount() @@ -500,7 +515,7 @@ func (s *dbShard) OnRetrieveBlock( func (s *dbShard) OnEvictedFromWiredList(id ident.ID, blockStart xtime.UnixNano) { s.RLock() - entry, _, err := s.lookupEntryWithLock(id) + entry, err := s.lookupEntryWithLock(id) s.RUnlock() if err != nil && err != errShardEntryNotFound { @@ -524,8 +539,8 @@ func (s *dbShard) OnEvictedFromWiredList(id ident.ID, blockStart xtime.UnixNano) entry.Series.OnEvictedFromWiredList(id, blockStart) } -func (s *dbShard) forEachShardEntry(entryFn dbShardEntryWorkFn) error { - return s.forEachShardEntryBatch(func(currEntries []*lookup.Entry) bool { +func (s *dbShard) forEachShardEntry(entryFn dbShardEntryWorkFn) { + s.forEachShardEntryBatch(func(currEntries []*Entry) bool { for _, entry := range currEntries { if continueForEach := entryFn(entry); !continueForEach { return false @@ -543,7 +558,7 @@ func iterateBatchSize(elemsLen int) int { return int(math.Max(shardIterateBatchMinSize, t)) } -func (s *dbShard) 
forEachShardEntryBatch(entriesBatchFn dbShardEntryBatchWorkFn) error { +func (s *dbShard) forEachShardEntryBatch(entriesBatchFn dbShardEntryBatchWorkFn) { // NB(r): consider using a lockless list for ticking. s.RLock() elemsLen := s.list.Len() @@ -554,11 +569,11 @@ func (s *dbShard) forEachShardEntryBatch(entriesBatchFn dbShardEntryBatchWorkFn) if e == nil { return } - e.Value.(*lookup.Entry).DecrementReaderWriterCount() + e.Value.(*Entry).DecrementReaderWriterCount() } var ( - currEntries = make([]*lookup.Entry, 0, batchSize) + currEntries = make([]*Entry, 0, batchSize) first = true nextElem *list.Element ) @@ -578,7 +593,7 @@ func (s *dbShard) forEachShardEntryBatch(entriesBatchFn dbShardEntryBatchWorkFn) elem := nextElem for ticked := 0; ticked < batchSize && elem != nil; ticked++ { nextElem = elem.Next() - entry := elem.Value.(*lookup.Entry) + entry := elem.Value.(*Entry) entry.IncrementReaderWriterCount() currEntries = append(currEntries, entry) elem = nextElem @@ -587,7 +602,7 @@ func (s *dbShard) forEachShardEntryBatch(entriesBatchFn dbShardEntryBatchWorkFn) // NB(prateek): inc a reference to the next element while we have a lock, // to guarantee the element pointer cannot be changed from under us. if nextElem != nil { - nextElem.Value.(*lookup.Entry).IncrementReaderWriterCount() + nextElem.Value.(*Entry).IncrementReaderWriterCount() } s.RUnlock() @@ -599,11 +614,9 @@ func (s *dbShard) forEachShardEntryBatch(entriesBatchFn dbShardEntryBatchWorkFn) currEntries = currEntries[:0] if !continueExecution { decRefElem(nextElem) - return nil + return } } - - return nil } func (s *dbShard) IsBootstrapped() bool { @@ -707,7 +720,7 @@ func (s *dbShard) tickAndExpire( terminatedTickingDueToClosing bool i int slept time.Duration - expired []*lookup.Entry + expired []*Entry ) s.RLock() tickSleepBatch := s.currRuntimeOptions.tickSleepSeriesBatchSize @@ -718,7 +731,7 @@ func (s *dbShard) tickAndExpire( // future read lock attempts. 
blockStates := s.blockStatesSnapshotWithRLock() s.RUnlock() - s.forEachShardEntryBatch(func(currEntries []*lookup.Entry) bool { + s.forEachShardEntryBatch(func(currEntries []*Entry) bool { // re-using `expired` to amortize allocs, still need to reset it // to be safe for re-use. for i := range expired { @@ -802,10 +815,17 @@ func (s *dbShard) tickAndExpire( // Currently, this function is only called by the lambda inside `tickAndExpire`'s `forEachShardEntryBatch` // call. This satisfies the contract of all entries it operating upon being guaranteed to have a // readerWriterEntryCount of at least 1, by virtue of the implementation of `forEachShardEntryBatch`. -func (s *dbShard) purgeExpiredSeries(expiredEntries []*lookup.Entry) { +func (s *dbShard) purgeExpiredSeries(expiredEntries []*Entry) { // Remove all expired series from lookup and list. s.Lock() for _, entry := range expiredEntries { + // Only purge series after they've been GCed from the index, so that these happen and in order + // and there is no raciness around GCing something from the index when the series has already + // been removed from memory. + if s.indexEnabled && !entry.IndexGarbageCollected.Load() { + continue + } + series := entry.Series id := series.ID() elem, exists := s.lookup.Get(id) @@ -833,6 +853,7 @@ func (s *dbShard) purgeExpiredSeries(expiredEntries []*lookup.Entry) { if !series.IsEmpty() { continue } + // NB(xichen): if we get here, we are guaranteed that there can be // no more reads/writes to this series while the lock is held, so it's // safe to remove it. 
@@ -882,7 +903,7 @@ func (s *dbShard) writeAndIndex( shouldReverseIndex bool, ) (SeriesWrite, error) { // Prepare write - entry, opts, err := s.tryRetrieveWritableSeries(id) + entry, opts, err := s.TryRetrieveSeriesAndIncrementReaderWriterCount(id) if err != nil { return SeriesWrite{}, err } @@ -890,7 +911,7 @@ func (s *dbShard) writeAndIndex( writable := entry != nil // If no entry and we are not writing new series asynchronously. - if !writable && !opts.writeNewSeriesAsync { + if !writable && !opts.WriteNewSeriesAsync { // Avoid double lookup by enqueueing insert immediately. result, err := s.insertSeriesAsyncBatched(id, tagResolver, dbShardInsertAsyncOptions{ hasPendingIndexing: shouldReverseIndex, @@ -942,7 +963,7 @@ func (s *dbShard) writeAndIndex( commitLogSeriesUniqueIndex = entry.Index if err == nil && shouldReverseIndex { if entry.NeedsIndexUpdate(s.reverseIndex.BlockStartForWriteTime(timestamp)) { - if !opts.writeNewSeriesAsync { + if !opts.WriteNewSeriesAsync { return SeriesWrite{}, fmt.Errorf("to index async need write new series to be enabled") } needsIndex = true @@ -981,7 +1002,7 @@ func (s *dbShard) writeAndIndex( } if shouldReverseIndex { - if !opts.writeNewSeriesAsync { + if !opts.WriteNewSeriesAsync { return SeriesWrite{}, fmt.Errorf("to index async need write new series to be enabled") } needsIndex = true @@ -1062,7 +1083,7 @@ func (s *dbShard) ReadEncoded( nsCtx namespace.Context, ) (series.BlockReaderIter, error) { s.RLock() - entry, _, err := s.lookupEntryWithLock(id) + entry, err := s.lookupEntryWithLock(id) if entry != nil { // NB(r): Ensure readers have consistent view of this series, do // not expire the series while being read from. @@ -1107,20 +1128,20 @@ func (s *dbShard) FetchWideEntry( } // lookupEntryWithLock returns the entry for a given id while holding a read lock or a write lock. 
-func (s *dbShard) lookupEntryWithLock(id ident.ID) (*lookup.Entry, *list.Element, error) { +func (s *dbShard) lookupEntryWithLock(id ident.ID) (*Entry, error) { if s.state != dbShardStateOpen { // NB(r): Return an invalid params error here so any upstream // callers will not retry this operation - return nil, nil, xerrors.NewInvalidParamsError(errShardNotOpen) + return nil, xerrors.NewInvalidParamsError(errShardNotOpen) } elem, exists := s.lookup.Get(id) if !exists { - return nil, nil, errShardEntryNotFound + return nil, errShardEntryNotFound } - return elem.Value.(*lookup.Entry), elem, nil + return elem.Value.(*Entry), nil } -func (s *dbShard) writableSeries(id ident.ID, tagResolver convert.TagMetadataResolver) (*lookup.Entry, error) { +func (s *dbShard) writableSeries(id ident.ID, tagResolver convert.TagMetadataResolver) (*Entry, error) { for { entry, err := s.retrieveWritableSeries(id) if entry != nil { @@ -1141,20 +1162,25 @@ func (s *dbShard) writableSeries(id ident.ID, tagResolver convert.TagMetadataRes } } -type writableSeriesOptions struct { - writeNewSeriesAsync bool +// WritableSeriesOptions defines writable series options. +type WritableSeriesOptions struct { + // WriteNewSeriesAsync specifies if the series should be async written. + WriteNewSeriesAsync bool } -func (s *dbShard) tryRetrieveWritableSeries(id ident.ID) ( - *lookup.Entry, - writableSeriesOptions, +// TryRetrieveSeriesAndIncrementReaderWriterCount attempts to retrieve a writable series. +// This increments the reader/writer count and so should be decremented when the series +// is no longer held. 
+func (s *dbShard) TryRetrieveSeriesAndIncrementReaderWriterCount(id ident.ID) ( + *Entry, + WritableSeriesOptions, error, ) { s.RLock() - opts := writableSeriesOptions{ - writeNewSeriesAsync: s.currRuntimeOptions.writeNewSeriesAsync, + opts := WritableSeriesOptions{ + WriteNewSeriesAsync: s.currRuntimeOptions.writeNewSeriesAsync, } - if entry, _, err := s.lookupEntryWithLock(id); err == nil { + if entry, err := s.lookupEntryWithLock(id); err == nil { entry.IncrementReaderWriterCount() s.RUnlock() return entry, opts, nil @@ -1166,15 +1192,15 @@ func (s *dbShard) tryRetrieveWritableSeries(id ident.ID) ( return nil, opts, nil } -func (s *dbShard) retrieveWritableSeries(id ident.ID) (*lookup.Entry, error) { - entry, _, err := s.tryRetrieveWritableSeries(id) +func (s *dbShard) retrieveWritableSeries(id ident.ID) (*Entry, error) { + entry, _, err := s.TryRetrieveSeriesAndIncrementReaderWriterCount(id) return entry, err } func (s *dbShard) newShardEntry( id ident.ID, tagResolver convert.TagMetadataResolver, -) (*lookup.Entry, error) { +) (*Entry, error) { // NB(r): As documented in storage/series.DatabaseSeries the series IDs // and metadata are garbage collected, hence we cast the ID to a BytesID // that can't be finalized. @@ -1210,7 +1236,8 @@ func (s *dbShard) newShardEntry( OnEvictedFromWiredList: s, Options: s.seriesOpts, }) - return lookup.NewEntry(lookup.NewEntryOptions{ + return NewEntry(NewEntryOptions{ + Shard: s, Series: newSeries, Index: uniqueIndex, IndexWriter: s.reverseIndex, @@ -1224,15 +1251,15 @@ type insertAsyncResult struct { // entry is not guaranteed to be the final entry // inserted into the shard map in case there is already // an existing entry waiting in the insert queue - entry *lookup.Entry + entry *Entry } func (s *dbShard) pendingIndexInsert( - entry *lookup.Entry, + entry *Entry, timestamp xtime.UnixNano, ) writes.PendingIndexInsert { // inc a ref on the entry to ensure it's valid until the queue acts upon it. 
- entry.OnIndexPrepare() + entry.OnIndexPrepare(s.reverseIndex.BlockStartForWriteTime(timestamp)) return writes.PendingIndexInsert{ Entry: index.WriteBatchEntry{ Timestamp: timestamp, @@ -1244,13 +1271,13 @@ func (s *dbShard) pendingIndexInsert( } func (s *dbShard) insertSeriesForIndexingAsyncBatched( - entry *lookup.Entry, + entry *Entry, timestamp xtime.UnixNano, async bool, ) error { indexBlockStart := s.reverseIndex.BlockStartForWriteTime(timestamp) // inc a ref on the entry to ensure it's valid until the queue acts upon it. - entry.OnIndexPrepare() + entry.OnIndexPrepare(indexBlockStart) wg, err := s.insertQueue.Insert(dbShardInsert{ entry: entry, opts: dbShardInsertAsyncOptions{ @@ -1328,7 +1355,7 @@ func (s *dbShard) insertSeriesSync( id ident.ID, tagResolver convert.TagMetadataResolver, opts insertSyncOptions, -) (*lookup.Entry, error) { +) (*Entry, error) { // NB(r): Create new shard entry outside of write lock to reduce // time using write lock. newEntry, err := s.newShardEntry(id, tagResolver) @@ -1351,7 +1378,7 @@ func (s *dbShard) insertSeriesSync( } }() - existingEntry, _, err := s.lookupEntryWithLock(id) + existingEntry, err := s.lookupEntryWithLock(id) if err != nil && err != errShardEntryNotFound { // Shard not taking inserts likely. return nil, err @@ -1393,7 +1420,7 @@ func (s *dbShard) insertSeriesSync( return newEntry, nil } -func (s *dbShard) insertNewShardEntryWithLock(entry *lookup.Entry) { +func (s *dbShard) insertNewShardEntryWithLock(entry *Entry) { // Set the lookup value, we use the copied ID and since it is GC'd // we explicitly set it with options to not copy the key and not to // finalize it. @@ -1436,7 +1463,7 @@ func (s *dbShard) insertSeriesBatch(inserts []dbShardInsert) error { // i.e. we don't have a ref on provided entry, so we check if between the operation being // enqueue in the shard insert queue, and this function executing, an entry was created // for the same ID. 
- entry, _, err := s.lookupEntryWithLock(inserts[i].entry.Series.ID()) + entry, err := s.lookupEntryWithLock(inserts[i].entry.Series.ID()) if entry != nil { // Already exists so update the entry we're pointed at for this insert. inserts[i].entry = entry @@ -1524,7 +1551,7 @@ func (s *dbShard) insertSeriesBatch(inserts []dbShardInsert) error { pendingIndex := inserts[i].opts.pendingIndex // increment the ref on the entry, as the original one was transferred to the // this method (insertSeriesBatch) via `releaseEntryRef` mechanism. - entry.OnIndexPrepare() + entry.OnIndexPrepare(s.reverseIndex.BlockStartForWriteTime(pendingIndex.timestamp)) writeBatchEntry := index.WriteBatchEntry{ Timestamp: pendingIndex.timestamp, @@ -1570,7 +1597,7 @@ func (s *dbShard) FetchBlocks( nsCtx namespace.Context, ) ([]block.FetchBlockResult, error) { s.RLock() - entry, _, err := s.lookupEntryWithLock(id) + entry, err := s.lookupEntryWithLock(id) if entry != nil { // NB(r): Ensure readers have consistent view of this series, do // not expire the series while being read from. @@ -1611,7 +1638,7 @@ func (s *dbShard) FetchBlocksForColdFlush( nsCtx namespace.Context, ) (block.FetchBlockResult, error) { s.RLock() - entry, _, err := s.lookupEntryWithLock(seriesID) + entry, err := s.lookupEntryWithLock(seriesID) s.RUnlock() if entry == nil || err != nil { return block.FetchBlockResult{}, err @@ -1634,7 +1661,7 @@ func (s *dbShard) fetchActiveBlocksMetadata( ) var loopErr error - s.forEachShardEntry(func(entry *lookup.Entry) bool { + s.forEachShardEntry(func(entry *Entry) bool { // Break out of the iteration loop once we've accumulated enough entries. 
if int64(len(res.Results())) >= limit { next := int64(entry.Index) @@ -1944,8 +1971,9 @@ func (s *dbShard) UpdateFlushStates() { info := result.Info at := xtime.UnixNano(info.BlockStart) currState := s.flushStateNoBootstrapCheck(at) - if currState.WarmStatus != fileOpSuccess { - s.markWarmFlushStateSuccess(at) + + if currState.WarmStatus.DataFlushed != fileOpSuccess { + s.markWarmDataFlushStateSuccess(at) } // Cold version needs to get bootstrapped so that the 1:1 relationship @@ -1960,6 +1988,27 @@ func (s *dbShard) UpdateFlushStates() { s.setFlushStateColdVersionFlushed(at, info.VolumeIndex) } } + + // Populate index flush state only if enabled. + if !s.indexEnabled { + return + } + + blockSize := s.namespace.Options().RetentionOptions().BlockSize() + indexBlockSize := s.namespace.Options().IndexOptions().BlockSize() + + indexFlushedBlockStarts := s.reverseIndex.WarmFlushBlockStarts() + for _, blockStart := range indexFlushedBlockStarts { + // Index block size is wider than data block size, so we want to set all data blockStarts + // within the range of a given index blockStart + blockEnd := blockStart.Add(indexBlockSize) + for at := blockStart; at < blockEnd; at = at.Add(blockSize) { + currState := s.flushStateNoBootstrapCheck(at) + if currState.WarmStatus.IndexFlushed != fileOpSuccess { + s.markWarmIndexFlushStateSuccess(at) + } + } + } } func (s *dbShard) Bootstrap( @@ -2000,7 +2049,7 @@ func (s *dbShard) Bootstrap( } // Move any bootstrap buffers into position for reading. - s.forEachShardEntry(func(entry *lookup.Entry) bool { + s.forEachShardEntry(func(entry *Entry) bool { if err := entry.Series.Bootstrap(nsCtx); err != nil { multiErr = multiErr.Add(err) } @@ -2076,7 +2125,7 @@ func (s *dbShard) loadBlock( ) // First lookup if series already exists. 
- entry, shardOpts, err := s.tryRetrieveWritableSeries(id) + entry, shardOpts, err := s.TryRetrieveSeriesAndIncrementReaderWriterCount(id) if err != nil && err != errShardEntryNotFound { return result, err } @@ -2132,7 +2181,7 @@ func (s *dbShard) loadBlock( if s.reverseIndex != nil && entry.NeedsIndexUpdate(s.reverseIndex.BlockStartForWriteTime(timestamp)) { err = s.insertSeriesForIndexingAsyncBatched(entry, timestamp, - shardOpts.writeNewSeriesAsync) + shardOpts.WriteNewSeriesAsync) if err != nil { return result, err } @@ -2190,14 +2239,14 @@ func (s *dbShard) WarmFlush( } prepared, err := flushPreparer.PrepareData(prepareOpts) if err != nil { - return s.markWarmFlushStateSuccessOrError(blockStart, err) + return err } var multiErr xerrors.MultiError flushCtx := s.contextPool.Get() // From pool so finalizers are from pool. flushResult := dbShardFlushResult{} - s.forEachShardEntry(func(entry *lookup.Entry) bool { + s.forEachShardEntry(func(entry *Entry) bool { curr := entry.Series // Use a temporary context here so the stream readers can be returned to // the pool after we finish fetching flushing the series. @@ -2224,7 +2273,7 @@ func (s *dbShard) WarmFlush( multiErr = multiErr.Add(err) } - return s.markWarmFlushStateSuccessOrError(blockStart, multiErr.FinalError()) + return s.markWarmDataFlushStateSuccessOrError(blockStart, multiErr.FinalError()) } func (s *dbShard) ColdFlush( @@ -2263,7 +2312,7 @@ func (s *dbShard) ColdFlush( ) // First, loop through all series to capture data on which blocks have dirty // series and add them to the resources for further processing. 
- s.forEachShardEntry(func(entry *lookup.Entry) bool { + s.forEachShardEntry(func(entry *Entry) bool { curr := entry.Series seriesMetadata := curr.Metadata() blockStarts := curr.ColdFlushBlockStarts(blockStatesSnapshot) @@ -2373,7 +2422,7 @@ func (s *dbShard) Snapshot( var needsSnapshot bool checkNeedsSnapshotTimer := s.metrics.snapshotCheckNeedsSnapshotLatency.Start() - s.forEachShardEntry(func(entry *lookup.Entry) bool { + s.forEachShardEntry(func(entry *Entry) bool { if !entry.Series.IsBufferEmptyAtBlockStart(blockStart) { needsSnapshot = true return false @@ -2413,7 +2462,7 @@ func (s *dbShard) Snapshot( stats series.SnapshotResultStats multiErr xerrors.MultiError ) - s.forEachShardEntry(func(entry *lookup.Entry) bool { + s.forEachShardEntry(func(entry *Entry) bool { series := entry.Series // Use a temporary context here so the stream readers can be returned to // pool after we finish fetching flushing the series @@ -2481,34 +2530,64 @@ func (s *dbShard) flushStateNoBootstrapCheck(blockStart xtime.UnixNano) fileOpSt func (s *dbShard) flushStateWithRLock(blockStart xtime.UnixNano) fileOpState { state, ok := s.flushState.statesByTime[blockStart] if !ok { - return fileOpState{WarmStatus: fileOpNotStarted} + return fileOpState{WarmStatus: warmStatus{ + DataFlushed: fileOpNotStarted, + IndexFlushed: fileOpNotStarted, + }} } return state } -func (s *dbShard) markWarmFlushStateSuccessOrError(blockStart xtime.UnixNano, err error) error { +func (s *dbShard) markWarmDataFlushStateSuccessOrError(blockStart xtime.UnixNano, err error) error { // Track flush state for block state if err == nil { - s.markWarmFlushStateSuccess(blockStart) + s.markWarmDataFlushStateSuccess(blockStart) } else { - s.markWarmFlushStateFail(blockStart) + s.markWarmDataFlushStateFail(blockStart) } return err } -func (s *dbShard) markWarmFlushStateSuccess(blockStart xtime.UnixNano) { +func (s *dbShard) markWarmDataFlushStateSuccess(blockStart xtime.UnixNano) { s.flushState.Lock() - 
s.flushState.statesByTime[blockStart] = - fileOpState{ - WarmStatus: fileOpSuccess, - } + state := s.flushState.statesByTime[blockStart] + state.WarmStatus.DataFlushed = fileOpSuccess + s.flushState.statesByTime[blockStart] = state + s.flushState.Unlock() +} + +func (s *dbShard) markWarmDataFlushStateFail(blockStart xtime.UnixNano) { + s.flushState.Lock() + state := s.flushState.statesByTime[blockStart] + state.WarmStatus.DataFlushed = fileOpFailed + state.NumFailures++ + s.flushState.statesByTime[blockStart] = state + s.flushState.Unlock() +} + +// MarkWarmIndexFlushStateSuccessOrError marks the blockStart as +// success or fail based on the provided err. +func (s *dbShard) MarkWarmIndexFlushStateSuccessOrError(blockStart xtime.UnixNano, err error) { + // Track flush state for block state + if err == nil { + s.markWarmIndexFlushStateSuccess(blockStart) + } else { + s.markWarmIndexFlushStateFail(blockStart) + } +} + +func (s *dbShard) markWarmIndexFlushStateSuccess(blockStart xtime.UnixNano) { + s.flushState.Lock() + state := s.flushState.statesByTime[blockStart] + state.WarmStatus.IndexFlushed = fileOpSuccess + s.flushState.statesByTime[blockStart] = state s.flushState.Unlock() } -func (s *dbShard) markWarmFlushStateFail(blockStart xtime.UnixNano) { +func (s *dbShard) markWarmIndexFlushStateFail(blockStart xtime.UnixNano) { s.flushState.Lock() state := s.flushState.statesByTime[blockStart] - state.WarmStatus = fileOpFailed + state.WarmStatus.IndexFlushed = fileOpFailed state.NumFailures++ s.flushState.statesByTime[blockStart] = state s.flushState.Unlock() @@ -2639,7 +2718,7 @@ func (s *dbShard) DocRef(id ident.ID) (doc.Metadata, bool, error) { s.RLock() defer s.RUnlock() - entry, _, err := s.lookupEntryWithLock(id) + entry, err := s.lookupEntryWithLock(id) if err == nil { return entry.Series.Metadata(), true, nil } @@ -2695,7 +2774,7 @@ func (s *dbShard) finishWriting( markWarmFlushStateSuccess bool, ) error { if markWarmFlushStateSuccess { - 
s.markWarmFlushStateSuccess(blockStart) + s.markWarmDataFlushStateSuccess(blockStart) } // After writing the full block successfully update the ColdVersionFlushed number. This will diff --git a/src/dbnode/storage/shard_foreachentry_prop_test.go b/src/dbnode/storage/shard_foreachentry_prop_test.go index b286eebb7e..bcbcb7863f 100644 --- a/src/dbnode/storage/shard_foreachentry_prop_test.go +++ b/src/dbnode/storage/shard_foreachentry_prop_test.go @@ -31,7 +31,6 @@ import ( "time" "github.com/m3db/m3/src/dbnode/namespace" - "github.com/m3db/m3/src/dbnode/storage/series/lookup" "github.com/m3db/m3/src/x/context" "github.com/m3db/m3/src/x/ident" xtime "github.com/m3db/m3/src/x/time" @@ -205,7 +204,7 @@ func shardEntriesAreEqual(shard *dbShard, expectedEntries []shardEntryState) err return fmt.Errorf("expected to have %d idx, but did not see anything", idx) } nextElem := elem.Next() - entry := elem.Value.(*lookup.Entry) + entry := elem.Value.(*Entry) if !entry.Series.ID().Equal(expectedEntry.id) { return fmt.Errorf("expected id: %s at %d, observed: %s", expectedEntry.id.String(), idx, entry.Series.ID().String()) @@ -253,7 +252,7 @@ func genBatchWorkFn() gopter.Gen { return gen.UInt8(). Map(func(n uint8) dbShardEntryBatchWorkFn { i := uint8(0) - return func([]*lookup.Entry) bool { + return func([]*Entry) bool { i++ return i < n } diff --git a/src/dbnode/storage/shard_index_test.go b/src/dbnode/storage/shard_index_test.go index 1a358ec9e3..583dd1d7dc 100644 --- a/src/dbnode/storage/shard_index_test.go +++ b/src/dbnode/storage/shard_index_test.go @@ -114,6 +114,11 @@ func TestShardAsyncInsertMarkIndexedForBlockStart(t *testing.T) { now := xtime.Now() nextWriteTime := now.Truncate(blockSize) idx := NewMockNamespaceIndex(ctrl) + idx.EXPECT().BlockStartForWriteTime(gomock.Any()). + DoAndReturn(func(t xtime.UnixNano) xtime.UnixNano { + return t.Truncate(blockSize) + }). 
+ AnyTimes() shard := testDatabaseShardWithIndexFn(t, opts, idx, false) shard.SetRuntimeOptions(runtime.NewOptions().SetWriteNewSeriesAsync(true)) defer shard.Close() @@ -135,7 +140,7 @@ func TestShardAsyncInsertMarkIndexedForBlockStart(t *testing.T) { start := time.Now() for time.Since(start) < 10*time.Second { - entry, _, err := shard.tryRetrieveWritableSeries(ident.StringID("foo")) + entry, _, err := shard.TryRetrieveSeriesAndIncrementReaderWriterCount(ident.StringID("foo")) require.NoError(t, err) if entry == nil { time.Sleep(10 * time.Millisecond) @@ -185,7 +190,7 @@ func TestShardAsyncIndexIfExpired(t *testing.T) { // make sure next block not marked as indexed start := time.Now() for time.Since(start) < 10*time.Second { - entry, _, err := shard.tryRetrieveWritableSeries(ident.StringID("foo")) + entry, _, err := shard.TryRetrieveSeriesAndIncrementReaderWriterCount(ident.StringID("foo")) require.NoError(t, err) if entry == nil { time.Sleep(10 * time.Millisecond) diff --git a/src/dbnode/storage/shard_insert_queue.go b/src/dbnode/storage/shard_insert_queue.go index 927f0868e8..6f93c05ade 100644 --- a/src/dbnode/storage/shard_insert_queue.go +++ b/src/dbnode/storage/shard_insert_queue.go @@ -29,7 +29,6 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/storage/series" - "github.com/m3db/m3/src/dbnode/storage/series/lookup" "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/x/checked" "github.com/m3db/m3/src/x/clock" @@ -335,7 +334,7 @@ type dbShardInsertsByCPUCore struct { } type dbShardInsert struct { - entry *lookup.Entry + entry *Entry opts dbShardInsertAsyncOptions } diff --git a/src/dbnode/storage/shard_insert_queue_test.go b/src/dbnode/storage/shard_insert_queue_test.go index 1190a80bb3..b11464c0ce 100644 --- a/src/dbnode/storage/shard_insert_queue_test.go +++ b/src/dbnode/storage/shard_insert_queue_test.go @@ -26,8 +26,6 @@ import ( "testing" "time" - 
"github.com/m3db/m3/src/dbnode/storage/series/lookup" - "github.com/fortytw2/leaktest" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -90,16 +88,16 @@ func TestShardInsertQueueBatchBackoff(t *testing.T) { }() // first insert - _, err := q.Insert(dbShardInsert{entry: &lookup.Entry{Index: 0}}) + _, err := q.Insert(dbShardInsert{entry: &Entry{Index: 0}}) require.NoError(t, err) // wait for first insert batch to complete insertWgs[0].Wait() // now next batch will need to wait as we haven't progressed time - _, err = q.Insert(dbShardInsert{entry: &lookup.Entry{Index: 1}}) + _, err = q.Insert(dbShardInsert{entry: &Entry{Index: 1}}) require.NoError(t, err) - _, err = q.Insert(dbShardInsert{entry: &lookup.Entry{Index: 2}}) + _, err = q.Insert(dbShardInsert{entry: &Entry{Index: 2}}) require.NoError(t, err) // allow first insert to finish @@ -112,7 +110,7 @@ func TestShardInsertQueueBatchBackoff(t *testing.T) { assert.Equal(t, 1, numSleeps) // insert third batch, will also need to wait - _, err = q.Insert(dbShardInsert{entry: &lookup.Entry{Index: 3}}) + _, err = q.Insert(dbShardInsert{entry: &Entry{Index: 3}}) require.NoError(t, err) // allow second batch to finish diff --git a/src/dbnode/storage/shard_ref_count_test.go b/src/dbnode/storage/shard_ref_count_test.go index 74cbffac9d..f0bfb49edc 100644 --- a/src/dbnode/storage/shard_ref_count_test.go +++ b/src/dbnode/storage/shard_ref_count_test.go @@ -90,7 +90,7 @@ func testShardWriteSyncRefCount(t *testing.T, opts Options) { // ensure all entries have no references left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() assert.NoError(t, err) assert.Equal(t, int32(0), entry.ReaderWriterCount(), id) @@ -114,7 +114,7 @@ func testShardWriteSyncRefCount(t *testing.T, opts Options) { // ensure all entries have no references left for _, id := range 
[]string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() assert.NoError(t, err) assert.Equal(t, int32(0), entry.ReaderWriterCount(), id) @@ -213,7 +213,7 @@ func testShardWriteTaggedSyncRefCount(t *testing.T, idx NamespaceIndex) { // ensure all entries have no references left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() assert.NoError(t, err) assert.Equal(t, int32(0), entry.ReaderWriterCount(), id) @@ -240,7 +240,7 @@ func testShardWriteTaggedSyncRefCount(t *testing.T, idx NamespaceIndex) { // ensure all entries have no references left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() assert.NoError(t, err) assert.Equal(t, int32(0), entry.ReaderWriterCount(), id) @@ -293,7 +293,7 @@ func TestShardWriteAsyncRefCount(t *testing.T) { // ensure all entries have no references left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() assert.NoError(t, err) assert.Equal(t, int32(0), entry.ReaderWriterCount(), id) @@ -317,7 +317,7 @@ func TestShardWriteAsyncRefCount(t *testing.T) { // ensure all entries have no references left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() assert.NoError(t, err) assert.Equal(t, int32(0), entry.ReaderWriterCount(), id) @@ -459,7 +459,7 @@ func testShardWriteTaggedAsyncRefCount(t *testing.T, idx 
NamespaceIndex, nowFn f // ensure all entries have no references left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() assert.NoError(t, err) assert.Equal(t, int32(0), entry.ReaderWriterCount(), id) @@ -486,7 +486,7 @@ func testShardWriteTaggedAsyncRefCount(t *testing.T, idx NamespaceIndex, nowFn f // ensure all entries have no references left for _, id := range []string{"foo", "bar", "baz"} { shard.Lock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID(id)) + entry, err := shard.lookupEntryWithLock(ident.StringID(id)) shard.Unlock() assert.NoError(t, err) assert.Equal(t, int32(0), entry.ReaderWriterCount(), id) diff --git a/src/dbnode/storage/shard_test.go b/src/dbnode/storage/shard_test.go index 93ab49f37b..2a7c1cd394 100644 --- a/src/dbnode/storage/shard_test.go +++ b/src/dbnode/storage/shard_test.go @@ -42,7 +42,6 @@ import ( "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" "github.com/m3db/m3/src/dbnode/storage/index/convert" "github.com/m3db/m3/src/dbnode/storage/series" - "github.com/m3db/m3/src/dbnode/storage/series/lookup" "github.com/m3db/m3/src/dbnode/ts" xmetrics "github.com/m3db/m3/src/dbnode/x/metrics" "github.com/m3db/m3/src/dbnode/x/xio" @@ -100,7 +99,7 @@ func addMockSeries(ctrl *gomock.Controller, shard *dbShard, id ident.ID, tags id series.EXPECT().ID().Return(id).AnyTimes() series.EXPECT().IsEmpty().Return(false).AnyTimes() shard.Lock() - shard.insertNewShardEntryWithLock(lookup.NewEntry(lookup.NewEntryOptions{ + shard.insertNewShardEntryWithLock(NewEntry(NewEntryOptions{ Series: series, Index: index, })) @@ -177,7 +176,9 @@ func TestShardFlushStateNotStarted(t *testing.T) { nsCtx := namespace.Context{ID: ident.StringID("foo")} s.Bootstrap(ctx, nsCtx) - notStarted := fileOpState{WarmStatus: fileOpNotStarted} + notStarted := fileOpState{WarmStatus: warmStatus{ + 
DataFlushed: fileOpNotStarted, + }} for st := earliest; !st.After(latest); st = st.Add(ropts.BlockSize()) { flushState, err := s.FlushState(earliest) require.NoError(t, err) @@ -215,7 +216,7 @@ func TestShardBootstrapWithFlushVersion(t *testing.T) { // Load the mock into the shard as an expected series so that we can assert // on the call to its Bootstrap() method below. - entry := lookup.NewEntry(lookup.NewEntryOptions{ + entry := NewEntry(NewEntryOptions{ Series: mockSeries, }) s.Lock() @@ -431,8 +432,10 @@ func TestShardFlushSeriesFlushError(t *testing.T) { s.Bootstrap(ctx, nsCtx) s.flushState.statesByTime[blockStart] = fileOpState{ - WarmStatus: fileOpFailed, - NumFailures: 1, + WarmStatus: warmStatus{ + DataFlushed: fileOpNotStarted, + }, + NumFailures: 0, } var closed bool @@ -464,12 +467,12 @@ func TestShardFlushSeriesFlushError(t *testing.T) { flushed[i] = struct{}{} }). Return(series.FlushOutcomeErr, expectedErr) - s.list.PushBack(lookup.NewEntry(lookup.NewEntryOptions{ + s.list.PushBack(NewEntry(NewEntryOptions{ Series: curr, })) } - err := s.WarmFlush(blockStart, flush, namespace.Context{}) + flushErr := s.WarmFlush(blockStart, flush, namespace.Context{}) require.Equal(t, len(flushed), 2) for i := 0; i < 2; i++ { @@ -478,14 +481,16 @@ func TestShardFlushSeriesFlushError(t *testing.T) { } require.True(t, closed) - require.NotNil(t, err) - require.Equal(t, "error bar", err.Error()) + require.NotNil(t, flushErr) + require.Equal(t, "error bar", flushErr.Error()) flushState, err := s.FlushState(blockStart) require.NoError(t, err) require.Equal(t, fileOpState{ - WarmStatus: fileOpFailed, - NumFailures: 2, + WarmStatus: warmStatus{ + DataFlushed: fileOpFailed, + }, + NumFailures: 1, }, flushState) } @@ -511,8 +516,10 @@ func TestShardFlushSeriesFlushSuccess(t *testing.T) { s.Bootstrap(ctx, nsCtx) s.flushState.statesByTime[blockStart] = fileOpState{ - WarmStatus: fileOpFailed, - NumFailures: 1, + WarmStatus: warmStatus{ + DataFlushed: fileOpNotStarted, + }, + 
NumFailures: 0, } var closed bool @@ -541,7 +548,7 @@ func TestShardFlushSeriesFlushSuccess(t *testing.T) { flushed[i] = struct{}{} }). Return(series.FlushOutcomeFlushedToDisk, nil) - s.list.PushBack(lookup.NewEntry(lookup.NewEntryOptions{ + s.list.PushBack(NewEntry(NewEntryOptions{ Series: curr, })) } @@ -557,10 +564,13 @@ func TestShardFlushSeriesFlushSuccess(t *testing.T) { require.True(t, closed) require.Nil(t, err) + // Index flush state is not yet updated since an explicit call to MarkWarmIndexFlushStateSuccessOrError is required. flushState, err := s.FlushState(blockStart) require.NoError(t, err) require.Equal(t, fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, ColdVersionRetrievable: 0, NumFailures: 0, }, flushState) @@ -621,13 +631,13 @@ func TestShardColdFlush(t *testing.T) { // happen after a successful warm flush because warm flushes currently don't // have merging logic. This means that all blocks except t7 should // successfully cold flush. - shard.markWarmFlushStateSuccess(t0) - shard.markWarmFlushStateSuccess(t1) - shard.markWarmFlushStateSuccess(t2) - shard.markWarmFlushStateSuccess(t3) - shard.markWarmFlushStateSuccess(t4) - shard.markWarmFlushStateSuccess(t5) - shard.markWarmFlushStateSuccess(t6) + shard.markWarmDataFlushStateSuccess(t0) + shard.markWarmDataFlushStateSuccess(t1) + shard.markWarmDataFlushStateSuccess(t2) + shard.markWarmDataFlushStateSuccess(t3) + shard.markWarmDataFlushStateSuccess(t4) + shard.markWarmDataFlushStateSuccess(t5) + shard.markWarmDataFlushStateSuccess(t6) dirtyData := []testDirtySeries{ {id: ident.StringID("id0"), dirtyTimes: []xtime.UnixNano{t0, t2, t3, t4}}, @@ -641,7 +651,7 @@ func TestShardColdFlush(t *testing.T) { curr.EXPECT().Metadata().Return(doc.Metadata{ID: ds.id.Bytes()}).AnyTimes() curr.EXPECT().ColdFlushBlockStarts(gomock.Any()).
Return(optimizedTimesFromTimes(ds.dirtyTimes)) - shard.list.PushBack(lookup.NewEntry(lookup.NewEntryOptions{ + shard.list.PushBack(NewEntry(NewEntryOptions{ Series: curr, })) } @@ -702,10 +712,10 @@ func TestShardColdFlushNoMergeIfNothingDirty(t *testing.T) { t1 := t0.Add(1 * blockSize) t2 := t0.Add(2 * blockSize) t3 := t0.Add(3 * blockSize) - shard.markWarmFlushStateSuccess(t0) - shard.markWarmFlushStateSuccess(t1) - shard.markWarmFlushStateSuccess(t2) - shard.markWarmFlushStateSuccess(t3) + shard.markWarmDataFlushStateSuccess(t0) + shard.markWarmDataFlushStateSuccess(t1) + shard.markWarmDataFlushStateSuccess(t2) + shard.markWarmDataFlushStateSuccess(t3) preparer := persist.NewMockFlushPreparer(ctrl) fsReader := fs.NewMockDataFileSetReader(ctrl) @@ -845,7 +855,7 @@ func TestShardSnapshotSeriesSnapshotSuccess(t *testing.T) { snapshotted[i] = struct{}{} }). Return(series.SnapshotResult{}, nil) - s.list.PushBack(lookup.NewEntry(lookup.NewEntryOptions{ + s.list.PushBack(NewEntry(NewEntryOptions{ Series: entry, })) } @@ -865,7 +875,7 @@ func addMockTestSeries(ctrl *gomock.Controller, shard *dbShard, id ident.ID) *se series := series.NewMockDatabaseSeries(ctrl) series.EXPECT().ID().AnyTimes().Return(id) shard.Lock() - shard.insertNewShardEntryWithLock(lookup.NewEntry(lookup.NewEntryOptions{ + shard.insertNewShardEntryWithLock(NewEntry(NewEntryOptions{ Series: series, })) shard.Unlock() @@ -883,7 +893,7 @@ func addTestSeriesWithCount(shard *dbShard, id ident.ID, count int32) series.Dat Options: shard.seriesOpts, }) shard.Lock() - entry := lookup.NewEntry(lookup.NewEntryOptions{ + entry := NewEntry(NewEntryOptions{ Series: seriesEntry, }) for i := int32(0); i < count; i++ { @@ -971,10 +981,14 @@ func TestShardTick(t *testing.T) { // Also check that it expires flush states by time shard.flushState.statesByTime[earliestFlush] = fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, } 
shard.flushState.statesByTime[beforeEarliestFlush] = fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, } assert.Equal(t, 2, len(shard.flushState.statesByTime)) @@ -1142,10 +1156,14 @@ func testShardWriteAsync(t *testing.T, writes []testWrite) { // Also check that it expires flush states by time shard.flushState.statesByTime[earliestFlush] = fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, } shard.flushState.statesByTime[beforeEarliestFlush] = fileOpState{ - WarmStatus: fileOpSuccess, + WarmStatus: warmStatus{ + DataFlushed: fileOpSuccess, + }, } assert.Equal(t, 2, len(shard.flushState.statesByTime)) @@ -1455,7 +1473,7 @@ func TestPurgeExpiredSeriesWriteAfterPurging(t *testing.T) { ctrl := xtest.NewController(t) defer ctrl.Finish() - var entry *lookup.Entry + var entry *Entry opts := DefaultTestOptions() shard := testDatabaseShard(t, opts) @@ -1488,7 +1506,7 @@ func TestForEachShardEntry(t *testing.T) { } count := 0 - entryFn := func(entry *lookup.Entry) bool { + entryFn := func(entry *Entry) bool { if entry.Series.ID().String() == "foo.8" { return false } @@ -1508,7 +1526,7 @@ func TestForEachShardEntry(t *testing.T) { // Ensure that reader writer count gets reset shard.RLock() for elem := shard.list.Front(); elem != nil; elem = elem.Next() { - entry := elem.Value.(*lookup.Entry) + entry := elem.Value.(*Entry) assert.Equal(t, int32(0), entry.ReaderWriterCount()) } shard.RUnlock() @@ -1639,8 +1657,8 @@ func TestShardFetchIndexChecksum(t *testing.T) { ropts := shard.seriesOpts.RetentionOptions() end := xtime.ToUnixNano(opts.ClockOptions().NowFn()()).Truncate(ropts.BlockSize()) start := end.Add(-2 * ropts.BlockSize()) - shard.markWarmFlushStateSuccess(start) - shard.markWarmFlushStateSuccess(start.Add(ropts.BlockSize())) + shard.markWarmDataFlushStateSuccess(start) + shard.markWarmDataFlushStateSuccess(start.Add(ropts.BlockSize())) retriever := 
block.NewMockDatabaseBlockRetriever(ctrl) shard.setBlockRetriever(retriever) @@ -1678,7 +1696,7 @@ func TestShardFetchIndexChecksum(t *testing.T) { time.Sleep(time.Second) shard.RLock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID("foo")) + entry, err := shard.lookupEntryWithLock(ident.StringID("foo")) shard.RUnlock() require.Equal(t, err, errShardEntryNotFound) @@ -1713,8 +1731,8 @@ func TestShardReadEncodedCachesSeriesWithRecentlyReadPolicy(t *testing.T) { ropts := shard.seriesOpts.RetentionOptions() end := xtime.ToUnixNano(opts.ClockOptions().NowFn()()).Truncate(ropts.BlockSize()) start := end.Add(-2 * ropts.BlockSize()) - shard.markWarmFlushStateSuccess(start) - shard.markWarmFlushStateSuccess(start.Add(ropts.BlockSize())) + shard.markWarmDataFlushStateSuccess(start) + shard.markWarmDataFlushStateSuccess(start.Add(ropts.BlockSize())) retriever := block.NewMockDatabaseBlockRetriever(ctrl) shard.setBlockRetriever(retriever) @@ -1772,7 +1790,7 @@ func TestShardReadEncodedCachesSeriesWithRecentlyReadPolicy(t *testing.T) { begin := time.Now() for time.Since(begin) < 10*time.Second { shard.RLock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID("foo")) + entry, err := shard.lookupEntryWithLock(ident.StringID("foo")) shard.RUnlock() if err == errShardEntryNotFound { time.Sleep(5 * time.Millisecond) @@ -1786,7 +1804,7 @@ func TestShardReadEncodedCachesSeriesWithRecentlyReadPolicy(t *testing.T) { } shard.RLock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID("foo")) + entry, err := shard.lookupEntryWithLock(ident.StringID("foo")) shard.RUnlock() require.NoError(t, err) require.NotNil(t, entry) @@ -1866,7 +1884,7 @@ func TestShardNewEntryDoesNotAlterIDOrTags(t *testing.T) { shard.insertNewShardEntryWithLock(entry) shard.Unlock() - entry, _, err = shard.tryRetrieveWritableSeries(seriesID) + entry, _, err = shard.TryRetrieveSeriesAndIncrementReaderWriterCount(seriesID) require.NoError(t, err) entryIDBytes := entry.Series.ID().Bytes() 
@@ -2002,7 +2020,7 @@ func TestSeriesRefResolver(t *testing.T) { // should return already inserted entry as series. resolverEntry, err := shard.SeriesRefResolver(seriesID, iter) require.NoError(t, err) - require.IsType(t, &lookup.Entry{}, resolverEntry) + require.IsType(t, &Entry{}, resolverEntry) refEntry, err := resolverEntry.SeriesRef() require.NoError(t, err) require.Equal(t, seriesRef, refEntry) diff --git a/src/dbnode/storage/storage_mock.go b/src/dbnode/storage/storage_mock.go index 256e576658..d1cd6433b5 100644 --- a/src/dbnode/storage/storage_mock.go +++ b/src/dbnode/storage/storage_mock.go @@ -1949,6 +1949,22 @@ func (mr *MockShardMockRecorder) OpenStreamingReader(blockStart interface{}) *go return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OpenStreamingReader", reflect.TypeOf((*MockShard)(nil).OpenStreamingReader), blockStart) } +// TryRetrieveSeriesAndIncrementReaderWriterCount mocks base method. +func (m *MockShard) TryRetrieveSeriesAndIncrementReaderWriterCount(id ident.ID) (*Entry, WritableSeriesOptions, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "TryRetrieveSeriesAndIncrementReaderWriterCount", id) + ret0, _ := ret[0].(*Entry) + ret1, _ := ret[1].(WritableSeriesOptions) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 +} + +// TryRetrieveSeriesAndIncrementReaderWriterCount indicates an expected call of TryRetrieveSeriesAndIncrementReaderWriterCount. +func (mr *MockShardMockRecorder) TryRetrieveSeriesAndIncrementReaderWriterCount(id interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TryRetrieveSeriesAndIncrementReaderWriterCount", reflect.TypeOf((*MockShard)(nil).TryRetrieveSeriesAndIncrementReaderWriterCount), id) +} + // MockdatabaseShard is a mock of databaseShard interface. type MockdatabaseShard struct { ctrl *gomock.Controller @@ -2221,6 +2237,18 @@ func (mr *MockdatabaseShardMockRecorder) LoadBlocks(series interface{}) *gomock. 
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LoadBlocks", reflect.TypeOf((*MockdatabaseShard)(nil).LoadBlocks), series) } +// MarkWarmIndexFlushStateSuccessOrError mocks base method. +func (m *MockdatabaseShard) MarkWarmIndexFlushStateSuccessOrError(blockStart time0.UnixNano, err error) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "MarkWarmIndexFlushStateSuccessOrError", blockStart, err) +} + +// MarkWarmIndexFlushStateSuccessOrError indicates an expected call of MarkWarmIndexFlushStateSuccessOrError. +func (mr *MockdatabaseShardMockRecorder) MarkWarmIndexFlushStateSuccessOrError(blockStart, err interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MarkWarmIndexFlushStateSuccessOrError", reflect.TypeOf((*MockdatabaseShard)(nil).MarkWarmIndexFlushStateSuccessOrError), blockStart, err) +} + // NumSeries mocks base method. func (m *MockdatabaseShard) NumSeries() int64 { m.ctrl.T.Helper() @@ -2351,6 +2379,22 @@ func (mr *MockdatabaseShardMockRecorder) Tick(c, startTime, nsCtx interface{}) * return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Tick", reflect.TypeOf((*MockdatabaseShard)(nil).Tick), c, startTime, nsCtx) } +// TryRetrieveSeriesAndIncrementReaderWriterCount mocks base method. +func (m *MockdatabaseShard) TryRetrieveSeriesAndIncrementReaderWriterCount(id ident.ID) (*Entry, WritableSeriesOptions, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "TryRetrieveSeriesAndIncrementReaderWriterCount", id) + ret0, _ := ret[0].(*Entry) + ret1, _ := ret[1].(WritableSeriesOptions) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 +} + +// TryRetrieveSeriesAndIncrementReaderWriterCount indicates an expected call of TryRetrieveSeriesAndIncrementReaderWriterCount. 
+func (mr *MockdatabaseShardMockRecorder) TryRetrieveSeriesAndIncrementReaderWriterCount(id interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TryRetrieveSeriesAndIncrementReaderWriterCount", reflect.TypeOf((*MockdatabaseShard)(nil).TryRetrieveSeriesAndIncrementReaderWriterCount), id) +} + // UpdateFlushStates mocks base method. func (m *MockdatabaseShard) UpdateFlushStates() { m.ctrl.T.Helper() @@ -2494,6 +2538,18 @@ func (mr *MockNamespaceIndexMockRecorder) AssignShardSet(shardSet interface{}) * return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AssignShardSet", reflect.TypeOf((*MockNamespaceIndex)(nil).AssignShardSet), shardSet) } +// BackgroundCompact mocks base method. +func (m *MockNamespaceIndex) BackgroundCompact() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "BackgroundCompact") +} + +// BackgroundCompact indicates an expected call of BackgroundCompact. +func (mr *MockNamespaceIndexMockRecorder) BackgroundCompact() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "BackgroundCompact", reflect.TypeOf((*MockNamespaceIndex)(nil).BackgroundCompact)) +} + // BlockForBlockStart mocks base method. func (m *MockNamespaceIndex) BlockForBlockStart(blockStart time0.UnixNano) (index.Block, error) { m.ctrl.T.Helper() @@ -2680,6 +2736,20 @@ func (mr *MockNamespaceIndexMockRecorder) WarmFlush(flush, shards interface{}) * return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WarmFlush", reflect.TypeOf((*MockNamespaceIndex)(nil).WarmFlush), flush, shards) } +// WarmFlushBlockStarts mocks base method. +func (m *MockNamespaceIndex) WarmFlushBlockStarts() []time0.UnixNano { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "WarmFlushBlockStarts") + ret0, _ := ret[0].([]time0.UnixNano) + return ret0 +} + +// WarmFlushBlockStarts indicates an expected call of WarmFlushBlockStarts. 
+func (mr *MockNamespaceIndexMockRecorder) WarmFlushBlockStarts() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WarmFlushBlockStarts", reflect.TypeOf((*MockNamespaceIndex)(nil).WarmFlushBlockStarts)) +} + // WideQuery mocks base method. func (m *MockNamespaceIndex) WideQuery(ctx context.Context, query index.Query, collector chan *ident.IDBatch, opts index.WideQueryOptions) error { m.ctrl.T.Helper() diff --git a/src/dbnode/storage/types.go b/src/dbnode/storage/types.go index 61317be6e4..3a7340a9a5 100644 --- a/src/dbnode/storage/types.go +++ b/src/dbnode/storage/types.go @@ -502,6 +502,11 @@ type Shard interface { // OpenStreamingDataReader creates and opens a streaming fs.DataFileSetReader // on the latest volume of the given block. OpenStreamingReader(blockStart xtime.UnixNano) (fs.DataFileSetReader, error) + + // TryRetrieveSeriesAndIncrementReaderWriterCount attempts to retrieve a writable series. + // This increments the reader/writer count and so should be decremented when the series + // is no longer held. + TryRetrieveSeriesAndIncrementReaderWriterCount(id ident.ID) (*Entry, WritableSeriesOptions, error) } type databaseShard interface { @@ -611,6 +616,10 @@ type databaseShard interface { nsCtx namespace.Context, ) error + // MarkWarmIndexFlushStateSuccessOrError marks the blockStart as + // success or fail based on the provided err. + MarkWarmIndexFlushStateSuccessOrError(blockStart xtime.UnixNano, err error) + // ColdFlush flushes the unflushed ColdWrites in this shard. ColdFlush( flush persist.FlushPreparer, @@ -760,6 +769,9 @@ type NamespaceIndex interface { shards []databaseShard, ) error + // WarmFlushBlockStarts returns all index blockStarts which have been flushed to disk. + WarmFlushBlockStarts() []xtime.UnixNano + // ColdFlush performs any cold flushes that the index has outstanding using // the owned shards of the database. 
Also returns a callback to be called when // cold flushing completes to perform houskeeping. @@ -768,6 +780,9 @@ type NamespaceIndex interface { // DebugMemorySegments allows for debugging memory segments. DebugMemorySegments(opts DebugMemorySegmentsOptions) error + // BackgroundCompact background compacts eligible segments. + BackgroundCompact() + // Close will release the index resources and close the index. Close() error } diff --git a/src/m3ninx/doc/doc_mock.go b/src/m3ninx/doc/doc_mock.go index a44db060d3..f085b4492e 100644 --- a/src/m3ninx/doc/doc_mock.go +++ b/src/m3ninx/doc/doc_mock.go @@ -27,6 +27,8 @@ package doc import ( "reflect" + "github.com/m3db/m3/src/x/time" + "github.com/golang/mock/gomock" ) @@ -280,3 +282,132 @@ func (mr *MockQueryDocIteratorMockRecorder) Next() *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Next", reflect.TypeOf((*MockQueryDocIterator)(nil).Next)) } + +// MockOnIndexSeries is a mock of OnIndexSeries interface. +type MockOnIndexSeries struct { + ctrl *gomock.Controller + recorder *MockOnIndexSeriesMockRecorder +} + +// MockOnIndexSeriesMockRecorder is the mock recorder for MockOnIndexSeries. +type MockOnIndexSeriesMockRecorder struct { + mock *MockOnIndexSeries +} + +// NewMockOnIndexSeries creates a new mock instance. +func NewMockOnIndexSeries(ctrl *gomock.Controller) *MockOnIndexSeries { + mock := &MockOnIndexSeries{ctrl: ctrl} + mock.recorder = &MockOnIndexSeriesMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockOnIndexSeries) EXPECT() *MockOnIndexSeriesMockRecorder { + return m.recorder +} + +// IfAlreadyIndexedMarkIndexSuccessAndFinalize mocks base method. 
+func (m *MockOnIndexSeries) IfAlreadyIndexedMarkIndexSuccessAndFinalize(blockStart time.UnixNano) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IfAlreadyIndexedMarkIndexSuccessAndFinalize", blockStart) + ret0, _ := ret[0].(bool) + return ret0 +} + +// IfAlreadyIndexedMarkIndexSuccessAndFinalize indicates an expected call of IfAlreadyIndexedMarkIndexSuccessAndFinalize. +func (mr *MockOnIndexSeriesMockRecorder) IfAlreadyIndexedMarkIndexSuccessAndFinalize(blockStart interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IfAlreadyIndexedMarkIndexSuccessAndFinalize", reflect.TypeOf((*MockOnIndexSeries)(nil).IfAlreadyIndexedMarkIndexSuccessAndFinalize), blockStart) +} + +// IndexedForBlockStart mocks base method. +func (m *MockOnIndexSeries) IndexedForBlockStart(blockStart time.UnixNano) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IndexedForBlockStart", blockStart) + ret0, _ := ret[0].(bool) + return ret0 +} + +// IndexedForBlockStart indicates an expected call of IndexedForBlockStart. +func (mr *MockOnIndexSeriesMockRecorder) IndexedForBlockStart(blockStart interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IndexedForBlockStart", reflect.TypeOf((*MockOnIndexSeries)(nil).IndexedForBlockStart), blockStart) +} + +// NeedsIndexGarbageCollected mocks base method. +func (m *MockOnIndexSeries) NeedsIndexGarbageCollected() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "NeedsIndexGarbageCollected") + ret0, _ := ret[0].(bool) + return ret0 +} + +// NeedsIndexGarbageCollected indicates an expected call of NeedsIndexGarbageCollected. +func (mr *MockOnIndexSeriesMockRecorder) NeedsIndexGarbageCollected() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NeedsIndexGarbageCollected", reflect.TypeOf((*MockOnIndexSeries)(nil).NeedsIndexGarbageCollected)) +} + +// NeedsIndexUpdate mocks base method. 
+func (m *MockOnIndexSeries) NeedsIndexUpdate(indexBlockStartForWrite time.UnixNano) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "NeedsIndexUpdate", indexBlockStartForWrite) + ret0, _ := ret[0].(bool) + return ret0 +} + +// NeedsIndexUpdate indicates an expected call of NeedsIndexUpdate. +func (mr *MockOnIndexSeriesMockRecorder) NeedsIndexUpdate(indexBlockStartForWrite interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NeedsIndexUpdate", reflect.TypeOf((*MockOnIndexSeries)(nil).NeedsIndexUpdate), indexBlockStartForWrite) +} + +// OnIndexFinalize mocks base method. +func (m *MockOnIndexSeries) OnIndexFinalize(blockStart time.UnixNano) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "OnIndexFinalize", blockStart) +} + +// OnIndexFinalize indicates an expected call of OnIndexFinalize. +func (mr *MockOnIndexSeriesMockRecorder) OnIndexFinalize(blockStart interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnIndexFinalize", reflect.TypeOf((*MockOnIndexSeries)(nil).OnIndexFinalize), blockStart) +} + +// OnIndexPrepare mocks base method. +func (m *MockOnIndexSeries) OnIndexPrepare(blockStart time.UnixNano) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "OnIndexPrepare", blockStart) +} + +// OnIndexPrepare indicates an expected call of OnIndexPrepare. +func (mr *MockOnIndexSeriesMockRecorder) OnIndexPrepare(blockStart interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnIndexPrepare", reflect.TypeOf((*MockOnIndexSeries)(nil).OnIndexPrepare), blockStart) +} + +// OnIndexSuccess mocks base method. +func (m *MockOnIndexSeries) OnIndexSuccess(blockStart time.UnixNano) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "OnIndexSuccess", blockStart) +} + +// OnIndexSuccess indicates an expected call of OnIndexSuccess. 
+func (mr *MockOnIndexSeriesMockRecorder) OnIndexSuccess(blockStart interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnIndexSuccess", reflect.TypeOf((*MockOnIndexSeries)(nil).OnIndexSuccess), blockStart) +} + +// TryMarkIndexGarbageCollected mocks base method. +func (m *MockOnIndexSeries) TryMarkIndexGarbageCollected() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "TryMarkIndexGarbageCollected") + ret0, _ := ret[0].(bool) + return ret0 +} + +// TryMarkIndexGarbageCollected indicates an expected call of TryMarkIndexGarbageCollected. +func (mr *MockOnIndexSeriesMockRecorder) TryMarkIndexGarbageCollected() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TryMarkIndexGarbageCollected", reflect.TypeOf((*MockOnIndexSeries)(nil).TryMarkIndexGarbageCollected)) +} diff --git a/src/m3ninx/doc/document.go b/src/m3ninx/doc/document.go index 4eb6bc0f9c..741c10efec 100644 --- a/src/m3ninx/doc/document.go +++ b/src/m3ninx/doc/document.go @@ -89,8 +89,9 @@ func (f Fields) shallowCopy() Fields { // Metadata represents a document to be indexed. type Metadata struct { - ID []byte - Fields []Field + ID []byte + Fields []Field + OnIndexSeries OnIndexSeries } // Get returns the value of the specified field name in the document if it exists. diff --git a/src/m3ninx/doc/types.go b/src/m3ninx/doc/types.go index 879172719f..7cd03d2514 100644 --- a/src/m3ninx/doc/types.go +++ b/src/m3ninx/doc/types.go @@ -20,6 +20,10 @@ package doc +import ( + xtime "github.com/m3db/m3/src/x/time" +) + // MetadataIterator provides an iterator over a collection of document metadata. It is NOT // safe for multiple goroutines to invoke methods on an MetadataIterator simultaneously. type MetadataIterator interface { @@ -72,3 +76,52 @@ type QueryDocIterator interface { // worker. 
Done() bool } + +// OnIndexSeries provides a set of callback hooks to allow the reverse index +// to do lifecycle management of any resources retained during indexing. +type OnIndexSeries interface { + // OnIndexSuccess is executed when an entry is successfully indexed. The + // provided value for `blockStart` is the blockStart for which the write + // was indexed. + OnIndexSuccess(blockStart xtime.UnixNano) + + // OnIndexFinalize is executed when the index no longer holds any references + // to the provided resources. It can be used to clean up any resources held + // during the course of indexing. `blockStart` is the startTime of the index + // block for which the write was attempted. + OnIndexFinalize(blockStart xtime.UnixNano) + + // OnIndexPrepare prepares the Entry to be handed off to the indexing sub-system. + // NB(prateek): we retain the ref count on the entry while the indexing is pending, + // the callback executed on the entry once the indexing is completed releases this + // reference. + OnIndexPrepare(blockStart xtime.UnixNano) + + // NeedsIndexUpdate returns a bool to indicate if the Entry needs to be indexed + // for the provided blockStart. It only allows a single index attempt at a time + // for a single entry. + // NB(prateek): NeedsIndexUpdate is a CAS, i.e. when this method returns true, it + // also sets state on the entry to indicate that a write for the given blockStart + // is going to be sent to the index, and other goroutines should not attempt the + // same write. Callers are expected to ensure they follow this guideline. + // Further, every call to NeedsIndexUpdate which returns true needs to have a corresponding + // OnIndexFinalize() call. This is required for correct lifecycle maintenance. + NeedsIndexUpdate(indexBlockStartForWrite xtime.UnixNano) bool + + // IfAlreadyIndexedMarkIndexSuccessAndFinalize checks if the blockStart has been indexed. + // If indexed, it will be marked as such and finalized, and then return true.
Otherwise false. + IfAlreadyIndexedMarkIndexSuccessAndFinalize( + blockStart xtime.UnixNano, + ) bool + + // TryMarkIndexGarbageCollected checks if the entry is eligible to be garbage collected + // from the index. If so, it marks the entry as GCed and returns true. Otherwise returns false. + TryMarkIndexGarbageCollected() bool + + // NeedsIndexGarbageCollected returns if the entry is eligible to be garbage collected + // from the index. + NeedsIndexGarbageCollected() bool + + // IndexedForBlockStart returns true if the blockStart has been indexed. + IndexedForBlockStart(blockStart xtime.UnixNano) bool +} diff --git a/src/m3ninx/index/segment/builder/multi_segments_builder.go b/src/m3ninx/index/segment/builder/multi_segments_builder.go index 975f267d91..bba9c5cfda 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_builder.go +++ b/src/m3ninx/index/segment/builder/multi_segments_builder.go @@ -21,9 +21,12 @@ package builder import ( + "fmt" "io" "sort" + "github.com/uber-go/tally" + "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" @@ -34,6 +37,8 @@ import ( type builderFromSegments struct { docs []doc.Metadata idSet *IDsMap + filter segment.DocumentsFilter + filterCount tally.Counter segments []segmentMetadata termsIter *termsIterFromSegments segmentsOffset postings.ID @@ -42,11 +47,15 @@ type builderFromSegments struct { type segmentMetadata struct { segment segment.Segment offset postings.ID - // duplicatesAsc is a lookup of document IDs are duplicates - // in this segment, that is documents that are already - // contained by other segments and hence should not be - // returned when looking up documents. 
- duplicatesAsc []postings.ID + // negativeOffsets is a lookup of document IDs that are duplicates or should be skipped, + // that is, documents that are already contained by other segments or should + // not be included in the output segment and hence should not be returned + // when looking up documents. If this is the case the offset is -1. + // If a document ID is not a duplicate or skipped then the offset is + // the shift that should be applied when translating this postings ID + // to the result postings ID. + negativeOffsets []int64 + skips int64 } // NewBuilderFromSegments returns a new builder from segments. @@ -74,13 +83,21 @@ func (b *builderFromSegments) Reset() { b.segmentsOffset = 0 var emptySegment segmentMetadata for i := range b.segments { + // Save the offsets array. + negativeOffsets := b.segments[i].negativeOffsets b.segments[i] = emptySegment + b.segments[i].negativeOffsets = negativeOffsets[:0] } b.segments = b.segments[:0] b.termsIter.clear() } +func (b *builderFromSegments) SetFilter(filter segment.DocumentsFilter, filterCount tally.Counter) { + b.filter = filter + b.filterCount = filterCount +} + func (b *builderFromSegments) AddSegments(segments []segment.Segment) error { // Order by largest -> smallest so that the first segment // is the largest when iterating over term postings lists @@ -113,14 +130,35 @@ func (b *builderFromSegments) AddSegments(segments []segment.Segment) error { return err } + var negativeOffsets []int64 + if n := len(b.segments); cap(b.segments) > n { + // Take the offsets from the element we're about to reuse.
+ negativeOffsets = b.segments[:n+1][n].negativeOffsets[:0] + } + if int64(cap(negativeOffsets)) < segment.Size() { + negativeOffsets = make([]int64, 0, int(1.5*float64(segment.Size()))) + } + var ( added int - duplicates []postings.ID + currOffset int64 ) for iter.Next() { d := iter.Current() + negativeOffsets = append(negativeOffsets, currOffset) if b.idSet.Contains(d.ID) { - duplicates = append(duplicates, iter.PostingsID()) + // Skip duplicates. + negativeOffsets[len(negativeOffsets)-1] = -1 + currOffset++ + continue + } + if b.filter != nil && !b.filter.Contains(d) { + // Actively filtering and ID is not contained. + negativeOffsets[len(negativeOffsets)-1] = -1 + currOffset++ + if b.filterCount != nil { + b.filterCount.Inc(1) + } continue } b.idSet.SetUnsafe(d.ID, struct{}{}, IDsMapSetUnsafeOptions{ @@ -136,15 +174,11 @@ func (b *builderFromSegments) AddSegments(segments []segment.Segment) error { return err } - // Sort duplicates in ascending order - sort.Slice(duplicates, func(i, j int) bool { - return duplicates[i] < duplicates[j] - }) - b.segments = append(b.segments, segmentMetadata{ - segment: segment, - offset: b.segmentsOffset, - duplicatesAsc: duplicates, + segment: segment, + offset: b.segmentsOffset, + negativeOffsets: negativeOffsets, + skips: currOffset, }) b.segmentsOffset += postings.ID(added) } @@ -155,6 +189,25 @@ func (b *builderFromSegments) AddSegments(segments []segment.Segment) error { return nil } +func (b *builderFromSegments) SegmentMetadatas() ([]segment.SegmentsBuilderSegmentMetadata, error) { + n := len(b.segments) + if n < 1 { + return nil, fmt.Errorf("segments empty: length=%d", n) + } + + result := make([]segment.SegmentsBuilderSegmentMetadata, 0, n) + for _, s := range b.segments { + result = append(result, segment.SegmentsBuilderSegmentMetadata{ + Segment: s.segment, + Offset: s.offset, + NegativeOffsets: s.negativeOffsets, + Skips: s.skips, + }) + } + + return result, nil +} + func (b *builderFromSegments) Docs() 
[]doc.Metadata { return b.docs } @@ -173,6 +226,10 @@ func (b *builderFromSegments) Metadata(id postings.ID) (doc.Metadata, error) { return b.docs[idx], nil } +func (b *builderFromSegments) NumDocs() (int, error) { + return len(b.docs), nil +} + func (b *builderFromSegments) FieldsIterable() segment.FieldsIterable { return b } diff --git a/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go b/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go index 206be79fd3..71085550ba 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go @@ -41,14 +41,12 @@ type multiKeyPostingsListIterator struct { currIters []keyIterator currReaders []index.Reader currFieldPostingsList postings.MutableList - bitmapIter *bitmap.Iterator } func newMultiKeyPostingsListIterator() *multiKeyPostingsListIterator { b := bitmap.NewBitmapWithDefaultPooling(defaultBitmapContainerPooling) i := &multiKeyPostingsListIterator{ currFieldPostingsList: roaring.NewPostingsListFromBitmap(b), - bitmapIter: &bitmap.Iterator{}, } i.reset() return i @@ -149,45 +147,54 @@ func (i *multiKeyPostingsListIterator) Next() bool { return false } - if fieldsKeyIter.segment.offset == 0 { + if fieldsKeyIter.segment.offset == 0 && fieldsKeyIter.segment.skips == 0 { // No offset, which means is first segment we are combining from - // so can just direct union - i.currFieldPostingsList.Union(pl) + // so can just direct union. + // Make sure skips is empty otherwise we need to do filtering. 
+ if err := i.currFieldPostingsList.Union(pl); err != nil { + i.err = err + return false + } continue } // We have to taken into account the offset and duplicates var ( - iter = i.bitmapIter - duplicates = fieldsKeyIter.segment.duplicatesAsc - negativeOffset postings.ID + iter = pl.Iterator() + negativeOffsets = fieldsKeyIter.segment.negativeOffsets + multiErr = xerrors.NewMultiError() ) - bitmap, ok := roaring.BitmapFromPostingsList(pl) - if !ok { - i.err = errPostingsListNotRoaring - return false - } - - iter.Reset(bitmap) - for v, eof := iter.Next(); !eof; v, eof = iter.Next() { - curr := postings.ID(v) - for len(duplicates) > 0 && curr > duplicates[0] { - duplicates = duplicates[1:] - negativeOffset++ - } - if len(duplicates) > 0 && curr == duplicates[0] { - duplicates = duplicates[1:] - negativeOffset++ - // Also skip this value, as itself is a duplicate + for iter.Next() { + curr := iter.Current() + negativeOffset := negativeOffsets[curr] + // Then skip the individual if matches. + if negativeOffset == -1 { + // Skip this value, as itself is a duplicate. continue } - value := curr + fieldsKeyIter.segment.offset - negativeOffset + value := curr + fieldsKeyIter.segment.offset - postings.ID(negativeOffset) if err := i.currFieldPostingsList.Insert(value); err != nil { - i.err = err + multiErr = multiErr.Add(err) + multiErr = multiErr.Add(iter.Close()) + i.err = multiErr.FinalError() return false } } + + multiErr = multiErr.Add(iter.Err()) + multiErr = multiErr.Add(iter.Close()) + i.err = multiErr.FinalError() + if i.err != nil { + return false + } + } + + if i.currFieldPostingsList.IsEmpty() { + // Everything skipped or term is empty. + // TODO: make this non-stack based (i.e. not recursive). 
+ return i.Next() } + return true } diff --git a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go index d27eb3ec8d..859040d0f4 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go @@ -21,8 +21,6 @@ package builder import ( - "errors" - "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" @@ -34,10 +32,6 @@ const ( defaultBitmapContainerPooling = 128 ) -var ( - errPostingsListNotRoaring = errors.New("postings list not a roaring postings list") -) - // Ensure for our use case that the terms iter from segments we return // matches the signature for the terms iterator. var _ segment.TermsIterator = &termsIterFromSegments{} @@ -141,44 +135,51 @@ func (i *termsIterFromSegments) Next() bool { termsKeyIter := iter.(*termsKeyIter) _, list := termsKeyIter.iter.Current() - if termsKeyIter.segment.offset == 0 { + if termsKeyIter.segment.offset == 0 && termsKeyIter.segment.skips == 0 { // No offset, which means is first segment we are combining from - // so can just direct union - i.currPostingsList.Union(list) + // so can just direct union. + if err := i.currPostingsList.Union(list); err != nil { + i.err = err + return false + } continue } - // We have to taken into account the offset and duplicates + // We have to take into account offset and duplicates/skips. 
var ( - iter = i.bitmapIter - duplicates = termsKeyIter.segment.duplicatesAsc - negativeOffset postings.ID + iter = list.Iterator() + negativeOffsets = termsKeyIter.segment.negativeOffsets + multiErr = xerrors.NewMultiError() ) - bitmap, ok := roaring.BitmapFromPostingsList(list) - if !ok { - i.err = errPostingsListNotRoaring - return false - } - - iter.Reset(bitmap) - for v, eof := iter.Next(); !eof; v, eof = iter.Next() { - curr := postings.ID(v) - for len(duplicates) > 0 && curr > duplicates[0] { - duplicates = duplicates[1:] - negativeOffset++ - } - if len(duplicates) > 0 && curr == duplicates[0] { - duplicates = duplicates[1:] - negativeOffset++ - // Also skip this value, as itself is a duplicate + for iter.Next() { + curr := iter.Current() + negativeOffset := negativeOffsets[curr] + // Then skip the individual if matches. + if negativeOffset == -1 { + // Skip this value, as itself is a duplicate. continue } - value := curr + termsKeyIter.segment.offset - negativeOffset + value := curr + termsKeyIter.segment.offset - postings.ID(negativeOffset) if err := i.currPostingsList.Insert(value); err != nil { - i.err = err + multiErr = multiErr.Add(err) + multiErr = multiErr.Add(iter.Close()) + i.err = multiErr.FinalError() return false } } + + multiErr = multiErr.Add(iter.Err()) + multiErr = multiErr.Add(iter.Close()) + i.err = multiErr.FinalError() + if i.err != nil { + return false + } + } + + if i.currPostingsList.IsEmpty() { + // Everything skipped or term is empty. + // TODO: make this non-stack based (i.e. not recursive). + return i.Next() } return true diff --git a/src/m3ninx/index/segment/segment_mock.go b/src/m3ninx/index/segment/segment_mock.go index c8f5b22217..37be204faa 100644 --- a/src/m3ninx/index/segment/segment_mock.go +++ b/src/m3ninx/index/segment/segment_mock.go @@ -32,6 +32,7 @@ import ( "github.com/m3db/m3/src/m3ninx/postings" "github.com/golang/mock/gomock" + "github.com/uber-go/tally" ) // MockSegment is a mock of Segment interface. 
@@ -1817,6 +1818,33 @@ func (mr *MockSegmentsBuilderMockRecorder) Reset() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Reset", reflect.TypeOf((*MockSegmentsBuilder)(nil).Reset)) } +// SegmentMetadatas mocks base method. +func (m *MockSegmentsBuilder) SegmentMetadatas() ([]SegmentsBuilderSegmentMetadata, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SegmentMetadatas") + ret0, _ := ret[0].([]SegmentsBuilderSegmentMetadata) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// SegmentMetadatas indicates an expected call of SegmentMetadatas. +func (mr *MockSegmentsBuilderMockRecorder) SegmentMetadatas() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentMetadatas", reflect.TypeOf((*MockSegmentsBuilder)(nil).SegmentMetadatas)) +} + +// SetFilter mocks base method. +func (m *MockSegmentsBuilder) SetFilter(keep DocumentsFilter, filterCount tally.Counter) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "SetFilter", keep, filterCount) +} + +// SetFilter indicates an expected call of SetFilter. +func (mr *MockSegmentsBuilderMockRecorder) SetFilter(keep, filterCount interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetFilter", reflect.TypeOf((*MockSegmentsBuilder)(nil).SetFilter), keep, filterCount) +} + // Terms mocks base method. func (m *MockSegmentsBuilder) Terms(field []byte) (TermsIterator, error) { m.ctrl.T.Helper() @@ -1831,3 +1859,40 @@ func (mr *MockSegmentsBuilderMockRecorder) Terms(field interface{}) *gomock.Call mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Terms", reflect.TypeOf((*MockSegmentsBuilder)(nil).Terms), field) } + +// MockDocumentsFilter is a mock of DocumentsFilter interface. +type MockDocumentsFilter struct { + ctrl *gomock.Controller + recorder *MockDocumentsFilterMockRecorder +} + +// MockDocumentsFilterMockRecorder is the mock recorder for MockDocumentsFilter. 
+type MockDocumentsFilterMockRecorder struct { + mock *MockDocumentsFilter +} + +// NewMockDocumentsFilter creates a new mock instance. +func NewMockDocumentsFilter(ctrl *gomock.Controller) *MockDocumentsFilter { + mock := &MockDocumentsFilter{ctrl: ctrl} + mock.recorder = &MockDocumentsFilterMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockDocumentsFilter) EXPECT() *MockDocumentsFilterMockRecorder { + return m.recorder +} + +// Contains mocks base method. +func (m *MockDocumentsFilter) Contains(d doc.Metadata) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Contains", d) + ret0, _ := ret[0].(bool) + return ret0 +} + +// Contains indicates an expected call of Contains. +func (mr *MockDocumentsFilterMockRecorder) Contains(d interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Contains", reflect.TypeOf((*MockDocumentsFilter)(nil).Contains), d) +} diff --git a/src/m3ninx/index/segment/types.go b/src/m3ninx/index/segment/types.go index 61ba1d3cd3..24cf76da35 100644 --- a/src/m3ninx/index/segment/types.go +++ b/src/m3ninx/index/segment/types.go @@ -23,6 +23,8 @@ package segment import ( "errors" + "github.com/uber-go/tally" + "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" @@ -220,6 +222,44 @@ type CloseableDocumentsBuilder interface { type SegmentsBuilder interface { Builder + // SetFilter sets a filter on which documents to retain + // when building the segment. + SetFilter(keep DocumentsFilter, filterCount tally.Counter) + // AddSegments adds segments to build from. AddSegments(segments []Segment) error + + // SegmentMetadatas returns the segment builder segment metadata. + SegmentMetadatas() ([]SegmentsBuilderSegmentMetadata, error) +} + +// SegmentsBuilderSegmentMetadata is a set of metadata about a segment +// that was used to build a compacted segment. 
+type SegmentsBuilderSegmentMetadata struct { + Segment Segment + Offset postings.ID + // NegativeOffsets is a lookup of document IDs that are duplicates or should be skipped, + // that is, documents that are already contained by other segments or should + // not be included in the output segment and hence should not be returned + // when looking up documents. If this is the case the offset is -1. + // If a document ID is not a duplicate or skipped then the offset is + // the shift that should be applied when translating this postings ID + // to the result postings ID. + NegativeOffsets []int64 + Skips int64 +} + +// DocumentsFilter selects which documents to retain when building a segment. +type DocumentsFilter interface { + Contains(d doc.Metadata) bool +} + +// DocumentsFilterFn implements DocumentsFilter. +type DocumentsFilterFn func(d doc.Metadata) bool + +var _ DocumentsFilter = DocumentsFilterFn(nil) + +// Contains implements the DocumentsFilter interface. +func (f DocumentsFilterFn) Contains(d doc.Metadata) bool { + return f(d) } diff --git a/src/query/README.md b/src/query/README.md index 41546b0896..0c18a57677 100644 --- a/src/query/README.md +++ b/src/query/README.md @@ -77,4 +77,4 @@ Setup and run Prometheus: - url: http://10.142.0.8:7201/api/v1/prom/remote/write ``` 3.
Run Prometheus - $ sudo docker run -p 9090:9090 -v $GOPATH/src/github.com/m3db/m3/src/query/docker/prometheus.yml:/etc/prometheus/prometheus.yml quay.io/prometheus/prometheus + $ sudo docker run -p 9090:9090 -v $GOPATH/src/github.com/m3db/m3/src/query/docker/prometheus.yml:/etc/prometheus/prometheus.yml quay.io/prometheus/prometheus \ No newline at end of file diff --git a/src/query/storage/m3/consolidators/convert.go b/src/query/storage/m3/consolidators/convert.go index 03eb39515b..d7f459dba3 100644 --- a/src/query/storage/m3/consolidators/convert.go +++ b/src/query/storage/m3/consolidators/convert.go @@ -45,3 +45,18 @@ func FromIdentTagIteratorToTags( return tags, nil } + +// MustIdentTagIteratorToTags converts ident tags to coordinator tags, panicking if the conversion fails. +func MustIdentTagIteratorToTags( + identTags ident.TagIterator, + tagOptions models.TagOptions, +) models.Tags { + if tagOptions == nil { + tagOptions = models.NewTagOptions() + } + tags, err := FromIdentTagIteratorToTags(identTags, tagOptions) + if err != nil { + panic(err) + } + return tags +}