Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[dbnode] Streaming writer #2618

Merged
merged 13 commits into from
Sep 18, 2020
9 changes: 7 additions & 2 deletions src/dbnode/persist/fs/read_write_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ import (
"testing"
"time"

"github.com/m3db/bloom/v4"
"github.com/m3db/m3/src/dbnode/digest"
"github.com/m3db/m3/src/dbnode/persist"
"github.com/m3db/m3/src/x/checked"
"github.com/m3db/m3/src/x/ident"
xtime "github.com/m3db/m3/src/x/time"

"github.com/m3db/bloom/v4"
"github.com/pborman/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -219,8 +219,13 @@ func readTestDataWithStreamingOpt(
assert.NoError(t, err)
// Make sure the bloom filter doesn't always return true
assert.False(t, bloomFilter.Test([]byte("some_random_data")))

expectedEntries := uint(len(entries))
if expectedEntries == 0 {
expectedEntries = 1
}
expectedM, expectedK := bloom.EstimateFalsePositiveRate(
uint(len(entries)), defaultIndexBloomFilterFalsePositivePercent)
expectedEntries, defaultIndexBloomFilterFalsePositivePercent)
assert.Equal(t, expectedK, bloomFilter.K())
// EstimateFalsePositiveRate always returns at least 1, so skip this check
// if len entries is 0
Expand Down
236 changes: 236 additions & 0 deletions src/dbnode/persist/fs/streaming_write.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
// Copyright (c) 2020 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package fs

import (
"bytes"
"fmt"
"math"
"time"

"github.com/m3db/m3/src/dbnode/persist"
"github.com/m3db/m3/src/dbnode/ts"
"github.com/m3db/m3/src/x/ident"

"github.com/m3db/bloom/v4"
)

// StreamingWriter writes into data fileset without intermediate buffering.
// Writes must be lexicographically ordered by the id.
type StreamingWriter interface {
	// Open opens the files for writing.
	Open() error
	// WriteAll writes the id, encoded tags, all data segments and the data
	// checksum for a single series. Ids must be strictly increasing in
	// lexicographic order across calls.
	WriteAll(id ident.BytesID, encodedTags ts.EncodedTags, data [][]byte, dataChecksum uint32) error
	// Close finalizes the fileset (bloom filter, info and checkpoint files)
	// and closes the underlying files.
	Close() error
}

// StreamingWriterOptions is the options for the StreamingWriter.
type StreamingWriterOptions struct {
	// Options is the general fileset options used to construct the
	// underlying writer.
	Options Options
	// NamespaceID identifies the namespace being written.
	NamespaceID ident.ID
	// ShardID identifies the shard being written.
	ShardID uint32
	// BlockStart is the start time of the block being flushed.
	BlockStart time.Time
	// BlockSize is the duration of the block being flushed.
	BlockSize time.Duration
	// VolumeIndex is the volume of the fileset being written.
	VolumeIndex int
	// PlannedRecordsCount is the expected number of series to be written,
	// used to size the bloom filter and the summaries sampling rate.
	PlannedRecordsCount uint
}

// streamingWriter implements StreamingWriter on top of the regular fileset
// writer, maintaining streaming state (previous id, index offsets, bloom
// filter and summary sampling) between WriteAll calls.
type streamingWriter struct {
	opts       StreamingWriterOptions
	writer     *writer
	writerOpts DataWriterOpenOptions
	// currIdx is the index assigned to the next series with data.
	currIdx int64
	// prevIDBytes is the last written id, used to enforce ordering.
	prevIDBytes []byte
	// summaryEvery controls how often a summaries entry is written
	// (one summary per summaryEvery index entries).
	summaryEvery int64
	bloomFilter  *bloom.BloomFilter
	// indexOffset is the running byte offset into the index file.
	indexOffset int64
	// summaries counts the summaries entries written so far.
	summaries int
}

// NewStreamingWriter creates a new writer for writing data filesets without
// intermediate buffering. Ids passed to WriteAll must be lexicographically
// ordered.
func NewStreamingWriter(opts StreamingWriterOptions) (StreamingWriter, error) {
	w, err := NewWriter(opts.Options)
	if err != nil {
		return nil, err
	}

	writerOpts := DataWriterOpenOptions{
		BlockSize: opts.BlockSize,
		Identifier: FileSetFileIdentifier{
			Namespace:   opts.NamespaceID,
			Shard:       opts.ShardID,
			BlockStart:  opts.BlockStart,
			VolumeIndex: opts.VolumeIndex,
		},
		FileSetType: persist.FileSetFlushType,
	}

	// EstimateFalsePositiveRate expects a positive entry count, so size the
	// bloom filter for at least one record.
	plannedRecordsCount := opts.PlannedRecordsCount
	if plannedRecordsCount == 0 {
		plannedRecordsCount = 1
	}
	m, k := bloom.EstimateFalsePositiveRate(
		plannedRecordsCount,
		opts.Options.IndexBloomFilterFalsePositivePercent(),
	)
	bloomFilter := bloom.NewBloomFilter(m, k)

	// Write a summary for roughly IndexSummariesPercent of the index entries.
	// Keep summaryEvery at a minimum of 1: a zero value would make the modulo
	// in writeIndexRelated panic with an integer divide by zero (it can end up
	// zero when PlannedRecordsCount or IndexSummariesPercent is zero).
	summaryEvery := int64(1)
	summariesApprox := float64(opts.PlannedRecordsCount) * opts.Options.IndexSummariesPercent()
	if summariesApprox > 0 {
		if every := math.Floor(float64(opts.PlannedRecordsCount) / summariesApprox); every > 1 {
			summaryEvery = int64(every)
		}
	}

	return &streamingWriter{
		opts:         opts,
		writer:       w.(*writer),
		writerOpts:   writerOpts,
		summaryEvery: summaryEvery,
		bloomFilter:  bloomFilter,
	}, nil
}

// Open opens the underlying fileset for writing and resets the streaming
// state carried over from any previous volume so the writer can be reused.
func (w *streamingWriter) Open() error {
	err := w.writer.Open(w.writerOpts)
	if err != nil {
		return err
	}

	// Reset per-volume streaming state.
	w.indexOffset = 0
	w.summaries = 0
	w.prevIDBytes = nil

	return nil
}

// WriteAll writes the data segments, checksum and index-related entries for
// a single series. Ids must arrive in strictly increasing lexicographic
// order (no duplicates).
func (w *streamingWriter) WriteAll(
	id ident.BytesID,
	encodedTags ts.EncodedTags,
	data [][]byte,
	dataChecksum uint32,
) error {
	// A nil check (rather than a length check) keeps the empty-string id
	// writable as the very first entry.
	if w.prevIDBytes != nil && bytes.Compare(id, w.prevIDBytes) <= 0 {
		return fmt.Errorf("ids must be written in lexicographic order, no duplicates, but got %s followed by %s", w.prevIDBytes, id)
	}
	w.prevIDBytes = append(w.prevIDBytes[:0], id...)

	entry, ok, err := w.writeData(data, dataChecksum)
	if err != nil {
		return err
	}
	if !ok {
		// Series had no data; nothing to index.
		return nil
	}

	return w.writeIndexRelated(id, encodedTags, entry)
}

// writeData appends every data segment of a single series to the data file
// and returns the corresponding index entry. The boolean result is false
// when the series carried no bytes at all, in which case nothing is written
// and no index slot is consumed.
func (w *streamingWriter) writeData(
	data [][]byte,
	dataChecksum uint32,
) (indexEntry, bool, error) {
	var totalSize int64
	for _, segment := range data {
		totalSize += int64(len(segment))
	}
	if totalSize == 0 {
		return indexEntry{}, false, nil
	}

	entry := indexEntry{
		index:          w.currIdx,
		dataFileOffset: w.writer.currOffset,
		size:           uint32(totalSize),
		dataChecksum:   dataChecksum,
	}

	for _, segment := range data {
		err := w.writer.writeData(segment)
		if err != nil {
			return indexEntry{}, false, err
		}
	}

	w.currIdx++

	return entry, true, nil
}

// writeIndexRelated adds the id to the bloom filter, writes the index entry
// and, for a sampled subset of entries, a summaries entry.
func (w *streamingWriter) writeIndexRelated(
	id ident.BytesID,
	encodedTags []byte,
	entry indexEntry,
) error {
	// Add to the bloom filter, note this must be zero alloc or else this will
	// cause heavy GC churn as we flush millions of series at end of each
	// time window
	w.bloomFilter.Add(id)

	// Guard the modulo: summaryEvery can be zero (e.g. when
	// PlannedRecordsCount or IndexSummariesPercent is zero), and an
	// unguarded entry.index%w.summaryEvery would panic with an integer
	// divide by zero. Zero means "summarize every entry".
	writeSummary := w.summaryEvery == 0 || entry.index%w.summaryEvery == 0

	if writeSummary {
		// Capture the offset for when we write this summary back, only capture
		// for every summary we'll actually write to avoid a few memcopies
		entry.indexFileOffset = w.indexOffset
	}

	length, err := w.writer.writeIndexWithEncodedTags(id, encodedTags, entry)
	if err != nil {
		return err
	}
	w.indexOffset += length

	if writeSummary {
		if err := w.writer.writeSummariesEntry(id, entry); err != nil {
			return err
		}
		w.summaries++
	}

	return nil
}

// Close writes the bloom filter, info and checkpoint files, then closes the
// underlying fileset files. The checkpoint file is only written when every
// prior step succeeded.
func (w *streamingWriter) Close() error {
	w.prevIDBytes = nil

	// Flush the bloom filter bitset to its file.
	if err := w.writer.writeBloomFilterFileContents(w.bloomFilter); err != nil {
		return err
	}

	// Flush the info file describing this fileset volume.
	if err := w.writer.writeInfoFileContents(w.bloomFilter, w.summaries, w.currIdx); err != nil {
		return err
	}

	if err := w.writer.closeWOIndex(); err != nil {
		w.writer.err = err
		return err
	}

	// NB(xichen): only write out the checkpoint file if there are no errors
	// encountered between calling writer.Open() and writer.Close().
	err := writeCheckpointFile(
		w.writer.checkpointFilePath,
		w.writer.digestFdWithDigestContents.Digest().Sum32(),
		w.writer.digestBuf,
		w.writer.newFileMode,
	)
	if err != nil {
		w.writer.err = err
		return err
	}

	return nil
}
Loading