From 31db35ed68103563d1eeb8f0e5ad6238380bd270 Mon Sep 17 00:00:00 2001
From: Justin Chan
Date: Mon, 10 Jun 2019 12:26:12 -0400
Subject: [PATCH 01/12] Support for numbered filesets [wip]

---
 src/dbnode/persist/fs/files.go           | 139 +++++++++---
 src/dbnode/persist/fs/files_test.go      |  72 +++++-
 src/dbnode/persist/fs/persist_manager.go |  14 +-
 src/dbnode/persist/fs/read.go            |  37 ++--
 src/dbnode/persist/fs/retriever.go       |  11 +-
 src/dbnode/persist/fs/seek.go            |  13 +-
 src/dbnode/persist/fs/seek_manager.go    | 265 ++++++++++++++---------
 src/dbnode/persist/fs/types.go           |  21 +-
 src/dbnode/persist/fs/write.go           |  39 ++--
 src/dbnode/storage/namespace_readers.go  |  14 +-
 src/dbnode/storage/shard.go              |   4 +-
 11 files changed, 424 insertions(+), 205 deletions(-)

diff --git a/src/dbnode/persist/fs/files.go b/src/dbnode/persist/fs/files.go
index ca7abf4935..541d53238e 100644
--- a/src/dbnode/persist/fs/files.go
+++ b/src/dbnode/persist/fs/files.go
@@ -58,6 +58,10 @@ const (
 	snapshotDirName   = "snapshots"
 	commitLogsDirName = "commitlogs"
 
+	// Filesets that are not indexed should be ordered before 0-indexed fileset
+	// files.
+	unindexedFilesetIndex = -1
+
 	commitLogComponentPosition    = 2
 	indexFileSetComponentPosition = 2
 
@@ -174,7 +178,7 @@ func (f FileSetFilesSlice) Filepaths() []string {
 }
 
 // LatestVolumeForBlock returns the latest (highest index) FileSetFile in the
-// slice for a given block start, only applicable for index and snapshot file set files.
+// slice for a given block start; applicable for data, index and snapshot file set files.
 func (f FileSetFilesSlice) LatestVolumeForBlock(blockStart time.Time) (FileSetFile, bool) {
 	// Make sure we're already sorted
 	f.sortByTimeAndVolumeIndexAscending()
@@ -357,9 +361,25 @@ func (a commitlogsByTimeAndIndexAscending) Less(i, j int) bool {
 	return ti.Equal(tj) && ii < ij
 }
 
-// fileSetFilesByTimeAndIndexAscending sorts file sets files by their block start times and volume
-// index in ascending order. If the files do not have block start times or indexes in their names,
-// the result is undefined.
+// dataFileSetFilesByTimeAndVolumeIndexAscending sorts fileset files by their
+// block start times and volume index in ascending order. If the files do not
+// have block start times or indexes in their names, the result is undefined.
+type dataFileSetFilesByTimeAndVolumeIndexAscending []string
+
+func (a dataFileSetFilesByTimeAndVolumeIndexAscending) Len() int      { return len(a) }
+func (a dataFileSetFilesByTimeAndVolumeIndexAscending) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a dataFileSetFilesByTimeAndVolumeIndexAscending) Less(i, j int) bool {
+	ti, ii, _ := TimeAndVolumeIndexFromDataFileSetFilename(a[i])
+	tj, ij, _ := TimeAndVolumeIndexFromDataFileSetFilename(a[j])
+	if ti.Before(tj) {
+		return true
+	}
+	return ti.Equal(tj) && ii < ij
+}
+
+// fileSetFilesByTimeAndVolumeIndexAscending sorts fileset files by their
+// block start times and volume index in ascending order. If the files do not
+// have block start times or indexes in their names, the result is undefined.
 type fileSetFilesByTimeAndVolumeIndexAscending []string
 
 func (a fileSetFilesByTimeAndVolumeIndexAscending) Len() int { return len(a) }
@@ -378,8 +398,8 @@ func componentsAndTimeFromFileName(fname string) ([]string, time.Time, error) {
 	if len(components) < 3 {
 		return nil, timeZero, fmt.Errorf("unexpected file name %s", fname)
 	}
-	str := strings.Replace(components[1], fileSuffix, "", 1)
-	nanoseconds, err := strconv.ParseInt(str, 10, 64)
+
+	nanoseconds, err := strconv.ParseInt(components[1], 10, 64)
 	if err != nil {
 		return nil, timeZero, err
 	}
@@ -392,12 +412,42 @@ func TimeFromFileName(fname string) (time.Time, error) {
 	return t, err
 }
 
-// TimeAndIndexFromCommitlogFilename extracts the block start and index from file name for a commitlog.
+// TimeAndIndexFromCommitlogFilename extracts the block start and index from
+// the file name of a commitlog.
 func TimeAndIndexFromCommitlogFilename(fname string) (time.Time, int, error) {
 	return timeAndIndexFromFileName(fname, commitLogComponentPosition)
 }
 
-// TimeAndVolumeIndexFromFileSetFilename extracts the block start and volume index from file name.
+// TimeAndVolumeIndexFromDataFileSetFilename extracts the block start and volume
+// index from a data fileset file name that may or may not have an index. If the
+// file name does not include an index, unindexedFilesetIndex is returned as the
+// volume index.
+func TimeAndVolumeIndexFromDataFileSetFilename(fname string) (time.Time, int, error) {
+	components, t, err := componentsAndTimeFromFileName(fname)
+	if err != nil {
+		return timeZero, 0, err
+	}
+
+	if len(components) == 3 {
+		// Three components mean that there is no index in the filename.
+		return t, unindexedFilesetIndex, nil
+	}
+
+	if len(components) == 4 {
+		// Four components mean that there is an index in the filename.
+		str := strings.Replace(components[indexFileSetComponentPosition], fileSuffix, "", 1)
+		index, err := strconv.ParseInt(str, 10, 64)
+		if err != nil {
+			return timeZero, 0, err
+		}
+		return t, int(index), nil
+	}
+
+	return timeZero, 0, fmt.Errorf("malformed filename: %s", fname)
+}
+
+// TimeAndVolumeIndexFromFileSetFilename extracts the block start and
+// volume index from an index file name.
 func TimeAndVolumeIndexFromFileSetFilename(fname string) (time.Time, int, error) {
 	return timeAndIndexFromFileName(fname, indexFileSetComponentPosition)
 }
@@ -556,9 +606,9 @@ func forEachInfoFile(
 		case persist.FileSetFlushType:
 			switch args.contentType {
 			case persist.FileSetDataContentType:
-				checkpointFilePath = filesetPathFromTime(dir, t, checkpointFileSuffix)
-				digestsFilePath = filesetPathFromTime(dir, t, digestFileSuffix)
-				infoFilePath = filesetPathFromTime(dir, t, infoFileSuffix)
+				checkpointFilePath = filesetPathFromTimeAndIndex(dir, t, volume, checkpointFileSuffix)
+				digestsFilePath = filesetPathFromTimeAndIndex(dir, t, volume, digestFileSuffix)
+				infoFilePath = filesetPathFromTimeAndIndex(dir, t, volume, infoFileSuffix)
 			case persist.FileSetIndexContentType:
 				checkpointFilePath = filesetPathFromTimeAndIndex(dir, t, volume, checkpointFileSuffix)
 				digestsFilePath = filesetPathFromTimeAndIndex(dir, t, volume, digestFileSuffix)
@@ -793,7 +843,20 @@ func SortedSnapshotMetadataFiles(opts Options) (
 	return metadatas, errorsWithPaths, nil
 }
 
-// SnapshotFiles returns a slice of all the names for all the fileset files
+// DataFiles returns a slice of all the names for all the fileset files
+// for a given namespace and shard combination.
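+//
+// A minimal usage sketch (the prefix, namespace ID, and shard below are
+// hypothetical; blockStart is some block-aligned time.Time):
+//
+//	files, err := DataFiles("/var/lib/m3db", ident.StringID("metrics"), 7)
+//	if err != nil {
+//		return err
+//	}
+//	if latest, ok := files.LatestVolumeForBlock(blockStart); ok {
+//		fmt.Println(latest.ID.VolumeIndex) // highest volume flushed for blockStart
+//	}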
+func DataFiles(filePathPrefix string, namespace ident.ID, shard uint32) (FileSetFilesSlice, error) { + return filesetFiles(filesetFilesSelector{ + fileSetType: persist.FileSetFlushType, + contentType: persist.FileSetDataContentType, + filePathPrefix: filePathPrefix, + namespace: namespace, + shard: shard, + pattern: filesetFilePattern, + }) +} + +// SnapshotFiles returns a slice of all the names for all the snapshot files // for a given namespace and shard combination. func SnapshotFiles(filePathPrefix string, namespace ident.ID, shard uint32) (FileSetFilesSlice, error) { return filesetFiles(filesetFilesSelector{ @@ -818,8 +881,8 @@ func IndexSnapshotFiles(filePathPrefix string, namespace ident.ID) (FileSetFiles }) } -// FileSetAt returns a FileSetFile for the given namespace/shard/blockStart combination if it exists. -func FileSetAt(filePathPrefix string, namespace ident.ID, shard uint32, blockStart time.Time) (FileSetFile, bool, error) { +// FileSetAt returns a FileSetFile for the given namespace/shard/blockStart/volume combination if it exists. +func FileSetAt(filePathPrefix string, namespace ident.ID, shard uint32, blockStart time.Time, volume int) (FileSetFile, bool, error) { matched, err := filesetFiles(filesetFilesSelector{ fileSetType: persist.FileSetFlushType, contentType: persist.FileSetDataContentType, @@ -832,18 +895,9 @@ func FileSetAt(filePathPrefix string, namespace ident.ID, shard uint32, blockSta return FileSetFile{}, false, err } - matched.sortByTimeAscending() + matched.sortByTimeAndVolumeIndexAscending() for i, fileset := range matched { if fileset.ID.BlockStart.Equal(blockStart) { - nextIdx := i + 1 - if nextIdx < len(matched) && matched[nextIdx].ID.BlockStart.Equal(blockStart) { - // Should never happen - return FileSetFile{}, false, fmt.Errorf( - "found multiple fileset files for blockStart: %d", - blockStart.Unix(), - ) - } - if !fileset.HasCompleteCheckpointFile() { continue } @@ -883,14 +937,14 @@ func IndexFileSetsAt(filePathPrefix string, namespace ident.ID, blockStart time. return filesets, nil } -// DeleteFileSetAt deletes a FileSetFile for a given namespace/shard/blockStart combination if it exists. -func DeleteFileSetAt(filePathPrefix string, namespace ident.ID, shard uint32, t time.Time) error { - fileset, ok, err := FileSetAt(filePathPrefix, namespace, shard, t) +// DeleteFileSetAt deletes a FileSetFile for a given namespace/shard/blockStart/volume combination if it exists. 
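+//
+// For example, with hypothetical arguments, deleting only volume 1 of a block
+// while leaving other volumes for the same block start untouched:
+//
+//	err := DeleteFileSetAt(filePathPrefix, nsID, shard, blockStart, 1)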
+func DeleteFileSetAt(filePathPrefix string, namespace ident.ID, shard uint32, blockStart time.Time, volume int) error { + fileset, ok, err := FileSetAt(filePathPrefix, namespace, shard, blockStart, volume) if err != nil { return err } if !ok { - return fmt.Errorf("fileset for blockStart: %d does not exist", t.Unix()) + return fmt.Errorf("fileset for blockStart: %d does not exist", blockStart.Unix()) } return DeleteFiles(fileset.AbsoluteFilepaths) @@ -1011,7 +1065,7 @@ func filesetFiles(args filesetFilesSelector) (FileSetFilesSlice, error) { case persist.FileSetDataContentType: dir := ShardDataDirPath(args.filePathPrefix, args.namespace, args.shard) byTimeAsc, err = findFiles(dir, args.pattern, func(files []string) sort.Interface { - return byTimeAscending(files) + return dataFileSetFilesByTimeAndVolumeIndexAscending(files) }) case persist.FileSetIndexContentType: dir := NamespaceIndexDataDirPath(args.filePathPrefix, args.namespace) @@ -1061,7 +1115,7 @@ func filesetFiles(args filesetFilesSelector) (FileSetFilesSlice, error) { case persist.FileSetFlushType: switch args.contentType { case persist.FileSetDataContentType: - currentFileBlockStart, err = TimeFromFileName(file) + currentFileBlockStart, volumeIndex, err = TimeAndVolumeIndexFromDataFileSetFilename(file) case persist.FileSetIndexContentType: currentFileBlockStart, volumeIndex, err = TimeAndVolumeIndexFromFileSetFilename(file) default: @@ -1233,6 +1287,23 @@ func SnapshotFileSetExistsAt(prefix string, namespace ident.ID, shard uint32, bl return latest.HasCompleteCheckpointFile(), nil } +// NextDataFileSetVolumeIndex returns the next data file set index for a +// given namespace/blockStart combination. +func NextDataFileSetVolumeIndex(filePathPrefix string, namespace ident.ID, shard uint32, blockStart time.Time) (int, error) { + files, err := DataFiles(filePathPrefix, namespace, shard) + if err != nil { + return 0, err + } + + latestFile, ok := files.LatestVolumeForBlock(blockStart) + if !ok { + // There isn't a fileset for this block yet; start at 0. + return 0, nil + } + + return latestFile.ID.VolumeIndex + 1, nil +} + // NextSnapshotMetadataFileIndex returns the next snapshot metadata file index. 
func NextSnapshotMetadataFileIndex(opts Options) (int64, error) { // We can ignore any SnapshotMetadataErrorsWithpaths that are returned because even if a corrupt @@ -1380,7 +1451,13 @@ func filesetPathFromTime(prefix string, t time.Time, suffix string) string { } func filesetPathFromTimeAndIndex(prefix string, t time.Time, index int, suffix string) string { - return path.Join(prefix, filesetFileForTime(t, fmt.Sprintf("%d%s%s", index, separator, suffix))) + var newSuffix string + if index == unindexedFilesetIndex { + newSuffix = suffix + } else { + newSuffix = fmt.Sprintf("%d%s%s", index, separator, suffix) + } + return path.Join(prefix, filesetFileForTime(t, newSuffix)) } func filesetIndexSegmentFileSuffixFromTime( diff --git a/src/dbnode/persist/fs/files_test.go b/src/dbnode/persist/fs/files_test.go index e16eeef31a..ac8ca63159 100644 --- a/src/dbnode/persist/fs/files_test.go +++ b/src/dbnode/persist/fs/files_test.go @@ -34,9 +34,9 @@ import ( "time" "github.com/m3db/m3/src/dbnode/digest" + "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/retention" - "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/x/ident" "github.com/m3db/m3/src/x/instrument" @@ -212,7 +212,7 @@ func TestForEachInfoFile(t *testing.T) { require.Equal(t, infoData, res) } -func TestTimeFromName(t *testing.T) { +func TestTimeFromFileName(t *testing.T) { _, err := TimeFromFileName("foo/bar") require.Error(t, err) require.Equal(t, "unexpected file name foo/bar", err.Error()) @@ -225,6 +225,11 @@ func TestTimeFromName(t *testing.T) { require.Equal(t, expected, v) require.NoError(t, err) + v, err = TimeFromFileName("foo-12345-6-bar.db") + expected = time.Unix(0, 12345) + require.Equal(t, expected, v) + require.NoError(t, err) + v, err = TimeFromFileName("foo/bar/foo-21234567890-bar.db") expected = time.Unix(0, 21234567890) require.Equal(t, expected, v) @@ -283,6 +288,41 @@ func TestTimeAndVolumeIndexFromFileSetFilename(t *testing.T) { require.Equal(t, filesetPathFromTimeAndIndex("foo/bar", exp.t, exp.i, "data"), validName) } +func TestTimeAndVolumeIndexFromDataFileSetFilename(t *testing.T) { + _, _, err := TimeAndVolumeIndexFromDataFileSetFilename("foo/bar") + require.Error(t, err) + require.Equal(t, "unexpected file name foo/bar", err.Error()) + + _, _, err = TimeAndVolumeIndexFromDataFileSetFilename("foo/bar-baz") + require.Error(t, err) + + type expected struct { + t time.Time + i int + } + ts, i, err := TimeAndVolumeIndexFromDataFileSetFilename("foo-1-0-data.db") + exp := expected{time.Unix(0, 1), 0} + require.Equal(t, exp.t, ts) + require.Equal(t, exp.i, i) + require.NoError(t, err) + + validName := "foo/bar/fileset-21234567890-1-data.db" + ts, i, err = TimeAndVolumeIndexFromDataFileSetFilename(validName) + exp = expected{time.Unix(0, 21234567890), 1} + require.Equal(t, exp.t, ts) + require.Equal(t, exp.i, i) + require.NoError(t, err) + require.Equal(t, filesetPathFromTimeAndIndex("foo/bar", exp.t, exp.i, "data"), validName) + + unindexedName := "foo/bar/fileset-21234567890-data.db" + ts, i, err = TimeAndVolumeIndexFromDataFileSetFilename(unindexedName) + exp = expected{time.Unix(0, 21234567890), -1} + require.Equal(t, exp.t, ts) + require.Equal(t, exp.i, i) + require.NoError(t, err) + require.Equal(t, filesetPathFromTimeAndIndex("foo/bar", exp.t, exp.i, "data"), unindexedName) +} + func TestSnapshotMetadataFilePathFromIdentifierRoundTrip(t *testing.T) { idUUID := uuid.Parse("bf58eb3e-0582-42ee-83b2-d098c206260e") require.NotNil(t, idUUID) 
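The parsing convention tested above also fixes the sort order for directories
that mix legacy and indexed names: with unindexedFilesetIndex = -1 at this
stage of the patch, a legacy file sorts ahead of volume 0 for the same block
start. A sketch of that behavior (file names are hypothetical):

	names := []string{
		"foo/fileset-1-0-data.db", // volume 0
		"foo/fileset-1-data.db",   // legacy, parses as volume -1
	}
	sort.Sort(dataFileSetFilesByTimeAndVolumeIndexAscending(names))
	// names[0] is now the legacy file: equal block starts tie-break on the
	// parsed volume index, and -1 orders before 0.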
@@ -746,6 +786,34 @@ func TestNextIndexFileSetVolumeIndex(t *testing.T) { } } +func TestNextDataFileSetVolumeIndex(t *testing.T) { + // Make empty directory + shard := uint32(0) + dir := createTempDir(t) + dataDir := ShardDataDirPath(dir, testNs1ID, shard) + require.NoError(t, os.MkdirAll(dataDir, 0755)) + defer os.RemoveAll(dataDir) + + blockStart := time.Now().Truncate(time.Hour) + + // Check increments properly + curr := -1 + for i := 0; i <= 10; i++ { + index, err := NextDataFileSetVolumeIndex(dir, testNs1ID, shard, blockStart) + require.NoError(t, err) + require.Equal(t, curr+1, index) + curr = index + + p := filesetPathFromTimeAndIndex(dataDir, blockStart, index, checkpointFileSuffix) + + digestBuf := digest.NewBuffer() + digestBuf.WriteDigest(digest.Checksum([]byte("bar"))) + + err = ioutil.WriteFile(p, digestBuf, defaultNewFileMode) + require.NoError(t, err) + } +} + func TestMultipleForBlockStart(t *testing.T) { numSnapshots := 20 numSnapshotsPerBlock := 4 diff --git a/src/dbnode/persist/fs/persist_manager.go b/src/dbnode/persist/fs/persist_manager.go index 41b9458ec2..118df8ad9c 100644 --- a/src/dbnode/persist/fs/persist_manager.go +++ b/src/dbnode/persist/fs/persist_manager.go @@ -422,8 +422,16 @@ func (pm *persistManager) PrepareData(opts persist.DataPrepareOptions) (persist. } var volumeIndex int - if opts.FileSetType == persist.FileSetSnapshotType { - // Need to work out the volume index for the next snapshot + switch opts.FileSetType { + case persist.FileSetFlushType: + // Need to work out the volume index for the next data file. + volumeIndex, err = NextDataFileSetVolumeIndex(pm.opts.FilePathPrefix(), + nsMetadata.ID(), shard, blockStart) + if err != nil { + return prepared, err + } + case persist.FileSetSnapshotType: + // Need to work out the volume index for the next snapshot. volumeIndex, err = NextSnapshotFileSetVolumeIndex(pm.opts.FilePathPrefix(), nsMetadata.ID(), shard, blockStart) if err != nil { @@ -453,7 +461,7 @@ func (pm *persistManager) PrepareData(opts persist.DataPrepareOptions) (persist. 
} if exists && opts.DeleteIfExists { - err := DeleteFileSetAt(pm.opts.FilePathPrefix(), nsID, shard, blockStart) + err := DeleteFileSetAt(pm.opts.FilePathPrefix(), nsID, shard, blockStart, volumeIndex) if err != nil { return prepared, err } diff --git a/src/dbnode/persist/fs/read.go b/src/dbnode/persist/fs/read.go index d594b7e5ca..9f0ceab00d 100644 --- a/src/dbnode/persist/fs/read.go +++ b/src/dbnode/persist/fs/read.go @@ -92,6 +92,7 @@ type reader struct { expectedDigestOfDigest uint32 expectedBloomFilterDigest uint32 shard uint32 + volume int open bool } @@ -128,11 +129,11 @@ func NewReader( func (r *reader) Open(opts DataReaderOpenOptions) error { var ( - namespace = opts.Identifier.Namespace - shard = opts.Identifier.Shard - blockStart = opts.Identifier.BlockStart - snapshotIndex = opts.Identifier.VolumeIndex - err error + namespace = opts.Identifier.Namespace + shard = opts.Identifier.Shard + blockStart = opts.Identifier.BlockStart + volumeIndex = opts.Identifier.VolumeIndex + err error ) var ( @@ -147,20 +148,20 @@ func (r *reader) Open(opts DataReaderOpenOptions) error { switch opts.FileSetType { case persist.FileSetSnapshotType: shardDir = ShardSnapshotsDirPath(r.filePathPrefix, namespace, shard) - checkpointFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, snapshotIndex, checkpointFileSuffix) - infoFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, snapshotIndex, infoFileSuffix) - digestFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, snapshotIndex, digestFileSuffix) - bloomFilterFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, snapshotIndex, bloomFilterFileSuffix) - indexFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, snapshotIndex, indexFileSuffix) - dataFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, snapshotIndex, dataFileSuffix) + checkpointFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, checkpointFileSuffix) + infoFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, infoFileSuffix) + digestFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, digestFileSuffix) + bloomFilterFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, bloomFilterFileSuffix) + indexFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, indexFileSuffix) + dataFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, dataFileSuffix) case persist.FileSetFlushType: shardDir = ShardDataDirPath(r.filePathPrefix, namespace, shard) - checkpointFilepath = filesetPathFromTime(shardDir, blockStart, checkpointFileSuffix) - infoFilepath = filesetPathFromTime(shardDir, blockStart, infoFileSuffix) - digestFilepath = filesetPathFromTime(shardDir, blockStart, digestFileSuffix) - bloomFilterFilepath = filesetPathFromTime(shardDir, blockStart, bloomFilterFileSuffix) - indexFilepath = filesetPathFromTime(shardDir, blockStart, indexFileSuffix) - dataFilepath = filesetPathFromTime(shardDir, blockStart, dataFileSuffix) + checkpointFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, checkpointFileSuffix) + infoFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, infoFileSuffix) + digestFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, digestFileSuffix) + bloomFilterFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, bloomFilterFileSuffix) + indexFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, indexFileSuffix) + 
dataFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, dataFileSuffix) default: return fmt.Errorf("unable to open reader with fileset type: %s", opts.FileSetType) } @@ -237,6 +238,7 @@ func (r *reader) Open(opts DataReaderOpenOptions) error { r.open = true r.namespace = namespace r.shard = shard + r.volume = volumeIndex return nil } @@ -246,6 +248,7 @@ func (r *reader) Status() DataFileSetReaderStatus { Open: r.open, Namespace: r.namespace, Shard: r.shard, + Volume: r.volume, BlockStart: r.start, } } diff --git a/src/dbnode/persist/fs/retriever.go b/src/dbnode/persist/fs/retriever.go index 0bfc295cbd..3c5bfbbf42 100644 --- a/src/dbnode/persist/fs/retriever.go +++ b/src/dbnode/persist/fs/retriever.go @@ -271,7 +271,9 @@ func (r *blockRetriever) fetchBatch( seekerResources ReusableSeekerResources, ) { // Resolve the seeker from the seeker mgr - seeker, err := seekerMgr.Borrow(shard, blockStart) + // TODO(juchan): actually get the correct volume. + vol := 0 + seeker, err := seekerMgr.Borrow(shard, blockStart, vol) if err != nil { for _, req := range reqs { req.onError(err) @@ -366,7 +368,8 @@ func (r *blockRetriever) fetchBatch( }(req) } - err = seekerMgr.Return(shard, blockStart, seeker) + // TODO(juchan): actually get the correct volume. + err = seekerMgr.Return(shard, blockStart, vol, seeker) if err != nil { r.logger.Error("err returning seeker for shard", zap.Uint32("shard", shard), @@ -408,7 +411,9 @@ func (r *blockRetriever) Stream( } r.RUnlock() - bloomFilter, err := r.seekerMgr.ConcurrentIDBloomFilter(shard, startTime) + // TODO(juchan): actually get the correct volume. + vol := 0 + bloomFilter, err := r.seekerMgr.ConcurrentIDBloomFilter(shard, startTime, vol) if err != nil { return xio.EmptyBlockReader, err } diff --git a/src/dbnode/persist/fs/seek.go b/src/dbnode/persist/fs/seek.go index 7dc0561dfc..1f2c3495c6 100644 --- a/src/dbnode/persist/fs/seek.go +++ b/src/dbnode/persist/fs/seek.go @@ -145,6 +145,7 @@ func (s *seeker) Open( namespace ident.ID, shard uint32, blockStart time.Time, + volume int, resources ReusableSeekerResources, ) error { if s.isClone { @@ -156,12 +157,12 @@ func (s *seeker) Open( // Open necessary files if err := openFiles(os.Open, map[string]**os.File{ - filesetPathFromTime(shardDir, blockStart, infoFileSuffix): &infoFd, - filesetPathFromTime(shardDir, blockStart, indexFileSuffix): &s.indexFd, - filesetPathFromTime(shardDir, blockStart, dataFileSuffix): &s.dataFd, - filesetPathFromTime(shardDir, blockStart, digestFileSuffix): &digestFd, - filesetPathFromTime(shardDir, blockStart, bloomFilterFileSuffix): &bloomFilterFd, - filesetPathFromTime(shardDir, blockStart, summariesFileSuffix): &summariesFd, + filesetPathFromTimeAndIndex(shardDir, blockStart, volume, infoFileSuffix): &infoFd, + filesetPathFromTimeAndIndex(shardDir, blockStart, volume, indexFileSuffix): &s.indexFd, + filesetPathFromTimeAndIndex(shardDir, blockStart, volume, dataFileSuffix): &s.dataFd, + filesetPathFromTimeAndIndex(shardDir, blockStart, volume, digestFileSuffix): &digestFd, + filesetPathFromTimeAndIndex(shardDir, blockStart, volume, bloomFilterFileSuffix): &bloomFilterFd, + filesetPathFromTimeAndIndex(shardDir, blockStart, volume, summariesFileSuffix): &summariesFd, }); err != nil { return err } diff --git a/src/dbnode/persist/fs/seek_manager.go b/src/dbnode/persist/fs/seek_manager.go index 074908f563..f6bf352c28 100644 --- a/src/dbnode/persist/fs/seek_manager.go +++ b/src/dbnode/persist/fs/seek_manager.go @@ -52,11 +52,12 @@ var ( errReturnedUnmanagedSeeker = 
		errors.New("cant return a seeker not managed by the seeker manager")
)

-type openAnyUnopenSeekersFn func(*seekersByTime) error
+type openAnyUnopenSeekersFn func(*seekersByTimeAndVolume) error
 
 type newOpenSeekerFn func(
 	shard uint32,
 	blockStart time.Time,
+	volume int,
 ) (DataFileSetSeeker, error)
 
 type seekerManagerStatus int
@@ -79,7 +80,7 @@ type seekerManager struct {
 	filePathPrefix string
 
 	status                 seekerManagerStatus
-	seekersByShardIdx      []*seekersByTime
+	seekersByShardIdx      []*seekersByTimeAndVolume
 	namespace              ident.ID
 	namespaceMetadata      namespace.Metadata
 	unreadBuf              seekerUnreadBuf
@@ -96,8 +97,9 @@ type seekerUnreadBuf struct {
 	value []byte
 }
 
-// seekersAndBloom contains a slice of seekers for a given shard/blockStart. One of the seeker will be the original,
-// and the others will be clones. The bloomFilter field is a reference to the underlying bloom filter that the
+// seekersAndBloom contains a slice of seekers for a given shard/blockStart/volume.
+// One of the seekers will be the original, and the others will be clones. The
+// bloomFilter field is a reference to the underlying bloom filter that the
 // original seeker and all of its clones share.
 type seekersAndBloom struct {
 	wg          *sync.WaitGroup
@@ -111,11 +113,35 @@ type borrowableSeeker struct {
 	isBorrowed bool
 }
 
-type seekersByTime struct {
+type seekersByTimeAndVolume struct {
 	sync.RWMutex
 	shard    uint32
 	accessed bool
-	seekers  map[xtime.UnixNano]seekersAndBloom
+	seekers  map[xtime.UnixNano]map[int]seekersAndBloom
+}
+
+// remove deletes the seekers for a given blockStart and volume. Callers must
+// already hold the write lock; every call site mutates the map within its own
+// critical section, so locking here again would deadlock.
+func (s *seekersByTimeAndVolume) remove(blockStart xtime.UnixNano, volume int) error {
+	seekersByVolume, ok := s.seekers[blockStart]
+	if !ok {
+		return fmt.Errorf("seekers for blockStart %v not found", blockStart)
+	}
+
+	if _, ok := seekersByVolume[volume]; !ok {
+		return fmt.Errorf("seekers for blockStart %v, volume %d not found", blockStart, volume)
+	}
+
+	// Delete seekers for the specific volume.
+	delete(seekersByVolume, volume)
+	// If all seekers for all volumes of a blockStart have been removed, remove
+	// the blockStart from the map.
+ if len(seekersByVolume) == 0 { + delete(s.seekers, blockStart) + } + + return nil } type seekerManagerPendingClose struct { @@ -183,14 +209,14 @@ func (m *seekerManager) CacheShardIndices(shards []uint32) error { multiErr := xerrors.NewMultiError() for _, shard := range shards { - byTime := m.seekersByTime(shard) + byTimeAndVolume := m.seekersByTimeAndVolume(shard) - byTime.Lock() + byTimeAndVolume.Lock() // Track accessed to precache in open/close loop - byTime.accessed = true - byTime.Unlock() + byTimeAndVolume.accessed = true + byTimeAndVolume.Unlock() - if err := m.openAnyUnopenSeekersFn(byTime); err != nil { + if err := m.openAnyUnopenSeekersFn(byTimeAndVolume); err != nil { multiErr = multiErr.Add(err) } } @@ -198,35 +224,43 @@ func (m *seekerManager) CacheShardIndices(shards []uint32) error { return multiErr.FinalError() } -func (m *seekerManager) ConcurrentIDBloomFilter(shard uint32, start time.Time) (*ManagedConcurrentBloomFilter, error) { - byTime := m.seekersByTime(shard) +func (m *seekerManager) ConcurrentIDBloomFilter( + shard uint32, + start time.Time, + volume int, +) (*ManagedConcurrentBloomFilter, error) { + byTimeAndVolume := m.seekersByTimeAndVolume(shard) // Try fast RLock() first - byTime.RLock() + byTimeAndVolume.RLock() startNano := xtime.ToUnixNano(start) - seekersAndBloom, ok := byTime.seekers[startNano] - byTime.RUnlock() + seekersAndBloom, ok := byTimeAndVolume.seekers[startNano][volume] + byTimeAndVolume.RUnlock() if ok && seekersAndBloom.wg == nil { return seekersAndBloom.bloomFilter, nil } - byTime.Lock() - seekersAndBloom, err := m.getOrOpenSeekersWithLock(startNano, byTime) - byTime.Unlock() + byTimeAndVolume.Lock() + seekersAndBloom, err := m.getOrOpenSeekersWithLock(startNano, volume, byTimeAndVolume) + byTimeAndVolume.Unlock() return seekersAndBloom.bloomFilter, err } -func (m *seekerManager) Borrow(shard uint32, start time.Time) (ConcurrentDataFileSetSeeker, error) { - byTime := m.seekersByTime(shard) +func (m *seekerManager) Borrow( + shard uint32, + start time.Time, + volume int, +) (ConcurrentDataFileSetSeeker, error) { + byTimeAndVolume := m.seekersByTimeAndVolume(shard) - byTime.Lock() - defer byTime.Unlock() + byTimeAndVolume.Lock() + defer byTimeAndVolume.Unlock() // Track accessed to precache in open/close loop - byTime.accessed = true + byTimeAndVolume.accessed = true startNano := xtime.ToUnixNano(start) - seekersAndBloom, err := m.getOrOpenSeekersWithLock(startNano, byTime) + seekersAndBloom, err := m.getOrOpenSeekersWithLock(startNano, volume, byTimeAndVolume) if err != nil { return nil, err } @@ -252,14 +286,19 @@ func (m *seekerManager) Borrow(shard uint32, start time.Time) (ConcurrentDataFil return availableSeeker.seeker, nil } -func (m *seekerManager) Return(shard uint32, start time.Time, seeker ConcurrentDataFileSetSeeker) error { - byTime := m.seekersByTime(shard) +func (m *seekerManager) Return( + shard uint32, + start time.Time, + volume int, + seeker ConcurrentDataFileSetSeeker, +) error { + byTimeAndVolume := m.seekersByTimeAndVolume(shard) - byTime.Lock() - defer byTime.Unlock() + byTimeAndVolume.Lock() + defer byTimeAndVolume.Unlock() startNano := xtime.ToUnixNano(start) - seekersAndBloom, ok := byTime.seekers[startNano] + seekersAndBloom, ok := byTimeAndVolume.seekers[startNano][volume] // Should never happen - This either means that the caller (DataBlockRetriever) is trying to return seekers // that it never requested, OR its trying to return seekers after the openCloseLoop has already // determined that they were all no longer 
in use and safe to close. Either way it indicates there is
@@ -298,7 +337,7 @@ func (m *seekerManager) UpdateOpenLease(
 }
 
 // getOrOpenSeekersWithLock checks if the seekers are already open / initialized. If they are, then it
-// returns them. Then, it checks if a different goroutine is in the process of opening them , if so it
+// returns them. Then, it checks if a different goroutine is in the process of opening them, if so it
 // registers itself as waiting until the other goroutine completes. If neither of those conditions occur,
 // then it begins the process of opening the seekers itself. First, it creates a waitgroup that other
 // goroutines can use so that they're notified when the seekers are open. This is useful because it allows
 // us to avoid opening the same seeker multiple times concurrently, and also prevents us from holding the lock
 // of the seekersByTime struct during an I/O heavy workload. Once the wg is created, we relinquish the lock,
 // open the Seeker (I/O heavy), re-acquire the lock (so that the waiting goroutines don't get it before us),
 // and then notify the waiting goroutines that we've finished.
-func (m *seekerManager) getOrOpenSeekersWithLock(start xtime.UnixNano, byTime *seekersByTime) (seekersAndBloom, error) {
-	seekers, ok := byTime.seekers[start]
+func (m *seekerManager) getOrOpenSeekersWithLock(
+	start xtime.UnixNano,
+	volume int,
+	byTimeAndVolume *seekersByTimeAndVolume,
+) (seekersAndBloom, error) {
+	seekers, ok := byTimeAndVolume.seekers[start][volume]
 	if ok && seekers.wg == nil {
 		// Seekers are already open
 		return seekers, nil
@@ -315,11 +358,11 @@
 
 	if seekers.wg != nil {
 		// Seekers are being initialized / opened, wait for that to complete
-		byTime.Unlock()
+		byTimeAndVolume.Unlock()
 		seekers.wg.Wait()
-		byTime.Lock()
+		byTimeAndVolume.Lock()
 		// Need to do the lookup again recursively to see the new state
-		return m.getOrOpenSeekersWithLock(start, byTime)
+		return m.getOrOpenSeekersWithLock(start, volume, byTimeAndVolume)
 	}
 
 	// Seekers need to be opened
@@ -330,21 +373,21 @@
 	wg := &sync.WaitGroup{}
 	seekers.wg = wg
 	seekers.wg.Add(1)
-	byTime.seekers[start] = seekers
-	byTime.Unlock()
+	// Initialize the inner volume map for this blockStart if it doesn't exist
+	// yet; assigning into a nil map would panic.
+	if byTimeAndVolume.seekers[start] == nil {
+		byTimeAndVolume.seekers[start] = make(map[int]seekersAndBloom)
+	}
+	byTimeAndVolume.seekers[start][volume] = seekers
+	byTimeAndVolume.Unlock()
 
 	// Open first one - Do this outside the context of the lock because opening
 	// a seeker can be an expensive operation (validating index files)
-	seeker, err := m.newOpenSeekerFn(byTime.shard, start.ToTime())
+	seeker, err := m.newOpenSeekerFn(byTimeAndVolume.shard, start.ToTime(), volume)
 	// Immediately re-lock once the seeker is open regardless of errors because
 	// that's the contract of this function
-	byTime.Lock()
+	byTimeAndVolume.Lock()
 	// Call done after we re-acquire the lock so that callers who were waiting
 	// won't get the lock before us.
 	wg.Done()
 
 	if err != nil {
-		// Delete the seekersByTime struct so that the process can be restarted if necessary
-		delete(byTime.seekers, start)
+		// Delete the seekersByTimeAndVolume struct so that the process can be restarted if necessary
+		byTimeAndVolume.remove(start, volume)
 		return seekersAndBloom{}, err
 	}
@@ -360,7 +403,7 @@
 			multiErr = multiErr.Add(seeker.seeker.Close())
 			}
-			// Delete the seekersByTime struct so that the process can be restarted if necessary
-			delete(byTime.seekers, start)
+			// Delete the seekersByTimeAndVolume struct so that the process can be restarted if necessary
+			byTimeAndVolume.remove(start, volume)
 			return seekersAndBloom{}, multiErr.FinalError()
 		}
 		borrowableSeekers = append(borrowableSeekers, borrowableSeeker{seeker: clone})
@@ -371,20 +414,20 @@
 	// Doesn't matter which seeker we pick to grab the bloom filter from, they all share the same underlying one.
 	// Use index 0 because it's guaranteed to be there.
 	seekers.bloomFilter = borrowableSeekers[0].seeker.ConcurrentIDBloomFilter()
-	byTime.seekers[start] = seekers
+	byTimeAndVolume.seekers[start][volume] = seekers
 
 	return seekers, nil
 }
 
-func (m *seekerManager) openAnyUnopenSeekers(byTime *seekersByTime) error {
+func (m *seekerManager) openAnyUnopenSeekers(byTimeAndVolume *seekersByTimeAndVolume) error {
 	start := m.earliestSeekableBlockStart()
 	end := m.latestSeekableBlockStart()
 	blockSize := m.namespaceMetadata.Options().RetentionOptions().BlockSize()
 	multiErr := xerrors.NewMultiError()
 
 	for t := start; !t.After(end); t = t.Add(blockSize) {
-		byTime.Lock()
-		_, err := m.getOrOpenSeekersWithLock(xtime.ToUnixNano(t), byTime)
-		byTime.Unlock()
+		byTimeAndVolume.Lock()
+		// TODO(juchan): actually get the correct volume.
+		_, err := m.getOrOpenSeekersWithLock(xtime.ToUnixNano(t), 0, byTimeAndVolume)
+		byTimeAndVolume.Unlock()
 		if err != nil && err != errSeekerManagerFileSetNotFound {
 			multiErr = multiErr.Add(err)
 		}
@@ -396,6 +439,7 @@
 func (m *seekerManager) newOpenSeeker(
 	shard uint32,
 	blockStart time.Time,
+	volume int,
 ) (DataFileSetSeeker, error) {
 	exists, err := DataFileSetExistsAt(m.filePathPrefix, m.namespace, shard, blockStart)
 	if err != nil {
@@ -437,8 +481,7 @@
 		return nil, fmt.Errorf("err opening latest lease: %v", err)
 	}
 
-	// TODO_OUT_OF_ORDER_WRITES(rartoul): Pass volume obtained from OpenLatestLease here.
- err = seeker.Open(m.namespace, shard, blockStart, resources) + err = seeker.Open(m.namespace, shard, blockStart, volume, resources) m.putSeekerResources(resources) if err != nil { return nil, err @@ -452,12 +495,12 @@ func (m *seekerManager) newOpenSeeker( return seeker, nil } -func (m *seekerManager) seekersByTime(shard uint32) *seekersByTime { +func (m *seekerManager) seekersByTimeAndVolume(shard uint32) *seekersByTimeAndVolume { m.RLock() if int(shard) < len(m.seekersByShardIdx) { - byTime := m.seekersByShardIdx[shard] + byTimeAndVolume := m.seekersByShardIdx[shard] m.RUnlock() - return byTime + return byTimeAndVolume } m.RUnlock() @@ -466,27 +509,27 @@ func (m *seekerManager) seekersByTime(shard uint32) *seekersByTime { // Check if raced with another call to this method if int(shard) < len(m.seekersByShardIdx) { - byTime := m.seekersByShardIdx[shard] - return byTime + byTimeAndVolume := m.seekersByShardIdx[shard] + return byTimeAndVolume } - seekersByShardIdx := make([]*seekersByTime, shard+1) + seekersByShardIdx := make([]*seekersByTimeAndVolume, shard+1) for i := range seekersByShardIdx { if i < len(m.seekersByShardIdx) { seekersByShardIdx[i] = m.seekersByShardIdx[i] continue } - seekersByShardIdx[i] = &seekersByTime{ + seekersByShardIdx[i] = &seekersByTimeAndVolume{ shard: uint32(i), - seekers: make(map[xtime.UnixNano]seekersAndBloom), + seekers: make(map[xtime.UnixNano]map[int]seekersAndBloom), } } m.seekersByShardIdx = seekersByShardIdx - byTime := m.seekersByShardIdx[shard] + byTimeAndVolume := m.seekersByShardIdx[shard] - return byTime + return byTimeAndVolume } func (m *seekerManager) Close() error { @@ -499,18 +542,20 @@ func (m *seekerManager) Close() error { // Make sure all seekers are returned before allowing the SeekerManager to be closed. // Actual cleanup of the seekers themselves will be handled by the openCloseLoop. 
- for _, byTime := range m.seekersByShardIdx { - byTime.Lock() - for _, seekersByTime := range byTime.seekers { - for _, seeker := range seekersByTime.seekers { - if seeker.isBorrowed { - byTime.Unlock() - m.Unlock() - return errCantCloseSeekerManagerWhileSeekersAreBorrowed + for _, byTimeAndVolume := range m.seekersByShardIdx { + byTimeAndVolume.Lock() + for _, seekersByTimeAndVolume := range byTimeAndVolume.seekers { + for _, seekersByVolume := range seekersByTimeAndVolume { + for _, seeker := range seekersByVolume.seekers { + if seeker.isBorrowed { + byTimeAndVolume.Unlock() + m.Unlock() + return errCantCloseSeekerManagerWhileSeekersAreBorrowed + } } } } - byTime.Unlock() + byTimeAndVolume.Unlock() } m.status = seekerManagerClosed @@ -547,7 +592,7 @@ func (m *seekerManager) latestSeekableBlockStart() time.Time { func (m *seekerManager) openCloseLoop() { var ( - shouldTryOpen []*seekersByTime + shouldTryOpen []*seekersByTimeAndVolume shouldClose []seekerManagerPendingClose closing []borrowableSeeker ) @@ -576,26 +621,26 @@ func (m *seekerManager) openCloseLoop() { break } - for _, byTime := range m.seekersByShardIdx { - byTime.RLock() - accessed := byTime.accessed - byTime.RUnlock() + for _, byTimeAndVolume := range m.seekersByShardIdx { + byTimeAndVolume.RLock() + accessed := byTimeAndVolume.accessed + byTimeAndVolume.RUnlock() if !accessed { continue } - shouldTryOpen = append(shouldTryOpen, byTime) + shouldTryOpen = append(shouldTryOpen, byTimeAndVolume) } m.RUnlock() // Try opening any unopened times for accessed seekers - for _, byTime := range shouldTryOpen { - m.openAnyUnopenSeekersFn(byTime) + for _, byTimeAndVolume := range shouldTryOpen { + m.openAnyUnopenSeekersFn(byTimeAndVolume) } m.RLock() - for shard, byTime := range m.seekersByShardIdx { - byTime.RLock() - for blockStartNano := range byTime.seekers { + for shard, byTimeAndVolume := range m.seekersByShardIdx { + byTimeAndVolume.RLock() + for blockStartNano := range byTimeAndVolume.seekers { blockStart := blockStartNano.ToTime() if blockStart.Before(earliestSeekableBlockStart) { shouldClose = append(shouldClose, seekerManagerPendingClose{ @@ -604,30 +649,32 @@ func (m *seekerManager) openCloseLoop() { }) } } - byTime.RUnlock() + byTimeAndVolume.RUnlock() } if len(shouldClose) > 0 { for _, elem := range shouldClose { - byTime := m.seekersByShardIdx[elem.shard] + byTimeAndVolume := m.seekersByShardIdx[elem.shard] blockStartNano := xtime.ToUnixNano(elem.blockStart) - byTime.Lock() - seekersAndBloom := byTime.seekers[blockStartNano] - allSeekersAreReturned := true - for _, seeker := range seekersAndBloom.seekers { - if seeker.isBorrowed { - allSeekersAreReturned = false - break + byTimeAndVolume.Lock() + byVolume := byTimeAndVolume.seekers[blockStartNano] + for vol, seekersAndBloom := range byVolume { + allSeekersAreReturned := true + for _, seeker := range seekersAndBloom.seekers { + if seeker.isBorrowed { + allSeekersAreReturned = false + break + } + } + // Never close seekers unless they've all been returned because + // some of them are clones of the original and can't be used once + // the parent is closed (because they share underlying resources) + if allSeekersAreReturned { + closing = append(closing, seekersAndBloom.seekers...) 
+ byTimeAndVolume.remove(xtime.ToUnixNano(elem.blockStart), vol) } } - // Never close seekers unless they've all been returned because - // some of them are clones of the original and can't be used once - // the parent is closed (because they share underlying resources) - if allSeekersAreReturned { - closing = append(closing, seekersAndBloom.seekers...) - delete(byTime.seekers, blockStartNano) - } - byTime.Unlock() + byTimeAndVolume.Unlock() } } m.RUnlock() @@ -647,20 +694,22 @@ func (m *seekerManager) openCloseLoop() { // Release all resources m.Lock() - for _, byTime := range m.seekersByShardIdx { - byTime.Lock() - for _, seekersByTime := range byTime.seekers { - for _, seeker := range seekersByTime.seekers { - // We don't need to check if the seeker is borrowed here because we don't allow the - // SeekerManager to be closed if any seekers are still outstanding. - err := seeker.seeker.Close() - if err != nil { - m.logger.Error("err closing seeker in SeekerManager at end of openCloseLoop", zap.Error(err)) + for _, byTimeAndVolume := range m.seekersByShardIdx { + byTimeAndVolume.Lock() + for _, seekersByTimeAndVolume := range byTimeAndVolume.seekers { + for _, seekersByVolume := range seekersByTimeAndVolume { + for _, seeker := range seekersByVolume.seekers { + // We don't need to check if the seeker is borrowed here because we don't allow the + // SeekerManager to be closed if any seekers are still outstanding. + err := seeker.seeker.Close() + if err != nil { + m.logger.Error("err closing seeker in SeekerManager at end of openCloseLoop", zap.Error(err)) + } } } } - byTime.seekers = nil - byTime.Unlock() + byTimeAndVolume.seekers = nil + byTimeAndVolume.Unlock() } m.seekersByShardIdx = nil m.Unlock() diff --git a/src/dbnode/persist/fs/types.go b/src/dbnode/persist/fs/types.go index b585e4cb99..068f697746 100644 --- a/src/dbnode/persist/fs/types.go +++ b/src/dbnode/persist/fs/types.go @@ -51,7 +51,7 @@ type FileSetFileIdentifier struct { BlockStart time.Time // Only required for data content files Shard uint32 - // Required for snapshot files (index yes, data yes) and flush files (index yes, data no) + // Required for snapshot files (index yes, data yes) and flush files (index yes, data yes) VolumeIndex int } @@ -105,9 +105,9 @@ type SnapshotMetadataFileReader interface { type DataFileSetReaderStatus struct { Namespace ident.ID BlockStart time.Time - - Shard uint32 - Open bool + Shard uint32 + Volume int + Open bool } // DataReaderOpenOptions is options struct for the reader open method. @@ -173,6 +173,7 @@ type DataFileSetSeeker interface { namespace ident.ID, shard uint32, start time.Time, + volume int, resources ReusableSeekerResources, ) error @@ -237,15 +238,15 @@ type DataFileSetSeekerManager interface { // to improve times when seeking to a block. CacheShardIndices(shards []uint32) error - // Borrow returns an open seeker for a given shard and block start time. - Borrow(shard uint32, start time.Time) (ConcurrentDataFileSetSeeker, error) + // Borrow returns an open seeker for a given shard, block start time, and volume. + Borrow(shard uint32, start time.Time, volume int) (ConcurrentDataFileSetSeeker, error) - // Return returns an open seeker for a given shard and block start time. - Return(shard uint32, start time.Time, seeker ConcurrentDataFileSetSeeker) error + // Return returns an open seeker for a given shard, block start time, and volume. 
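+	//
+	// A borrow/return round-trip is expected to use the same
+	// shard/blockStart/volume triple on both sides, e.g. (illustrative):
+	//
+	//	seeker, err := mgr.Borrow(shard, blockStart, 0)
+	//	// ... use seeker ...
+	//	err = mgr.Return(shard, blockStart, 0, seeker)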
+ Return(shard uint32, start time.Time, volume int, seeker ConcurrentDataFileSetSeeker) error // ConcurrentIDBloomFilter returns a concurrent ID bloom filter for a given - // shard and block start time - ConcurrentIDBloomFilter(shard uint32, start time.Time) (*ManagedConcurrentBloomFilter, error) + // shard, block start time, and volume. + ConcurrentIDBloomFilter(shard uint32, start time.Time, volume int) (*ManagedConcurrentBloomFilter, error) } // DataBlockRetriever provides a block retriever for TSDB file sets diff --git a/src/dbnode/persist/fs/write.go b/src/dbnode/persist/fs/write.go index ebfe3e589f..622ca8e1af 100644 --- a/src/dbnode/persist/fs/write.go +++ b/src/dbnode/persist/fs/write.go @@ -145,11 +145,11 @@ func NewWriter(opts Options) (DataFileSetWriter, error) { // opening / truncating files associated with that shard for writing. func (w *writer) Open(opts DataWriterOpenOptions) error { var ( - nextSnapshotIndex int - err error - namespace = opts.Identifier.Namespace - shard = opts.Identifier.Shard - blockStart = opts.Identifier.BlockStart + err error + namespace = opts.Identifier.Namespace + shard = opts.Identifier.Shard + blockStart = opts.Identifier.BlockStart + nextVolumeIndex = opts.Identifier.VolumeIndex ) w.blockSize = opts.BlockSize @@ -178,27 +178,26 @@ func (w *writer) Open(opts DataWriterOpenOptions) error { return err } - nextSnapshotIndex = opts.Identifier.VolumeIndex - w.checkpointFilePath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextSnapshotIndex, checkpointFileSuffix) - infoFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextSnapshotIndex, infoFileSuffix) - indexFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextSnapshotIndex, indexFileSuffix) - summariesFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextSnapshotIndex, summariesFileSuffix) - bloomFilterFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextSnapshotIndex, bloomFilterFileSuffix) - dataFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextSnapshotIndex, dataFileSuffix) - digestFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextSnapshotIndex, digestFileSuffix) + w.checkpointFilePath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, checkpointFileSuffix) + infoFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, infoFileSuffix) + indexFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, indexFileSuffix) + summariesFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, summariesFileSuffix) + bloomFilterFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, bloomFilterFileSuffix) + dataFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, dataFileSuffix) + digestFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, digestFileSuffix) case persist.FileSetFlushType: shardDir = ShardDataDirPath(w.filePathPrefix, namespace, shard) if err := os.MkdirAll(shardDir, w.newDirectoryMode); err != nil { return err } - w.checkpointFilePath = filesetPathFromTime(shardDir, blockStart, checkpointFileSuffix) - infoFilepath = filesetPathFromTime(shardDir, blockStart, infoFileSuffix) - indexFilepath = filesetPathFromTime(shardDir, blockStart, indexFileSuffix) - summariesFilepath = filesetPathFromTime(shardDir, blockStart, summariesFileSuffix) - bloomFilterFilepath = filesetPathFromTime(shardDir, blockStart, bloomFilterFileSuffix) - dataFilepath = filesetPathFromTime(shardDir, 
blockStart, dataFileSuffix) - digestFilepath = filesetPathFromTime(shardDir, blockStart, digestFileSuffix) + w.checkpointFilePath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, checkpointFileSuffix) + infoFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, infoFileSuffix) + indexFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, indexFileSuffix) + summariesFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, summariesFileSuffix) + bloomFilterFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, bloomFilterFileSuffix) + dataFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, dataFileSuffix) + digestFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, digestFileSuffix) default: return fmt.Errorf("unable to open reader with fileset type: %s", opts.FileSetType) } diff --git a/src/dbnode/storage/namespace_readers.go b/src/dbnode/storage/namespace_readers.go index 727f6a610e..3d0e6ffae3 100644 --- a/src/dbnode/storage/namespace_readers.go +++ b/src/dbnode/storage/namespace_readers.go @@ -24,8 +24,8 @@ import ( "sync" "time" - "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/x/ident" "github.com/m3db/m3/src/x/pool" xtime "github.com/m3db/m3/src/x/time" @@ -66,6 +66,7 @@ type databaseNamespaceReaderManager interface { get( shard uint32, blockStart time.Time, + volume int, position readerPosition, ) (fs.DataFileSetReader, error) @@ -109,6 +110,7 @@ type namespaceReaderManager struct { type cachedOpenReaderKey struct { shard uint32 blockStart xtime.UnixNano + volume int position readerPosition } @@ -232,12 +234,14 @@ func (m *namespaceReaderManager) cachedReaderForKey( func (m *namespaceReaderManager) get( shard uint32, blockStart time.Time, + volume int, position readerPosition, ) (fs.DataFileSetReader, error) { key := cachedOpenReaderKey{ shard: shard, blockStart: xtime.ToUnixNano(blockStart), position: position, + volume: volume, } lookup, err := m.cachedReaderForKey(key) @@ -253,9 +257,10 @@ func (m *namespaceReaderManager) get( reader := lookup.closedReader openOpts := fs.DataReaderOpenOptions{ Identifier: fs.FileSetFileIdentifier{ - Namespace: m.namespace.ID(), - Shard: shard, - BlockStart: blockStart, + Namespace: m.namespace.ID(), + Shard: shard, + BlockStart: blockStart, + VolumeIndex: volume, }, } if err := reader.Open(openOpts); err != nil { @@ -303,6 +308,7 @@ func (m *namespaceReaderManager) put(reader fs.DataFileSetReader) { key := cachedOpenReaderKey{ shard: status.Shard, blockStart: xtime.ToUnixNano(status.BlockStart), + volume: status.Volume, position: readerPosition{ dataIdx: reader.EntriesRead(), metadataIdx: reader.MetadataRead(), diff --git a/src/dbnode/storage/shard.go b/src/dbnode/storage/shard.go index 26ea4373fb..5d5303e8c1 100644 --- a/src/dbnode/storage/shard.go +++ b/src/dbnode/storage/shard.go @@ -1687,8 +1687,10 @@ func (s *dbShard) FetchBlocksMetadataV2( pos.metadataIdx = int(flushedPhase.CurrBlockEntryIdx) } + // TODO(juchan): actually get the volume + vol := 0 // Open a reader at this position, potentially from cache - reader, err := s.namespaceReaderMgr.get(s.shard, blockStart, pos) + reader, err := s.namespaceReaderMgr.get(s.shard, blockStart, vol, pos) if err != nil { return nil, nil, err } From 4c11e634584b1a76e99060256bc0c05a8c3beaf3 Mon Sep 17 00:00:00 2001 From: Justin Chan Date: Thu, 
13 Jun 2019 10:01:12 -0400 Subject: [PATCH 02/12] Remove seek manager changes regarding volume; clean up logic --- src/dbnode/persist/fs/files.go | 171 ++++++++---- src/dbnode/persist/fs/files_test.go | 50 +--- src/dbnode/persist/fs/merger.go | 2 + src/dbnode/persist/fs/persist_manager.go | 28 +- src/dbnode/persist/fs/read.go | 38 +-- src/dbnode/persist/fs/retriever.go | 11 +- src/dbnode/persist/fs/seek.go | 13 +- src/dbnode/persist/fs/seek_manager.go | 262 +++++++----------- src/dbnode/persist/fs/types.go | 7 +- src/dbnode/persist/fs/write.go | 14 +- src/dbnode/persist/types.go | 15 +- .../bootstrap/bootstrapper/peers/source.go | 8 + src/dbnode/storage/cleanup.go | 16 +- src/dbnode/storage/cleanup_test.go | 3 + src/dbnode/storage/index.go | 2 +- src/dbnode/storage/namespace_readers.go | 12 +- src/dbnode/storage/shard.go | 113 +++++--- src/dbnode/storage/shard_test.go | 2 +- src/dbnode/storage/types.go | 5 + 19 files changed, 396 insertions(+), 376 deletions(-) diff --git a/src/dbnode/persist/fs/files.go b/src/dbnode/persist/fs/files.go index 541d53238e..56dd3b1ca1 100644 --- a/src/dbnode/persist/fs/files.go +++ b/src/dbnode/persist/fs/files.go @@ -58,9 +58,12 @@ const ( snapshotDirName = "snapshots" commitLogsDirName = "commitlogs" - // Filesets that are not indexed should be ordered before 0-indexed fileset - // files. - unindexedFilesetIndex = -1 + // The volume index assigned to (legacy) filesets that don't have a volume + // number in their filename. + // NOTE: Since this index is the same as the index for the first + // (non-legacy) fileset, receiving an index of 0 means that we need to + // check for both indexed and non-indexed filenames. + unindexedFilesetIndex = 0 commitLogComponentPosition = 2 indexFileSetComponentPosition = 2 @@ -178,9 +181,9 @@ func (f FileSetFilesSlice) Filepaths() []string { } // LatestVolumeForBlock returns the latest (highest index) FileSetFile in the -// slice for a given block start; applicable for data, index and snapshot file set files. +// slice for a given block start. func (f FileSetFilesSlice) LatestVolumeForBlock(blockStart time.Time) (FileSetFile, bool) { - // Make sure we're already sorted + // Make sure we're already sorted. f.sortByTimeAndVolumeIndexAscending() for i, curr := range f { @@ -211,6 +214,18 @@ func (f FileSetFilesSlice) LatestVolumeForBlock(blockStart time.Time) (FileSetFi return FileSetFile{}, false } +// VolumeExistsForBlock returns whether there is a valid FileSetFile for the +// given block start and volume index. +func (f FileSetFilesSlice) VolumeExistsForBlock(blockStart time.Time, volume int) bool { + for _, curr := range f { + if curr.ID.BlockStart.Equal(blockStart) && curr.ID.VolumeIndex == volume { + return curr.HasCompleteCheckpointFile() + } + } + + return false +} + // ignores the index in the FileSetFileIdentifier because fileset files should // always have index 0. 
func (f FileSetFilesSlice) sortByTimeAscending() { @@ -540,6 +555,27 @@ func readSnapshotInfoFile(filePathPrefix string, id FileSetFileIdentifier, reade infoFilePath, readerBufferSize, expectedInfoDigest) } +func readCheckpointFile(filePath string, digestBuf digest.Buffer) (uint32, error) { + exists, err := CompleteCheckpointFileExists(filePath) + if err != nil { + return 0, err + } + if !exists { + return 0, ErrCheckpointFileNotFound + } + fd, err := os.Open(filePath) + if err != nil { + return 0, err + } + defer fd.Close() + digest, err := digestBuf.ReadDigestFromFile(fd) + if err != nil { + return 0, err + } + + return digest, nil +} + type forEachInfoFileSelector struct { fileSetType persist.FileSetType contentType persist.FileSetContentType @@ -597,6 +633,10 @@ func forEachInfoFile( t := matched[i].ID.BlockStart volume := matched[i].ID.VolumeIndex + isLegacy, err := isFirstVolumeLegacy(dir, t, checkpointFileSuffix) + if err != nil { + continue + } var ( checkpointFilePath string digestsFilePath string @@ -606,9 +646,9 @@ func forEachInfoFile( case persist.FileSetFlushType: switch args.contentType { case persist.FileSetDataContentType: - checkpointFilePath = filesetPathFromTimeAndIndex(dir, t, volume, checkpointFileSuffix) - digestsFilePath = filesetPathFromTimeAndIndex(dir, t, volume, digestFileSuffix) - infoFilePath = filesetPathFromTimeAndIndex(dir, t, volume, infoFileSuffix) + checkpointFilePath = dataFilesetPathFromTimeAndIndex(dir, t, volume, checkpointFileSuffix, isLegacy) + digestsFilePath = dataFilesetPathFromTimeAndIndex(dir, t, volume, digestFileSuffix, isLegacy) + infoFilePath = dataFilesetPathFromTimeAndIndex(dir, t, volume, infoFileSuffix, isLegacy) case persist.FileSetIndexContentType: checkpointFilePath = filesetPathFromTimeAndIndex(dir, t, volume, checkpointFileSuffix) digestsFilePath = filesetPathFromTimeAndIndex(dir, t, volume, digestFileSuffix) @@ -619,21 +659,8 @@ func forEachInfoFile( digestsFilePath = filesetPathFromTimeAndIndex(dir, t, volume, digestFileSuffix) infoFilePath = filesetPathFromTimeAndIndex(dir, t, volume, infoFileSuffix) } - checkpointExists, err := CompleteCheckpointFileExists(checkpointFilePath) - if err != nil { - continue - } - if !checkpointExists { - continue - } - checkpointFd, err := os.Open(checkpointFilePath) - if err != nil { - continue - } - // Read digest of digests from the checkpoint file - expectedDigestOfDigest, err := digestBuf.ReadDigestFromFile(checkpointFd) - checkpointFd.Close() + expectedDigestOfDigest, err := readCheckpointFile(checkpointFilePath, digestBuf) if err != nil { continue } @@ -896,8 +923,8 @@ func FileSetAt(filePathPrefix string, namespace ident.ID, shard uint32, blockSta } matched.sortByTimeAndVolumeIndexAscending() - for i, fileset := range matched { - if fileset.ID.BlockStart.Equal(blockStart) { + for _, fileset := range matched { + if fileset.ID.BlockStart.Equal(blockStart) && fileset.ID.VolumeIndex == volume { if !fileset.HasCompleteCheckpointFile() { continue } @@ -950,8 +977,8 @@ func DeleteFileSetAt(filePathPrefix string, namespace ident.ID, shard uint32, bl return DeleteFiles(fileset.AbsoluteFilepaths) } -// DataFileSetsBefore returns all the flush data fileset files whose timestamps are earlier than a given time. -func DataFileSetsBefore(filePathPrefix string, namespace ident.ID, shard uint32, t time.Time) ([]string, error) { +// DataFilePathsBefore returns all the flush data fileset paths whose timestamps are earlier than a given time. 
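+//
+// A typical caller gathers the expired paths and then removes them, e.g.
+// (cutoff is hypothetical):
+//
+//	paths, err := DataFilePathsBefore(filePathPrefix, nsID, shard, cutoff)
+//	if err != nil {
+//		return err
+//	}
+//	return DeleteFiles(paths)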
+func DataFilePathsBefore(filePathPrefix string, namespace ident.ID, shard uint32, t time.Time) ([]string, error) { matched, err := filesetFiles(filesetFilesSelector{ fileSetType: persist.FileSetFlushType, contentType: persist.FileSetDataContentType, @@ -966,8 +993,8 @@ func DataFileSetsBefore(filePathPrefix string, namespace ident.ID, shard uint32, return FilesBefore(matched.Filepaths(), t) } -// IndexFileSetsBefore returns all the flush index fileset files whose timestamps are earlier than a given time. -func IndexFileSetsBefore(filePathPrefix string, namespace ident.ID, t time.Time) ([]string, error) { +// IndexFilePathsBefore returns all the flush index fileset paths whose timestamps are earlier than a given time. +func IndexFilePathsBefore(filePathPrefix string, namespace ident.ID, t time.Time) ([]string, error) { matched, err := filesetFiles(filesetFilesSelector{ fileSetType: persist.FileSetFlushType, contentType: persist.FileSetIndexContentType, @@ -1267,41 +1294,49 @@ func CommitLogsDirPath(prefix string) string { // DataFileSetExistsAt determines whether data fileset files exist for the given namespace, shard, and block start. func DataFileSetExistsAt(filePathPrefix string, namespace ident.ID, shard uint32, blockStart time.Time) (bool, error) { - shardDir := ShardDataDirPath(filePathPrefix, namespace, shard) - checkpointPath := filesetPathFromTime(shardDir, blockStart, checkpointFileSuffix) - return CompleteCheckpointFileExists(checkpointPath) -} - -// SnapshotFileSetExistsAt determines whether snapshot fileset files exist for the given namespace, shard, and block start time. -func SnapshotFileSetExistsAt(prefix string, namespace ident.ID, shard uint32, blockStart time.Time) (bool, error) { - snapshotFiles, err := SnapshotFiles(prefix, namespace, shard) + dataFiles, err := DataFiles(filePathPrefix, namespace, shard) if err != nil { return false, err } - latest, ok := snapshotFiles.LatestVolumeForBlock(blockStart) + _, ok := dataFiles.LatestVolumeForBlock(blockStart) if !ok { return false, nil } - return latest.HasCompleteCheckpointFile(), nil + return true, nil } -// NextDataFileSetVolumeIndex returns the next data file set index for a -// given namespace/blockStart combination. -func NextDataFileSetVolumeIndex(filePathPrefix string, namespace ident.ID, shard uint32, blockStart time.Time) (int, error) { - files, err := DataFiles(filePathPrefix, namespace, shard) +// DataFileSetExists determines whether data fileset files exist for the given +// namespace, shard, block start, and volume. +func DataFileSetExists( + filePathPrefix string, + namespace ident.ID, + shard uint32, + blockStart time.Time, + volume int, +) (bool, error) { + dataFiles, err := DataFiles(filePathPrefix, namespace, shard) if err != nil { - return 0, err + return false, err } - latestFile, ok := files.LatestVolumeForBlock(blockStart) + return dataFiles.VolumeExistsForBlock(blockStart, volume), nil +} + +// SnapshotFileSetExistsAt determines whether snapshot fileset files exist for the given namespace, shard, and block start time. +func SnapshotFileSetExistsAt(prefix string, namespace ident.ID, shard uint32, blockStart time.Time) (bool, error) { + snapshotFiles, err := SnapshotFiles(prefix, namespace, shard) + if err != nil { + return false, err + } + + _, ok := snapshotFiles.LatestVolumeForBlock(blockStart) if !ok { - // There isn't a fileset for this block yet; start at 0. 
-		return 0, nil
+		return false, nil
 	}
 
-	return latestFile.ID.VolumeIndex + 1, nil
+	return true, nil
 }
 
 // NextSnapshotMetadataFileIndex returns the next snapshot metadata file index.
@@ -1451,15 +1486,45 @@ func filesetPathFromTime(prefix string, t time.Time, suffix string) string {
 }
 
 func filesetPathFromTimeAndIndex(prefix string, t time.Time, index int, suffix string) string {
-	var newSuffix string
-	if index == unindexedFilesetIndex {
-		newSuffix = suffix
-	} else {
-		newSuffix = fmt.Sprintf("%d%s%s", index, separator, suffix)
-	}
+	newSuffix := fmt.Sprintf("%d%s%s", index, separator, suffix)
 	return path.Join(prefix, filesetFileForTime(t, newSuffix))
 }
 
+// isFirstVolumeLegacy returns whether the first volume of the provided type is
+// legacy, i.e. does not have a volume index in its filename. It only checks for
+// the existence of the legacy and non-legacy versions of the file, and returns
+// an error if neither exists.
+func isFirstVolumeLegacy(prefix string, t time.Time, suffix string) (bool, error) {
+	path := filesetPathFromTimeAndIndex(prefix, t, 0, suffix)
+	_, err := os.Stat(path)
+	if err == nil {
+		return false, nil
+	}
+
+	legacyPath := filesetPathFromTime(prefix, t, suffix)
+	_, err = os.Stat(legacyPath)
+	if err == nil {
+		return true, nil
+	}
+
+	return false, fmt.Errorf("first volume does not exist for prefix: %s, time: %v, suffix: %s",
+		prefix, t, suffix)
+}
+
+func dataFilesetPathFromTimeAndIndex(
+	prefix string,
+	t time.Time,
+	index int,
+	suffix string,
+	isLegacy bool,
+) string {
+	if isLegacy {
+		return filesetPathFromTime(prefix, t, suffix)
+	}
+
+	return filesetPathFromTimeAndIndex(prefix, t, index, suffix)
+}
+
 func filesetIndexSegmentFileSuffixFromTime(
 	t time.Time,
 	segmentIndex int,
diff --git a/src/dbnode/persist/fs/files_test.go b/src/dbnode/persist/fs/files_test.go
index ac8ca63159..9837cca30d 100644
--- a/src/dbnode/persist/fs/files_test.go
+++ b/src/dbnode/persist/fs/files_test.go
@@ -312,15 +312,15 @@ func TestTimeAndVolumeIndexFromDataFileSetFilename(t *testing.T) {
 	require.Equal(t, exp.t, ts)
 	require.Equal(t, exp.i, i)
 	require.NoError(t, err)
-	require.Equal(t, filesetPathFromTimeAndIndex("foo/bar", exp.t, exp.i, "data"), validName)
+	require.Equal(t, dataFilesetPathFromTimeAndIndex("foo/bar", exp.t, exp.i, "data", false), validName)
 
 	unindexedName := "foo/bar/fileset-21234567890-data.db"
 	ts, i, err = TimeAndVolumeIndexFromDataFileSetFilename(unindexedName)
-	exp = expected{time.Unix(0, 21234567890), -1}
+	exp = expected{time.Unix(0, 21234567890), 0}
 	require.Equal(t, exp.t, ts)
 	require.Equal(t, exp.i, i)
 	require.NoError(t, err)
-	require.Equal(t, filesetPathFromTimeAndIndex("foo/bar", exp.t, exp.i, "data"), unindexedName)
+	require.Equal(t, dataFilesetPathFromTimeAndIndex("foo/bar", exp.t, exp.i, "data", true), unindexedName)
 }
 
 func TestSnapshotMetadataFilePathFromIdentifierRoundTrip(t *testing.T) {
@@ -475,7 +475,7 @@ func TestFileSetFilesBefore(t *testing.T) {
 	cutoffIter := 8
 	cutoff := time.Unix(0, int64(cutoffIter))
 
-	res, err := DataFileSetsBefore(dir, testNs1ID, shard, cutoff)
+	res, err := DataFilePathsBefore(dir, testNs1ID, shard, cutoff)
 	require.NoError(t, err)
 	require.Equal(t, cutoffIter, len(res))
 
@@ -494,7 +494,7 @@ func TestFileSetAt(t *testing.T) {
 
 	for i := 0; i < numIters; i++ {
 		timestamp := time.Unix(0, int64(i))
-		res, ok, err := FileSetAt(dir, testNs1ID, shard, timestamp)
+		res, ok, err := FileSetAt(dir, testNs1ID, shard, timestamp, 0)
 		require.NoError(t, err)
 		require.True(t, ok)
 		require.Equal(t, timestamp, res.ID.BlockStart)
@@ 
-509,7 +509,7 @@ func TestFileSetAtIgnoresWithoutCheckpoint(t *testing.T) { for i := 0; i < numIters; i++ { timestamp := time.Unix(0, int64(i)) - _, ok, err := FileSetAt(dir, testNs1ID, shard, timestamp) + _, ok, err := FileSetAt(dir, testNs1ID, shard, timestamp, 0) require.NoError(t, err) require.False(t, ok) } @@ -523,15 +523,15 @@ func TestDeleteFileSetAt(t *testing.T) { for i := 0; i < numIters; i++ { timestamp := time.Unix(0, int64(i)) - res, ok, err := FileSetAt(dir, testNs1ID, shard, timestamp) + res, ok, err := FileSetAt(dir, testNs1ID, shard, timestamp, 0) require.NoError(t, err) require.True(t, ok) require.Equal(t, timestamp, res.ID.BlockStart) - err = DeleteFileSetAt(dir, testNs1ID, shard, timestamp) + err = DeleteFileSetAt(dir, testNs1ID, shard, timestamp, 0) require.NoError(t, err) - res, ok, err = FileSetAt(dir, testNs1ID, shard, timestamp) + res, ok, err = FileSetAt(dir, testNs1ID, shard, timestamp, 0) require.NoError(t, err) require.False(t, ok) } @@ -543,7 +543,7 @@ func TestFileSetAtNotExist(t *testing.T) { defer os.RemoveAll(dir) timestamp := time.Unix(0, 0) - _, ok, err := FileSetAt(dir, testNs1ID, shard, timestamp) + _, ok, err := FileSetAt(dir, testNs1ID, shard, timestamp, 0) require.NoError(t, err) require.False(t, ok) } @@ -786,34 +786,6 @@ func TestNextIndexFileSetVolumeIndex(t *testing.T) { } } -func TestNextDataFileSetVolumeIndex(t *testing.T) { - // Make empty directory - shard := uint32(0) - dir := createTempDir(t) - dataDir := ShardDataDirPath(dir, testNs1ID, shard) - require.NoError(t, os.MkdirAll(dataDir, 0755)) - defer os.RemoveAll(dataDir) - - blockStart := time.Now().Truncate(time.Hour) - - // Check increments properly - curr := -1 - for i := 0; i <= 10; i++ { - index, err := NextDataFileSetVolumeIndex(dir, testNs1ID, shard, blockStart) - require.NoError(t, err) - require.Equal(t, curr+1, index) - curr = index - - p := filesetPathFromTimeAndIndex(dataDir, blockStart, index, checkpointFileSuffix) - - digestBuf := digest.NewBuffer() - digestBuf.WriteDigest(digest.Checksum([]byte("bar"))) - - err = ioutil.WriteFile(p, digestBuf, defaultNewFileMode) - require.NoError(t, err) - } -} - func TestMultipleForBlockStart(t *testing.T) { numSnapshots := 20 numSnapshotsPerBlock := 4 @@ -1101,7 +1073,7 @@ func TestIndexFileSetsBefore(t *testing.T) { } files.create(t, dir) - results, err := IndexFileSetsBefore(dir, ns1, timeFor(3)) + results, err := IndexFilePathsBefore(dir, ns1, timeFor(3)) require.NoError(t, err) require.Len(t, results, 3) for _, res := range results { diff --git a/src/dbnode/persist/fs/merger.go b/src/dbnode/persist/fs/merger.go index e6b473ae26..6b3a8e5c95 100644 --- a/src/dbnode/persist/fs/merger.go +++ b/src/dbnode/persist/fs/merger.go @@ -128,6 +128,8 @@ func (m *merger) Merge( NamespaceMetadata: nsMd, Shard: shard, BlockStart: startTime, + VolumeIndex: fileID.VolumeIndex + 1, + FileSetType: persist.FileSetFlushType, DeleteIfExists: false, } prepared, err := flushPreparer.PrepareData(prepareOpts) diff --git a/src/dbnode/persist/fs/persist_manager.go b/src/dbnode/persist/fs/persist_manager.go index 118df8ad9c..d7ce286125 100644 --- a/src/dbnode/persist/fs/persist_manager.go +++ b/src/dbnode/persist/fs/persist_manager.go @@ -416,7 +416,7 @@ func (pm *persistManager) PrepareData(opts persist.DataPrepareOptions) (persist. 
return prepared, errPersistManagerCannotPrepareDataNotPersisting } - exists, err := pm.dataFilesetExistsAt(opts) + exists, err := pm.dataFilesetExists(opts) if err != nil { return prepared, err } @@ -424,12 +424,9 @@ func (pm *persistManager) PrepareData(opts persist.DataPrepareOptions) (persist. var volumeIndex int switch opts.FileSetType { case persist.FileSetFlushType: - // Need to work out the volume index for the next data file. - volumeIndex, err = NextDataFileSetVolumeIndex(pm.opts.FilePathPrefix(), - nsMetadata.ID(), shard, blockStart) - if err != nil { - return prepared, err - } + // Use the volume index passed in. This ensures that this index is the + // same as the flush index. + volumeIndex = opts.VolumeIndex case persist.FileSetSnapshotType: // Need to work out the volume index for the next snapshot. volumeIndex, err = NextSnapshotFileSetVolumeIndex(pm.opts.FilePathPrefix(), @@ -606,20 +603,25 @@ func (pm *persistManager) doneShared() error { return nil } -func (pm *persistManager) dataFilesetExistsAt(prepareOpts persist.DataPrepareOptions) (bool, error) { +func (pm *persistManager) dataFilesetExists(prepareOpts persist.DataPrepareOptions) (bool, error) { var ( - blockStart = prepareOpts.BlockStart - shard = prepareOpts.Shard nsID = prepareOpts.NamespaceMetadata.ID() + shard = prepareOpts.Shard + blockStart = prepareOpts.BlockStart + volume = prepareOpts.VolumeIndex ) switch prepareOpts.FileSetType { case persist.FileSetSnapshotType: - // Snapshot files are indexed (multiple per block-start), so checking if the file - // already exist doesn't make much sense + // Checking if a snapshot file exists for a block start doesn't make + // sense in this context because the logic for creating new snapshot + // files does not use the volume index provided in the prepareOpts. + // Instead, the new volume index is determined by looking at what files + // exist on disk. This means that there can never be a conflict when + // trying to write new snapshot files. 
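		// Illustrative sketch of that snapshot flow (the argument list is
		// assumed from context, as in PrepareData above):
		//
		//	volumeIndex, err = NextSnapshotFileSetVolumeIndex(
		//		pm.opts.FilePathPrefix(), nsID, shard, blockStart)
		//
		// i.e. one past the highest snapshot volume already on disk, so a
		// new snapshot can never collide with an existing file.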
		return false, nil
	case persist.FileSetFlushType:
-		return DataFileSetExistsAt(pm.filePathPrefix, nsID, shard, blockStart)
+		return DataFileSetExists(pm.filePathPrefix, nsID, shard, blockStart, volume)
	default:
		return false, fmt.Errorf(
			"unable to determine if fileset exists in persist manager for fileset type: %s",
diff --git a/src/dbnode/persist/fs/read.go b/src/dbnode/persist/fs/read.go
index 9f0ceab00d..863fb498a6 100644
--- a/src/dbnode/persist/fs/read.go
+++ b/src/dbnode/persist/fs/read.go
@@ -156,12 +156,17 @@ func (r *reader) Open(opts DataReaderOpenOptions) error {
 		dataFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, dataFileSuffix)
 	case persist.FileSetFlushType:
 		shardDir = ShardDataDirPath(r.filePathPrefix, namespace, shard)
+		// Determine whether this fileset uses the legacy (unindexed) naming
+		// scheme; this must happen after shardDir has been assigned so that
+		// the check stats files in the right directory.
+		isLegacy, err := isFirstVolumeLegacy(shardDir, blockStart, checkpointFileSuffix)
+		if err != nil {
+			return err
+		}
-		checkpointFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, checkpointFileSuffix)
-		infoFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, infoFileSuffix)
-		digestFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, digestFileSuffix)
-		bloomFilterFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, bloomFilterFileSuffix)
-		indexFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, indexFileSuffix)
-		dataFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, dataFileSuffix)
+		checkpointFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, checkpointFileSuffix, isLegacy)
+		infoFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, infoFileSuffix, isLegacy)
+		digestFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, digestFileSuffix, isLegacy)
+		bloomFilterFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, bloomFilterFileSuffix, isLegacy)
+		indexFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, indexFileSuffix, isLegacy)
+		dataFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, dataFileSuffix, isLegacy)
 	default:
 		return fmt.Errorf("unable to open reader with fileset type: %s", opts.FileSetType)
 	}
@@ -500,27 +505,6 @@ func (r *reader) Close() error {
 	return multiErr.FinalError()
 }
 
-func readCheckpointFile(filePath string, digestBuf digest.Buffer) (uint32, error) {
-	exists, err := CompleteCheckpointFileExists(filePath)
-	if err != nil {
-		return 0, err
-	}
-	if !exists {
-		return 0, ErrCheckpointFileNotFound
-	}
-	fd, err := os.Open(filePath)
-	if err != nil {
-		return 0, err
-	}
-	defer fd.Close()
-	digest, err := digestBuf.ReadDigestFromFile(fd)
-	if err != nil {
-		return 0, err
-	}
-
-	return digest, nil
-}
-
 // indexEntriesByOffsetAsc implements sort.Sort
 
 type indexEntriesByOffsetAsc []schema.IndexEntry
diff --git a/src/dbnode/persist/fs/retriever.go b/src/dbnode/persist/fs/retriever.go
index 3c5bfbbf42..0bfc295cbd 100644
--- a/src/dbnode/persist/fs/retriever.go
+++ b/src/dbnode/persist/fs/retriever.go
@@ -271,9 +271,7 @@ func (r *blockRetriever) fetchBatch(
 	seekerResources ReusableSeekerResources,
 ) {
 	// Resolve the seeker from the seeker mgr
-	// TODO(juchan): actually get the correct volume.
- vol := 0 - seeker, err := seekerMgr.Borrow(shard, blockStart, vol) + seeker, err := seekerMgr.Borrow(shard, blockStart) if err != nil { for _, req := range reqs { req.onError(err) @@ -368,8 +366,7 @@ func (r *blockRetriever) fetchBatch( }(req) } - // TODO(juchan): actually get the correct volume. - err = seekerMgr.Return(shard, blockStart, vol, seeker) + err = seekerMgr.Return(shard, blockStart, seeker) if err != nil { r.logger.Error("err returning seeker for shard", zap.Uint32("shard", shard), @@ -411,9 +408,7 @@ func (r *blockRetriever) Stream( } r.RUnlock() - // TODO(juchan): actually get the correct volume. - vol := 0 - bloomFilter, err := r.seekerMgr.ConcurrentIDBloomFilter(shard, startTime, vol) + bloomFilter, err := r.seekerMgr.ConcurrentIDBloomFilter(shard, startTime) if err != nil { return xio.EmptyBlockReader, err } diff --git a/src/dbnode/persist/fs/seek.go b/src/dbnode/persist/fs/seek.go index 1f2c3495c6..7dc0561dfc 100644 --- a/src/dbnode/persist/fs/seek.go +++ b/src/dbnode/persist/fs/seek.go @@ -145,7 +145,6 @@ func (s *seeker) Open( namespace ident.ID, shard uint32, blockStart time.Time, - volume int, resources ReusableSeekerResources, ) error { if s.isClone { @@ -157,12 +156,12 @@ func (s *seeker) Open( // Open necessary files if err := openFiles(os.Open, map[string]**os.File{ - filesetPathFromTimeAndIndex(shardDir, blockStart, volume, infoFileSuffix): &infoFd, - filesetPathFromTimeAndIndex(shardDir, blockStart, volume, indexFileSuffix): &s.indexFd, - filesetPathFromTimeAndIndex(shardDir, blockStart, volume, dataFileSuffix): &s.dataFd, - filesetPathFromTimeAndIndex(shardDir, blockStart, volume, digestFileSuffix): &digestFd, - filesetPathFromTimeAndIndex(shardDir, blockStart, volume, bloomFilterFileSuffix): &bloomFilterFd, - filesetPathFromTimeAndIndex(shardDir, blockStart, volume, summariesFileSuffix): &summariesFd, + filesetPathFromTime(shardDir, blockStart, infoFileSuffix): &infoFd, + filesetPathFromTime(shardDir, blockStart, indexFileSuffix): &s.indexFd, + filesetPathFromTime(shardDir, blockStart, dataFileSuffix): &s.dataFd, + filesetPathFromTime(shardDir, blockStart, digestFileSuffix): &digestFd, + filesetPathFromTime(shardDir, blockStart, bloomFilterFileSuffix): &bloomFilterFd, + filesetPathFromTime(shardDir, blockStart, summariesFileSuffix): &summariesFd, }); err != nil { return err } diff --git a/src/dbnode/persist/fs/seek_manager.go b/src/dbnode/persist/fs/seek_manager.go index f6bf352c28..85b3650da6 100644 --- a/src/dbnode/persist/fs/seek_manager.go +++ b/src/dbnode/persist/fs/seek_manager.go @@ -52,12 +52,11 @@ var ( errReturnedUnmanagedSeeker = errors.New("cant return a seeker not managed by the seeker manager") ) -type openAnyUnopenSeekersFn func(*seekersByTimeAndVolume) error +type openAnyUnopenSeekersFn func(*seekersByTime) error type newOpenSeekerFn func( shard uint32, blockStart time.Time, - volume int, ) (DataFileSetSeeker, error) type seekerManagerStatus int @@ -80,7 +79,7 @@ type seekerManager struct { filePathPrefix string status seekerManagerStatus - seekersByShardIdx []*seekersByTimeAndVolume + seekersByShardIdx []*seekersByTime namespace ident.ID namespaceMetadata namespace.Metadata unreadBuf seekerUnreadBuf @@ -97,9 +96,8 @@ type seekerUnreadBuf struct { value []byte } -// seekersAndBloom contains a slice of seekers for a given shard/blockStart/volume. -// One of the seeker will be the original, and the others will be clones. 
The
-// bloomFilter field is a reference to the underlying bloom filter that the
+// seekersAndBloom contains a slice of seekers for a given shard/blockStart. One of the seekers will be the original,
+// and the others will be clones. The bloomFilter field is a reference to the underlying bloom filter that the
 // original seeker and all of its clones share.
 type seekersAndBloom struct {
 	wg          *sync.WaitGroup
@@ -113,35 +111,11 @@ type borrowableSeeker struct {
 	isBorrowed bool
 }
 
-type seekersByTimeAndVolume struct {
+type seekersByTime struct {
 	sync.RWMutex
 	shard    uint32
 	accessed bool
-	seekers  map[xtime.UnixNano]map[int]seekersAndBloom
-}
-
-func (s *seekersByTimeAndVolume) remove(blockStart xtime.UnixNano, volume int) error {
-	s.Lock()
-	defer s.Unlock()
-	seekersByVolume, ok := s.seekers[blockStart]
-	if !ok {
-		return fmt.Errorf("seekers for blockStart %v not found", blockStart)
-	}
-
-	seekers, ok := seekersByVolume[volume]
-	if !ok {
-		return fmt.Errorf("seekers for blockStart %v, volume %d not found", blockStart)
-	}
-
-	// Delete seekers for specific volume.
-	delete(seekersByVolume, volume)
-	// If all seekers for all volumes for a blockStart has been removed, remove
-	// the blockStart from the map.
-	if len(seekersByVolume) == 0 {
-		delete(s.seekers, blockStart)
-	}
-
-	return nil
+	seekers  map[xtime.UnixNano]seekersAndBloom
 }
 
 type seekerManagerPendingClose struct {
@@ -209,14 +183,14 @@ func (m *seekerManager) CacheShardIndices(shards []uint32) error {
 	multiErr := xerrors.NewMultiError()
 
 	for _, shard := range shards {
-		byTimeAndVolume := m.seekersByTimeAndVolume(shard)
+		byTime := m.seekersByTime(shard)
 
-		byTimeAndVolume.Lock()
+		byTime.Lock()
 		// Track accessed to precache in open/close loop
-		byTimeAndVolume.accessed = true
-		byTimeAndVolume.Unlock()
+		byTime.accessed = true
+		byTime.Unlock()
 
-		if err := m.openAnyUnopenSeekersFn(byTimeAndVolume); err != nil {
+		if err := m.openAnyUnopenSeekersFn(byTime); err != nil {
 			multiErr = multiErr.Add(err)
 		}
 	}
@@ -224,43 +198,35 @@ func (m *seekerManager) CacheShardIndices(shards []uint32) error {
 	return multiErr.FinalError()
 }
 
-func (m *seekerManager) ConcurrentIDBloomFilter(
-	shard uint32,
-	start time.Time,
-	volume int,
-) (*ManagedConcurrentBloomFilter, error) {
-	byTimeAndVolume := m.seekersByTimeAndVolume(shard)
+func (m *seekerManager) ConcurrentIDBloomFilter(shard uint32, start time.Time) (*ManagedConcurrentBloomFilter, error) {
+	byTime := m.seekersByTime(shard)
 
 	// Try fast RLock() first
-	byTimeAndVolume.RLock()
+	byTime.RLock()
 	startNano := xtime.ToUnixNano(start)
-	seekersAndBloom, ok := byTimeAndVolume.seekers[startNano][volume]
-	byTimeAndVolume.RUnlock()
+	seekersAndBloom, ok := byTime.seekers[startNano]
+	byTime.RUnlock()
 
 	if ok && seekersAndBloom.wg == nil {
 		return seekersAndBloom.bloomFilter, nil
 	}
 
-	byTimeAndVolume.Lock()
-	seekersAndBloom, err := m.getOrOpenSeekersWithLock(startNano, volume, byTimeAndVolume)
-	byTimeAndVolume.Unlock()
+	byTime.Lock()
+	seekersAndBloom, err := m.getOrOpenSeekersWithLock(startNano, byTime)
+	byTime.Unlock()
 	return seekersAndBloom.bloomFilter, err
 }
 
-func (m *seekerManager) Borrow(
-	shard uint32,
-	start time.Time,
-	volume int,
-) (ConcurrentDataFileSetSeeker, error) {
-	byTimeAndVolume := m.seekersByTimeAndVolume(shard)
+func (m *seekerManager) Borrow(shard uint32, start time.Time) (ConcurrentDataFileSetSeeker, error) {
+	byTime := m.seekersByTime(shard)
 
-	byTimeAndVolume.Lock()
-	defer byTimeAndVolume.Unlock()
+	byTime.Lock()
+	defer byTime.Unlock()
 	// Track accessed to precache in
open/close loop
-	byTimeAndVolume.accessed = true
+	byTime.accessed = true
 
 	startNano := xtime.ToUnixNano(start)
-	seekersAndBloom, err := m.getOrOpenSeekersWithLock(startNano, volume, byTimeAndVolume)
+	seekersAndBloom, err := m.getOrOpenSeekersWithLock(startNano, byTime)
 	if err != nil {
 		return nil, err
 	}
@@ -286,19 +252,14 @@ func (m *seekerManager) Borrow(
 	return availableSeeker.seeker, nil
 }
 
-func (m *seekerManager) Return(
-	shard uint32,
-	start time.Time,
-	volume int,
-	seeker ConcurrentDataFileSetSeeker,
-) error {
-	byTimeAndVolume := m.seekersByTimeAndVolume(shard)
+func (m *seekerManager) Return(shard uint32, start time.Time, seeker ConcurrentDataFileSetSeeker) error {
+	byTime := m.seekersByTime(shard)
 
-	byTimeAndVolume.Lock()
-	defer byTimeAndVolume.Unlock()
+	byTime.Lock()
+	defer byTime.Unlock()
 
 	startNano := xtime.ToUnixNano(start)
-	seekersAndBloom, ok := byTimeAndVolume.seekers[startNano][volume]
+	seekersAndBloom, ok := byTime.seekers[startNano]
 	// Should never happen - This either means that the caller (DataBlockRetriever) is trying to return seekers
 	// that it never requested, OR it's trying to return seekers after the openCloseLoop has already
 	// determined that they were all no longer in use and safe to close. Either way it indicates there is
 	// a bug in the code.
@@ -337,7 +298,7 @@ func (m *seekerManager) UpdateOpenLease(
 }
 
 // getOrOpenSeekersWithLock checks if the seekers are already open / initialized. If they are, then it
-// returns them. Then, it checks if a different goroutine is in the process of opening them , if so it
+// returns them. Then, it checks if a different goroutine is in the process of opening them; if so, it
 // registers itself as waiting until the other goroutine completes. If neither of those conditions occur,
 // then it begins the process of opening the seekers itself. First, it creates a waitgroup that other
 // goroutines can use so that they're notified when the seekers are open. This is useful because it allows
 // waiters to block without repeatedly acquiring and releasing the lock
 // of the seekersByTime struct during an I/O heavy workload. Once the wg is created, we relinquish the lock,
 // open the Seeker (I/O heavy), re-acquire the lock (so that the waiting goroutines don't get it before us),
 // and then notify the waiting goroutines that we've finished.
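//
// An illustrative timeline of that handoff for two goroutines (a sketch
// only, not part of this change):
//
//	G1: no open seekers -> stores seekersAndBloom{wg} -> Unlock -> opens files (slow I/O)
//	G2: sees wg != nil  -> Unlock -> wg.Wait()  (blocks without holding the lock)
//	G1: Lock -> stores the opened seekers -> wg.Done()
//	G2: wakes -> Lock -> retries the lookup and finds the open seekers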
-func (m *seekerManager) getOrOpenSeekersWithLock(
-	start xtime.UnixNano,
-	volume int,
-	byTimeAndVolume *seekersByTimeAndVolume,
-) (seekersAndBloom, error) {
-	seekers, ok := byTimeAndVolume.seekers[start][volume]
+func (m *seekerManager) getOrOpenSeekersWithLock(start xtime.UnixNano, byTime *seekersByTime) (seekersAndBloom, error) {
+	seekers, ok := byTime.seekers[start]
 	if ok && seekers.wg == nil {
 		// Seekers are already open
 		return seekers, nil
@@ -358,11 +315,11 @@ func (m *seekerManager) getOrOpenSeekersWithLock(
 
 	if seekers.wg != nil {
 		// Seekers are being initialized / opened, wait for that to complete
-		byTimeAndVolume.Unlock()
+		byTime.Unlock()
 		seekers.wg.Wait()
-		byTimeAndVolume.Lock()
+		byTime.Lock()
 		// Need to do the lookup again recursively to see the new state
-		return m.getOrOpenSeekersWithLock(start, volume, byTimeAndVolume)
+		return m.getOrOpenSeekersWithLock(start, byTime)
 	}
 
 	// Seekers need to be opened
@@ -373,21 +330,21 @@ func (m *seekerManager) getOrOpenSeekersWithLock(
 	wg := &sync.WaitGroup{}
 	seekers.wg = wg
 	seekers.wg.Add(1)
-	byTimeAndVolume.seekers[start][volume] = seekers
-	byTimeAndVolume.Unlock()
+	byTime.seekers[start] = seekers
+	byTime.Unlock()
 	// Open first one - Do this outside the context of the lock because opening
 	// a seeker can be an expensive operation (validating index files)
-	seeker, err := m.newOpenSeekerFn(byTimeAndVolume.shard, start.ToTime(), volume)
+	seeker, err := m.newOpenSeekerFn(byTime.shard, start.ToTime())
 	// Immediately re-lock once the seeker is open regardless of errors because
 	// that's the contract of this function
-	byTimeAndVolume.Lock()
+	byTime.Lock()
 	// Call done after we re-acquire the lock so that callers who were waiting
 	// won't get the lock before us.
 	wg.Done()
 
 	if err != nil {
-		// Delete the seekersByTimeAndVolume struct so that the process can be restarted if necessary
-		byTimeAndVolume.remove(start, volume)
+		// Delete the seekersByTime struct so that the process can be restarted if necessary
+		delete(byTime.seekers, start)
 		return seekersAndBloom{}, err
 	}
 
@@ -403,7 +360,7 @@ func (m *seekerManager) getOrOpenSeekersWithLock(
 			multiErr = multiErr.Add(seeker.seeker.Close())
 		}
 		// Delete the seekersByTime struct so that the process can be restarted if necessary
-		byTimeAndVolume.remove(start, volume)
+		delete(byTime.seekers, start)
 		return seekersAndBloom{}, multiErr.FinalError()
 	}
 	borrowableSeekers = append(borrowableSeekers, borrowableSeeker{seeker: clone})
@@ -414,20 +371,20 @@ func (m *seekerManager) getOrOpenSeekersWithLock(
 	// Doesn't matter which seeker we pick to grab the bloom filter from, they all share the same underlying one.
 	// Use index 0 because it's guaranteed to be there.
seekers.bloomFilter = borrowableSeekers[0].seeker.ConcurrentIDBloomFilter() - byTimeAndVolume.seekers[start][volume] = seekers + byTime.seekers[start] = seekers return seekers, nil } -func (m *seekerManager) openAnyUnopenSeekers(byTimeAndVolume *seekersByTimeAndVolume) error { +func (m *seekerManager) openAnyUnopenSeekers(byTime *seekersByTime) error { start := m.earliestSeekableBlockStart() end := m.latestSeekableBlockStart() blockSize := m.namespaceMetadata.Options().RetentionOptions().BlockSize() multiErr := xerrors.NewMultiError() for t := start; !t.After(end); t = t.Add(blockSize) { - byTimeAndVolume.Lock() - _, err := m.getOrOpenSeekersWithLock(xtime.ToUnixNano(t), byTimeAndVolume) - byTimeAndVolume.Unlock() + byTime.Lock() + _, err := m.getOrOpenSeekersWithLock(xtime.ToUnixNano(t), byTime) + byTime.Unlock() if err != nil && err != errSeekerManagerFileSetNotFound { multiErr = multiErr.Add(err) } @@ -439,7 +396,6 @@ func (m *seekerManager) openAnyUnopenSeekers(byTimeAndVolume *seekersByTimeAndVo func (m *seekerManager) newOpenSeeker( shard uint32, blockStart time.Time, - volume int, ) (DataFileSetSeeker, error) { exists, err := DataFileSetExistsAt(m.filePathPrefix, m.namespace, shard, blockStart) if err != nil { @@ -495,12 +451,12 @@ func (m *seekerManager) newOpenSeeker( return seeker, nil } -func (m *seekerManager) seekersByTimeAndVolume(shard uint32) *seekersByTimeAndVolume { +func (m *seekerManager) seekersByTime(shard uint32) *seekersByTime { m.RLock() if int(shard) < len(m.seekersByShardIdx) { - byTimeAndVolume := m.seekersByShardIdx[shard] + byTime := m.seekersByShardIdx[shard] m.RUnlock() - return byTimeAndVolume + return byTime } m.RUnlock() @@ -509,27 +465,27 @@ func (m *seekerManager) seekersByTimeAndVolume(shard uint32) *seekersByTimeAndVo // Check if raced with another call to this method if int(shard) < len(m.seekersByShardIdx) { - byTimeAndVolume := m.seekersByShardIdx[shard] - return byTimeAndVolume + byTime := m.seekersByShardIdx[shard] + return byTime } - seekersByShardIdx := make([]*seekersByTimeAndVolume, shard+1) + seekersByShardIdx := make([]*seekersByTime, shard+1) for i := range seekersByShardIdx { if i < len(m.seekersByShardIdx) { seekersByShardIdx[i] = m.seekersByShardIdx[i] continue } - seekersByShardIdx[i] = &seekersByTimeAndVolume{ + seekersByShardIdx[i] = &seekersByTime{ shard: uint32(i), - seekers: make(map[xtime.UnixNano]map[int]seekersAndBloom), + seekers: make(map[xtime.UnixNano]seekersAndBloom), } } m.seekersByShardIdx = seekersByShardIdx - byTimeAndVolume := m.seekersByShardIdx[shard] + byTime := m.seekersByShardIdx[shard] - return byTimeAndVolume + return byTime } func (m *seekerManager) Close() error { @@ -542,20 +498,18 @@ func (m *seekerManager) Close() error { // Make sure all seekers are returned before allowing the SeekerManager to be closed. // Actual cleanup of the seekers themselves will be handled by the openCloseLoop. 
- for _, byTimeAndVolume := range m.seekersByShardIdx { - byTimeAndVolume.Lock() - for _, seekersByTimeAndVolume := range byTimeAndVolume.seekers { - for _, seekersByVolume := range seekersByTimeAndVolume { - for _, seeker := range seekersByVolume.seekers { - if seeker.isBorrowed { - byTimeAndVolume.Unlock() - m.Unlock() - return errCantCloseSeekerManagerWhileSeekersAreBorrowed - } + for _, byTime := range m.seekersByShardIdx { + byTime.Lock() + for _, seekersByTime := range byTime.seekers { + for _, seeker := range seekersByTime.seekers { + if seeker.isBorrowed { + byTime.Unlock() + m.Unlock() + return errCantCloseSeekerManagerWhileSeekersAreBorrowed } } } - byTimeAndVolume.Unlock() + byTime.Unlock() } m.status = seekerManagerClosed @@ -592,7 +546,7 @@ func (m *seekerManager) latestSeekableBlockStart() time.Time { func (m *seekerManager) openCloseLoop() { var ( - shouldTryOpen []*seekersByTimeAndVolume + shouldTryOpen []*seekersByTime shouldClose []seekerManagerPendingClose closing []borrowableSeeker ) @@ -621,26 +575,26 @@ func (m *seekerManager) openCloseLoop() { break } - for _, byTimeAndVolume := range m.seekersByShardIdx { - byTimeAndVolume.RLock() - accessed := byTimeAndVolume.accessed - byTimeAndVolume.RUnlock() + for _, byTime := range m.seekersByShardIdx { + byTime.RLock() + accessed := byTime.accessed + byTime.RUnlock() if !accessed { continue } - shouldTryOpen = append(shouldTryOpen, byTimeAndVolume) + shouldTryOpen = append(shouldTryOpen, byTime) } m.RUnlock() // Try opening any unopened times for accessed seekers - for _, byTimeAndVolume := range shouldTryOpen { - m.openAnyUnopenSeekersFn(byTimeAndVolume) + for _, byTime := range shouldTryOpen { + m.openAnyUnopenSeekersFn(byTime) } m.RLock() - for shard, byTimeAndVolume := range m.seekersByShardIdx { - byTimeAndVolume.RLock() - for blockStartNano := range byTimeAndVolume.seekers { + for shard, byTime := range m.seekersByShardIdx { + byTime.RLock() + for blockStartNano := range byTime.seekers { blockStart := blockStartNano.ToTime() if blockStart.Before(earliestSeekableBlockStart) { shouldClose = append(shouldClose, seekerManagerPendingClose{ @@ -649,32 +603,30 @@ func (m *seekerManager) openCloseLoop() { }) } } - byTimeAndVolume.RUnlock() + byTime.RUnlock() } if len(shouldClose) > 0 { for _, elem := range shouldClose { - byTimeAndVolume := m.seekersByShardIdx[elem.shard] + byTime := m.seekersByShardIdx[elem.shard] blockStartNano := xtime.ToUnixNano(elem.blockStart) - byTimeAndVolume.Lock() - byVolume := byTimeAndVolume.seekers[blockStartNano] - for vol, seekersAndBloom := range byVolume { - allSeekersAreReturned := true - for _, seeker := range seekersAndBloom.seekers { - if seeker.isBorrowed { - allSeekersAreReturned = false - break - } - } - // Never close seekers unless they've all been returned because - // some of them are clones of the original and can't be used once - // the parent is closed (because they share underlying resources) - if allSeekersAreReturned { - closing = append(closing, seekersAndBloom.seekers...) 
-					byTimeAndVolume.remove(xtime.ToUnixNano(elem.blockStart), vol)
+				byTime.Lock()
+				seekersAndBloom := byTime.seekers[blockStartNano]
+				allSeekersAreReturned := true
+				for _, seeker := range seekersAndBloom.seekers {
+					if seeker.isBorrowed {
+						allSeekersAreReturned = false
+						break
 					}
 				}
-				byTimeAndVolume.Unlock()
+				// Never close seekers unless they've all been returned because
+				// some of them are clones of the original and can't be used once
+				// the parent is closed (because they share underlying resources)
+				if allSeekersAreReturned {
+					closing = append(closing, seekersAndBloom.seekers...)
+					delete(byTime.seekers, blockStartNano)
+				}
+				byTime.Unlock()
 			}
 		}
 		m.RUnlock()
@@ -694,22 +646,20 @@ func (m *seekerManager) openCloseLoop() {
 
 	// Release all resources
 	m.Lock()
-	for _, byTimeAndVolume := range m.seekersByShardIdx {
-		byTimeAndVolume.Lock()
-		for _, seekersByTimeAndVolume := range byTimeAndVolume.seekers {
-			for _, seekersByVolume := range seekersByTimeAndVolume {
-				for _, seeker := range seekersByVolume.seekers {
-					// We don't need to check if the seeker is borrowed here because we don't allow the
-					// SeekerManager to be closed if any seekers are still outstanding.
-					err := seeker.seeker.Close()
-					if err != nil {
-						m.logger.Error("err closing seeker in SeekerManager at end of openCloseLoop", zap.Error(err))
-					}
+	for _, byTime := range m.seekersByShardIdx {
+		byTime.Lock()
+		for _, seekersByTime := range byTime.seekers {
+			for _, seeker := range seekersByTime.seekers {
+				// We don't need to check if the seeker is borrowed here because we don't allow the
+				// SeekerManager to be closed if any seekers are still outstanding.
+				err := seeker.seeker.Close()
+				if err != nil {
+					m.logger.Error("err closing seeker in SeekerManager at end of openCloseLoop", zap.Error(err))
				}
 			}
 		}
-		byTimeAndVolume.seekers = nil
-		byTimeAndVolume.Unlock()
+		byTime.seekers = nil
+		byTime.Unlock()
 	}
 	m.seekersByShardIdx = nil
 	m.Unlock()
diff --git a/src/dbnode/persist/fs/types.go b/src/dbnode/persist/fs/types.go
index 068f697746..e4514927e1 100644
--- a/src/dbnode/persist/fs/types.go
+++ b/src/dbnode/persist/fs/types.go
@@ -173,7 +173,6 @@ type DataFileSetSeeker interface {
 		namespace ident.ID,
 		shard uint32,
 		start time.Time,
-		volume int,
 		resources ReusableSeekerResources,
 	) error
 
@@ -239,14 +238,14 @@ type DataFileSetSeekerManager interface {
 	CacheShardIndices(shards []uint32) error
 
-	// Borrow returns an open seeker for a given shard, block start time, and volume.
-	Borrow(shard uint32, start time.Time, volume int) (ConcurrentDataFileSetSeeker, error)
+	// Borrow returns an open seeker for a given shard and block start time.
+	Borrow(shard uint32, start time.Time) (ConcurrentDataFileSetSeeker, error)
 
-	// Return returns an open seeker for a given shard, block start time, and volume.
-	Return(shard uint32, start time.Time, volume int, seeker ConcurrentDataFileSetSeeker) error
+	// Return gives a borrowed seeker back to the manager for a given shard and block start time.
+	Return(shard uint32, start time.Time, seeker ConcurrentDataFileSetSeeker) error
 
	// ConcurrentIDBloomFilter returns a concurrent ID bloom filter for a given
	// shard and block start time.
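	//
	// A typical (illustrative) use is a cheap negative check before
	// borrowing a seeker, e.g.:
	//
	//	bf, err := mgr.ConcurrentIDBloomFilter(shard, start)
	//	if err == nil && !bf.Test(id.Bytes()) {
	//		// ID is definitely absent from this fileset; skip the seek.
	//	}
	//
	// (Test is assumed here as the membership check; a bloom filter can
	// report false positives but never false negatives.)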
- ConcurrentIDBloomFilter(shard uint32, start time.Time, volume int) (*ManagedConcurrentBloomFilter, error) + ConcurrentIDBloomFilter(shard uint32, start time.Time) (*ManagedConcurrentBloomFilter, error) } // DataBlockRetriever provides a block retriever for TSDB file sets diff --git a/src/dbnode/persist/fs/write.go b/src/dbnode/persist/fs/write.go index 622ca8e1af..b2b3bd944b 100644 --- a/src/dbnode/persist/fs/write.go +++ b/src/dbnode/persist/fs/write.go @@ -191,13 +191,13 @@ func (w *writer) Open(opts DataWriterOpenOptions) error { return err } - w.checkpointFilePath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, checkpointFileSuffix) - infoFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, infoFileSuffix) - indexFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, indexFileSuffix) - summariesFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, summariesFileSuffix) - bloomFilterFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, bloomFilterFileSuffix) - dataFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, dataFileSuffix) - digestFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, digestFileSuffix) + w.checkpointFilePath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, checkpointFileSuffix, false) + infoFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, infoFileSuffix, false) + indexFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, indexFileSuffix, false) + summariesFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, summariesFileSuffix, false) + bloomFilterFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, bloomFilterFileSuffix, false) + dataFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, dataFileSuffix, false) + digestFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, nextVolumeIndex, digestFileSuffix, false) default: return fmt.Errorf("unable to open reader with fileset type: %s", opts.FileSetType) } diff --git a/src/dbnode/persist/types.go b/src/dbnode/persist/types.go index 4cd178f69f..4912883225 100644 --- a/src/dbnode/persist/types.go +++ b/src/dbnode/persist/types.go @@ -136,19 +136,16 @@ type DataPrepareOptions struct { NamespaceMetadata namespace.Metadata BlockStart time.Time Shard uint32 - FileSetType FileSetType - DeleteIfExists bool + // This volume index is only used when preparing for a flush fileset type. + // When opening a snapshot, the new volume index is determined by looking + // at what files exist on disk. + VolumeIndex int + FileSetType FileSetType + DeleteIfExists bool // Snapshot options are applicable to snapshots (index yes, data yes) Snapshot DataPrepareSnapshotOptions } -// DataPrepareVolumeOptions is the options struct for the prepare method that contains -// information specific to read/writing filesets that have multiple volumes (such as -// snapshots and index file sets). -type DataPrepareVolumeOptions struct { - VolumeIndex int -} - // IndexPrepareOptions is the options struct for the IndexFlush's Prepare method. 
// nolint: maligned type IndexPrepareOptions struct { diff --git a/src/dbnode/storage/bootstrap/bootstrapper/peers/source.go b/src/dbnode/storage/bootstrap/bootstrapper/peers/source.go index 79ff385f92..5903c0d38a 100644 --- a/src/dbnode/storage/bootstrap/bootstrapper/peers/source.go +++ b/src/dbnode/storage/bootstrap/bootstrapper/peers/source.go @@ -386,6 +386,14 @@ func (s *peersSource) flush( FileSetType: persistConfig.FileSetType, Shard: shard, BlockStart: start, + // When bootstrapping, the volume index will always be 0. However, + // if we want to be able to snapshot and flush while bootstrapping, + // this may not be the case, e.g. if a flush occurs before a + // bootstrap, then the bootstrap volume index will be >0. In order + // to support this, bootstrapping code will need to incorporate + // merging logic and flush version/volume index will need to be + // synchronized between processes. + VolumeIndex: 0, // If we've peer bootstrapped this shard/block combination AND the fileset // already exists on disk, then that means either: // 1) The Filesystem bootstrapper was unable to bootstrap the fileset diff --git a/src/dbnode/storage/cleanup.go b/src/dbnode/storage/cleanup.go index e5f0daf325..00cf1a418f 100644 --- a/src/dbnode/storage/cleanup.go +++ b/src/dbnode/storage/cleanup.go @@ -134,7 +134,7 @@ func (m *cleanupManager) Cleanup(t time.Time) error { }() multiErr := xerrors.NewMultiError() - if err := m.cleanupExpiredDataFiles(t); err != nil { + if err := m.cleanupDataFiles(t); err != nil { multiErr = multiErr.Add(fmt.Errorf( "encountered errors when cleaning up data files for %v: %v", t, err)) } @@ -227,7 +227,7 @@ func (m *cleanupManager) deleteInactiveDataFileSetFiles(filesetFilesDirPathFn fu return multiErr.FinalError() } -func (m *cleanupManager) cleanupExpiredDataFiles(t time.Time) error { +func (m *cleanupManager) cleanupDataFiles(t time.Time) error { multiErr := xerrors.NewMultiError() namespaces, err := m.database.GetOwnedNamespaces() if err != nil { @@ -240,6 +240,7 @@ func (m *cleanupManager) cleanupExpiredDataFiles(t time.Time) error { earliestToRetain := retention.FlushTimeStart(n.Options().RetentionOptions(), t) shards := n.GetOwnedShards() multiErr = multiErr.Add(m.cleanupExpiredNamespaceDataFiles(earliestToRetain, shards)) + multiErr = multiErr.Add(m.cleanupCompactedNamespaceDataFiles(shards)) } return multiErr.FinalError() } @@ -275,6 +276,17 @@ func (m *cleanupManager) cleanupExpiredNamespaceDataFiles(earliestToRetain time. return multiErr.FinalError() } +func (m *cleanupManager) cleanupCompactedNamespaceDataFiles(shards []databaseShard) error { + multiErr := xerrors.NewMultiError() + for _, shard := range shards { + if err := shard.CleanupCompactedFileSets(); err != nil { + multiErr = multiErr.Add(err) + } + } + + return multiErr.FinalError() +} + // The goal of the cleanupSnapshotsAndCommitlogs function is to delete all snapshots files, snapshot metadata // files, and commitlog files except for those that are currently required for recovery from a node failure. 
// According to the snapshotting / commitlog rotation logic, the files that are required for a complete
diff --git a/src/dbnode/storage/cleanup_test.go b/src/dbnode/storage/cleanup_test.go
index 7b5729b92b..cdecdf9008 100644
--- a/src/dbnode/storage/cleanup_test.go
+++ b/src/dbnode/storage/cleanup_test.go
@@ -286,6 +286,8 @@ func TestCleanupManagerCleanupCommitlogsAndSnapshots(t *testing.T) {
 		shard := NewMockdatabaseShard(ctrl)
 		shard.EXPECT().ID().Return(uint32(i)).AnyTimes()
 		shard.EXPECT().CleanupExpiredFileSets(gomock.Any()).Return(nil).AnyTimes()
+		shard.EXPECT().CleanupCompactedFileSets().Return(nil).AnyTimes()
+
 		shards = append(shards, shard)
 	}
 
@@ -405,6 +407,7 @@ func TestCleanupDataAndSnapshotFileSetFiles(t *testing.T) {
 	shard := NewMockdatabaseShard(ctrl)
 	expectedEarliestToRetain := retention.FlushTimeStart(ns.Options().RetentionOptions(), ts)
 	shard.EXPECT().CleanupExpiredFileSets(expectedEarliestToRetain).Return(nil)
+	shard.EXPECT().CleanupCompactedFileSets().Return(nil)
 	shard.EXPECT().ID().Return(uint32(0)).AnyTimes()
 	ns.EXPECT().GetOwnedShards().Return([]databaseShard{shard}).AnyTimes()
 	ns.EXPECT().ID().Return(ident.StringID("nsID")).AnyTimes()
diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go
index 4e0a60dfde..c01764529a 100644
--- a/src/dbnode/storage/index.go
+++ b/src/dbnode/storage/index.go
@@ -277,7 +277,7 @@ func newNamespaceIndexWithOptions(
 		bufferFuture:      nsMD.Options().RetentionOptions().BufferFuture(),
 		coldWritesEnabled: nsMD.Options().ColdWritesEnabled(),
 
-		indexFilesetsBeforeFn: fs.IndexFileSetsBefore,
+		indexFilesetsBeforeFn: fs.IndexFilePathsBefore,
 		deleteFilesFn:         fs.DeleteFiles,
 
 		newBlockFn: newBlockFn,
diff --git a/src/dbnode/storage/namespace_readers.go b/src/dbnode/storage/namespace_readers.go
index 3d0e6ffae3..b4d97eede2 100644
--- a/src/dbnode/storage/namespace_readers.go
+++ b/src/dbnode/storage/namespace_readers.go
@@ -66,7 +66,6 @@ type databaseNamespaceReaderManager interface {
 	get(
 		shard uint32,
 		blockStart time.Time,
-		volume int,
 		position readerPosition,
 	) (fs.DataFileSetReader, error)
 
@@ -234,14 +234,12 @@ func (m *namespaceReaderManager) cachedReaderForKey(
 func (m *namespaceReaderManager) get(
 	shard uint32,
 	blockStart time.Time,
-	volume int,
 	position readerPosition,
 ) (fs.DataFileSetReader, error) {
 	key := cachedOpenReaderKey{
 		shard:      shard,
 		blockStart: xtime.ToUnixNano(blockStart),
 		position:   position,
-		volume:     volume,
 	}
 
 	lookup, err := m.cachedReaderForKey(key)
@@ -257,13 +255,12 @@ func (m *namespaceReaderManager) get(
 	reader := lookup.closedReader
 	openOpts := fs.DataReaderOpenOptions{
 		Identifier: fs.FileSetFileIdentifier{
-			Namespace:   m.namespace.ID(),
-			Shard:       shard,
-			BlockStart:  blockStart,
-			VolumeIndex: volume,
+			Namespace:  m.namespace.ID(),
+			Shard:      shard,
+			BlockStart: blockStart,
 		},
 	}
 	if err := reader.Open(openOpts); err != nil {
 		return nil, err
 	}
diff --git a/src/dbnode/storage/shard.go b/src/dbnode/storage/shard.go
index 5d5303e8c1..475c3187b7 100644
--- a/src/dbnode/storage/shard.go
+++ b/src/dbnode/storage/shard.go
@@ -73,7 +73,13 @@ var (
 	errNewShardEntryTagsIterNotAtIndexZero = errors.New("new shard entry options error: tags iter not at index zero")
 )
 
-type filesetBeforeFn func(
+type filesetsFn func(
+	filePathPrefix string,
+	namespace ident.ID,
+	shardID uint32,
+) (fs.FileSetFilesSlice, error)
+
+type filesetPathsBeforeFn func(
 	filePathPrefix string,
 	namespace ident.ID,
 	shardID uint32,
@@ -150,7 +156,8 @@ type
dbShard struct {
 	bootstrapState BootstrapState
 	newMergerFn    fs.NewMergerFn
 	newFSMergeWithMemFn newFSMergeWithMemFn
-	filesetBeforeFn      filesetBeforeFn
+	filesetsFn           filesetsFn
+	filesetPathsBeforeFn filesetPathsBeforeFn
 	deleteFilesFn        deleteFilesFn
 	snapshotFilesFn      snapshotFilesFn
 	sleepFn              func(time.Duration)
@@ -247,30 +254,31 @@ func newDatabaseShard(
 		SubScope("dbshard")
 
 	s := &dbShard{
-		opts:                opts,
-		seriesOpts:          seriesOpts,
-		nowFn:               opts.ClockOptions().NowFn(),
-		state:               dbShardStateOpen,
-		namespace:           namespaceMetadata,
-		shard:               shard,
-		namespaceReaderMgr:  namespaceReaderMgr,
-		increasingIndex:     increasingIndex,
-		seriesPool:          opts.DatabaseSeriesPool(),
-		reverseIndex:        reverseIndex,
-		lookup:              newShardMap(shardMapOptions{}),
-		list:                list.New(),
-		newMergerFn:         fs.NewMerger,
-		newFSMergeWithMemFn: newFSMergeWithMem,
-		filesetBeforeFn:     fs.DataFileSetsBefore,
-		deleteFilesFn:       fs.DeleteFiles,
-		snapshotFilesFn:     fs.SnapshotFiles,
-		sleepFn:             time.Sleep,
-		identifierPool:      opts.IdentifierPool(),
-		contextPool:         opts.ContextPool(),
-		flushState:          newShardFlushState(),
-		tickWg:              &sync.WaitGroup{},
-		logger:              opts.InstrumentOptions().Logger(),
-		metrics:             newDatabaseShardMetrics(shard, scope),
+		opts:                 opts,
+		seriesOpts:           seriesOpts,
+		nowFn:                opts.ClockOptions().NowFn(),
+		state:                dbShardStateOpen,
+		namespace:            namespaceMetadata,
+		shard:                shard,
+		namespaceReaderMgr:   namespaceReaderMgr,
+		increasingIndex:      increasingIndex,
+		seriesPool:           opts.DatabaseSeriesPool(),
+		reverseIndex:         reverseIndex,
+		lookup:               newShardMap(shardMapOptions{}),
+		list:                 list.New(),
+		newMergerFn:          fs.NewMerger,
+		newFSMergeWithMemFn:  newFSMergeWithMem,
+		filesetsFn:           fs.DataFiles,
+		filesetPathsBeforeFn: fs.DataFilePathsBefore,
+		deleteFilesFn:        fs.DeleteFiles,
+		snapshotFilesFn:      fs.SnapshotFiles,
+		sleepFn:              time.Sleep,
+		identifierPool:       opts.IdentifierPool(),
+		contextPool:          opts.ContextPool(),
+		flushState:           newShardFlushState(),
+		tickWg:               &sync.WaitGroup{},
+		logger:               opts.InstrumentOptions().Logger(),
+		metrics:              newDatabaseShardMetrics(shard, scope),
 	}
 
 	s.insertQueue = newDatabaseShardInsertQueue(s.insertSeriesBatch, s.nowFn, scope)
@@ -1687,10 +1695,8 @@ func (s *dbShard) FetchBlocksMetadataV2(
 			pos.metadataIdx = int(flushedPhase.CurrBlockEntryIdx)
 		}
 
-		// TODO(juchan): actually get the volume
-		vol := 0
 		// Open a reader at this position, potentially from cache
-		reader, err := s.namespaceReaderMgr.get(s.shard, blockStart, vol, pos)
+		reader, err := s.namespaceReaderMgr.get(s.shard, blockStart, pos)
 		if err != nil {
 			return nil, nil, err
 		}
@@ -1892,6 +1898,9 @@ func (s *dbShard) WarmFlush(
 		NamespaceMetadata: s.namespace,
 		Shard:             s.ID(),
 		BlockStart:        blockStart,
+		// Volume index is always 0 for warm flushes because a warm flush
+		// must happen before any cold flush for the block can occur.
+		VolumeIndex: 0,
 		// We explicitly set delete if exists to false here as we track which
 		// filesets exist at bootstrap time so we should never encounter a time
 		// when we attempt to flush and a fileset already exists unless there is
 		// a bug in the code.
@@ -1994,10 +2003,12 @@ func (s *dbShard) ColdFlush(
 	// a block, we continue to try persisting other blocks.
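	// Illustrative walk-through of the versioning below (assumed): with a
	// current cold version of 0 for a block, the merger reads volume 0 (the
	// warm fileset), prepares the merged output as volume 0+1 = 1, and on
	// success the flush state advances the cold version to 1, which makes
	// the superseded volume eligible for compacted-fileset cleanup.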
for blockStart := range dirtySeriesToWrite { startTime := blockStart.ToTime() + coldVersion := s.RetrievableBlockColdVersion(startTime) fsID := fs.FileSetFileIdentifier{ - Namespace: s.namespace.ID(), - Shard: s.ID(), - BlockStart: startTime, + Namespace: s.namespace.ID(), + Shard: s.ID(), + BlockStart: startTime, + VolumeIndex: coldVersion, } err := merger.Merge(fsID, mergeWithMem, flushPreparer, nsCtx) @@ -2008,7 +2019,7 @@ func (s *dbShard) ColdFlush( // After writing the full block successfully, update the cold version // in the flush state. - nextVersion := s.RetrievableBlockColdVersion(startTime) + 1 + nextVersion := coldVersion + 1 // Once this function completes block leasers will no longer be able to acquire // leases on previous volumes for the given namespace/shard/blockstart. @@ -2158,18 +2169,36 @@ func (s *dbShard) removeAnyFlushStatesTooEarly(tickStart time.Time) { func (s *dbShard) CleanupExpiredFileSets(earliestToRetain time.Time) error { filePathPrefix := s.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix() - multiErr := xerrors.NewMultiError() - expired, err := s.filesetBeforeFn(filePathPrefix, s.namespace.ID(), s.ID(), earliestToRetain) + expired, err := s.filesetPathsBeforeFn(filePathPrefix, s.namespace.ID(), s.ID(), earliestToRetain) if err != nil { - detailedErr := - fmt.Errorf("encountered errors when getting fileset files for prefix %s namespace %s shard %d: %v", - filePathPrefix, s.namespace.ID(), s.ID(), err) - multiErr = multiErr.Add(detailedErr) + return fmt.Errorf("encountered errors when getting fileset files for prefix %s namespace %s shard %d: %v", + filePathPrefix, s.namespace.ID(), s.ID(), err) } - if err := s.deleteFilesFn(expired); err != nil { - multiErr = multiErr.Add(err) + + return s.deleteFilesFn(expired) +} + +func (s *dbShard) CleanupCompactedFileSets() error { + filePathPrefix := s.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix() + filesets, err := s.filesetsFn(filePathPrefix, s.namespace.ID(), s.ID()) + if err != nil { + return fmt.Errorf("encountered errors when getting fileset files for prefix %s namespace %s shard %d: %v", + filePathPrefix, s.namespace.ID(), s.ID(), err) } - return multiErr.FinalError() + // Get a snapshot of all states here to prevent constantly getting/releasing + // locks in a tight loop below. + blockStates := s.BlockStatesSnapshot() + // Filter without allocating by using same backing array. 
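+	// (Illustrative note: filesets[:0] shares the original slice's backing
+	// array, so the appends below compact matching entries in place without
+	// a second allocation; filesets must not be reused afterwards.)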
+ compacted := filesets[:0] + for _, datafile := range filesets { + fileID := datafile.ID + blockState := blockStates[xtime.ToUnixNano(fileID.BlockStart)] + if fileID.VolumeIndex < blockState.ColdVersion { + compacted = append(compacted, datafile) + } + } + + return s.deleteFilesFn(compacted.Filepaths()) } func (s *dbShard) Repair( diff --git a/src/dbnode/storage/shard_test.go b/src/dbnode/storage/shard_test.go index 00e037def4..28b0a6e6ff 100644 --- a/src/dbnode/storage/shard_test.go +++ b/src/dbnode/storage/shard_test.go @@ -1136,7 +1136,7 @@ func TestShardCleanupExpiredFileSets(t *testing.T) { opts := DefaultTestOptions() shard := testDatabaseShard(t, opts) defer shard.Close() - shard.filesetBeforeFn = func(_ string, namespace ident.ID, shardID uint32, t time.Time) ([]string, error) { + shard.filesetPathsBeforeFn = func(_ string, namespace ident.ID, shardID uint32, t time.Time) ([]string, error) { return []string{namespace.String(), strconv.Itoa(int(shardID))}, nil } var deletedFiles []string diff --git a/src/dbnode/storage/types.go b/src/dbnode/storage/types.go index 78e02a3aa4..088c90fbdd 100644 --- a/src/dbnode/storage/types.go +++ b/src/dbnode/storage/types.go @@ -494,6 +494,11 @@ type databaseShard interface { // CleanupExpiredFileSets removes expired fileset files. CleanupExpiredFileSets(earliestToRetain time.Time) error + // CleanupCompactedFileSets removes fileset files that have been compacted, + // meaning that there exists a more recent, superset, fully persisted + // fileset for that block. + CleanupCompactedFileSets() error + // Repair repairs the shard data for a given time. Repair( ctx context.Context, From 82e739d7384b1af4e84519f3010b7d21caa7a586 Mon Sep 17 00:00:00 2001 From: Justin Chan Date: Thu, 13 Jun 2019 15:05:49 -0400 Subject: [PATCH 03/12] Propagate volume to seekers too --- .../integration/disk_cleanup_helpers.go | 4 +- src/dbnode/integration/disk_flush_helpers.go | 6 +-- src/dbnode/persist/fs/files.go | 49 ++++++++----------- src/dbnode/persist/fs/files_test.go | 22 ++++----- src/dbnode/persist/fs/fs_mock.go | 8 +-- .../persist/fs/index_lookup_prop_test.go | 4 +- src/dbnode/persist/fs/merger.go | 8 +-- src/dbnode/persist/fs/persist_manager_test.go | 4 +- src/dbnode/persist/fs/read.go | 9 ++-- src/dbnode/persist/fs/read_test.go | 4 +- src/dbnode/persist/fs/seek.go | 28 ++++++++--- src/dbnode/persist/fs/seek_manager.go | 4 +- src/dbnode/persist/fs/seek_manager_test.go | 2 +- src/dbnode/persist/fs/seek_test.go | 16 +++--- src/dbnode/persist/fs/types.go | 1 + src/dbnode/storage/namespace_readers.go | 29 ++++++----- src/dbnode/storage/shard.go | 6 +-- 17 files changed, 110 insertions(+), 94 deletions(-) diff --git a/src/dbnode/integration/disk_cleanup_helpers.go b/src/dbnode/integration/disk_cleanup_helpers.go index 4de994e6e6..235ee220a1 100644 --- a/src/dbnode/integration/disk_cleanup_helpers.go +++ b/src/dbnode/integration/disk_cleanup_helpers.go @@ -27,11 +27,11 @@ import ( "testing" "time" + "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/persist/fs/commitlog" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage" - "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/x/ident" "github.com/stretchr/testify/require" @@ -114,7 +114,7 @@ type cleanupTimesFileSet struct { func (fset *cleanupTimesFileSet) anyExist() bool { for _, t := range fset.times { - exists, err := fs.DataFileSetExistsAt(fset.filePathPrefix, fset.namespace, fset.shard, t) + exists, err := 
fs.DataFileSetExists(fset.filePathPrefix, fset.namespace, fset.shard, t, 0) if err != nil { panic(err) } diff --git a/src/dbnode/integration/disk_flush_helpers.go b/src/dbnode/integration/disk_flush_helpers.go index 6ef38aca2e..f0104c93a6 100644 --- a/src/dbnode/integration/disk_flush_helpers.go +++ b/src/dbnode/integration/disk_flush_helpers.go @@ -30,11 +30,11 @@ import ( "github.com/m3db/m3/src/dbnode/encoding" "github.com/m3db/m3/src/dbnode/integration/generate" + ns "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage" - ns "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/x/ident" "github.com/m3db/m3/src/x/ident/testutil" xtime "github.com/m3db/m3/src/x/time" @@ -122,8 +122,8 @@ func waitUntilDataFilesFlushed( for timestamp, seriesList := range testData { for _, series := range seriesList { shard := shardSet.Lookup(series.ID) - exists, err := fs.DataFileSetExistsAt( - filePathPrefix, namespace, shard, timestamp.ToTime()) + exists, err := fs.DataFileSetExists( + filePathPrefix, namespace, shard, timestamp.ToTime(), 0) if err != nil { panic(err) } diff --git a/src/dbnode/persist/fs/files.go b/src/dbnode/persist/fs/files.go index 56dd3b1ca1..7789c35778 100644 --- a/src/dbnode/persist/fs/files.go +++ b/src/dbnode/persist/fs/files.go @@ -181,7 +181,7 @@ func (f FileSetFilesSlice) Filepaths() []string { } // LatestVolumeForBlock returns the latest (highest index) FileSetFile in the -// slice for a given block start. +// slice for a given block start that has a complete checkpoint file. func (f FileSetFilesSlice) LatestVolumeForBlock(blockStart time.Time) (FileSetFile, bool) { // Make sure we're already sorted. f.sortByTimeAndVolumeIndexAscending() @@ -445,7 +445,7 @@ func TimeAndVolumeIndexFromDataFileSetFilename(fname string) (time.Time, int, er if len(components) == 3 { // Three components mean that there is no index in the filename. - return t, unindexedFilesetIndex, err + return t, unindexedFilesetIndex, nil } if len(components) == 4 { @@ -458,7 +458,7 @@ func TimeAndVolumeIndexFromDataFileSetFilename(fname string) (time.Time, int, er return t, int(index), nil } - return timeZero, 0, fmt.Errorf("malformed filename: %s", fname) + return timeZero, 0, fmt.Errorf("malformed data fileset filename: %s", fname) } // TimeAndVolumeIndexFromFileSetFilename extracts the block start and @@ -633,9 +633,12 @@ func forEachInfoFile( t := matched[i].ID.BlockStart volume := matched[i].ID.VolumeIndex - isLegacy, err := isFirstVolumeLegacy(dir, t, checkpointFileSuffix) - if err != nil { - continue + isLegacy := false + if volume == 0 { + isLegacy, err = isFirstVolumeLegacy(dir, t, checkpointFileSuffix) + if err != nil { + continue + } } var ( checkpointFilePath string @@ -1292,21 +1295,6 @@ func CommitLogsDirPath(prefix string) string { return path.Join(prefix, commitLogsDirName) } -// DataFileSetExistsAt determines whether data fileset files exist for the given namespace, shard, and block start. 
-func DataFileSetExistsAt(filePathPrefix string, namespace ident.ID, shard uint32, blockStart time.Time) (bool, error) { - dataFiles, err := DataFiles(filePathPrefix, namespace, shard) - if err != nil { - return false, err - } - - _, ok := dataFiles.LatestVolumeForBlock(blockStart) - if !ok { - return false, nil - } - - return true, nil -} - // DataFileSetExists determines whether data fileset files exist for the given // namespace, shard, block start, and volume. func DataFileSetExists( @@ -1316,12 +1304,16 @@ func DataFileSetExists( blockStart time.Time, volume int, ) (bool, error) { - dataFiles, err := DataFiles(filePathPrefix, namespace, shard) - if err != nil { - return false, err + shardDir := ShardDataDirPath(filePathPrefix, namespace, shard) + // Check fileset with volume first to optimize for non-legacy use case. + checkpointPath := filesetPathFromTimeAndIndex(shardDir, blockStart, volume, checkpointFileSuffix) + if volume != 0 { + return CompleteCheckpointFileExists(checkpointPath) } - return dataFiles.VolumeExistsForBlock(blockStart, volume), nil + // Check legacy format next only if volume is 0. + checkpointPath = filesetPathFromTimeLegacy(shardDir, blockStart, checkpointFileSuffix) + return CompleteCheckpointFileExists(checkpointPath) } // SnapshotFileSetExistsAt determines whether snapshot fileset files exist for the given namespace, shard, and block start time. @@ -1481,7 +1473,7 @@ func filesetFileForTime(t time.Time, suffix string) string { return fmt.Sprintf("%s%s%d%s%s%s", filesetFilePrefix, separator, t.UnixNano(), separator, suffix, fileSuffix) } -func filesetPathFromTime(prefix string, t time.Time, suffix string) string { +func filesetPathFromTimeLegacy(prefix string, t time.Time, suffix string) string { return path.Join(prefix, filesetFileForTime(t, suffix)) } @@ -1495,13 +1487,14 @@ func filesetPathFromTimeAndIndex(prefix string, t time.Time, index int, suffix s // the existence of the file and the non-legacy version of the file, and returns // and error if neither exist. func isFirstVolumeLegacy(prefix string, t time.Time, suffix string) (bool, error) { + // Check non-legacy path first to optimize for newer files. path := filesetPathFromTimeAndIndex(prefix, t, 0, suffix) _, err := os.Stat(path) if err == nil { return false, nil } - legacyPath := filesetPathFromTime(prefix, t, suffix) + legacyPath := filesetPathFromTimeLegacy(prefix, t, suffix) _, err = os.Stat(legacyPath) if err == nil { return true, nil @@ -1519,7 +1512,7 @@ func dataFilesetPathFromTimeAndIndex( isLegacy bool, ) string { if isLegacy { - return filesetPathFromTime(prefix, t, suffix) + return filesetPathFromTimeLegacy(prefix, t, suffix) } return filesetPathFromTimeAndIndex(prefix, t, index, suffix) diff --git a/src/dbnode/persist/fs/files_test.go b/src/dbnode/persist/fs/files_test.go index 9837cca30d..91db576886 100644 --- a/src/dbnode/persist/fs/files_test.go +++ b/src/dbnode/persist/fs/files_test.go @@ -208,7 +208,7 @@ func TestForEachInfoFile(t *testing.T) { res = append(res, data...) 
}) - require.Equal(t, []string{filesetPathFromTime(shardDir, blockStart, infoFileSuffix)}, fnames) + require.Equal(t, []string{filesetPathFromTimeLegacy(shardDir, blockStart, infoFileSuffix)}, fnames) require.Equal(t, infoData, res) } @@ -391,16 +391,16 @@ func TestFileExists(t *testing.T) { defer os.RemoveAll(dir) require.NoError(t, err) - infoFilePath := filesetPathFromTime(shardDir, start, infoFileSuffix) + infoFilePath := filesetPathFromTimeLegacy(shardDir, start, infoFileSuffix) createDataFile(t, shardDir, start, infoFileSuffix, checkpointFileBuf) require.True(t, mustFileExists(t, infoFilePath)) - exists, err := DataFileSetExistsAt(dir, testNs1ID, uint32(shard), start) + exists, err := DataFileSetExists(dir, testNs1ID, uint32(shard), start, 0) require.NoError(t, err) require.False(t, exists) - checkpointFilePath := filesetPathFromTime(shardDir, start, checkpointFileSuffix) + checkpointFilePath := filesetPathFromTimeLegacy(shardDir, start, checkpointFileSuffix) createDataFile(t, shardDir, start, checkpointFileSuffix, checkpointFileBuf) - exists, err = DataFileSetExistsAt(dir, testNs1ID, uint32(shard), start) + exists, err = DataFileSetExists(dir, testNs1ID, uint32(shard), start, 0) require.NoError(t, err) require.True(t, exists) @@ -421,7 +421,7 @@ func TestCompleteCheckpointFileExists(t *testing.T) { shard = uint32(10) start = time.Now() shardDir = ShardDataDirPath(dir, testNs1ID, shard) - checkpointFilePath = filesetPathFromTime(shardDir, start, checkpointFileSuffix) + checkpointFilePath = filesetPathFromTimeLegacy(shardDir, start, checkpointFileSuffix) err = os.MkdirAll(shardDir, defaultNewDirectoryMode) validCheckpointFileBuf = make([]byte, CheckpointFileSizeBytes) @@ -464,7 +464,7 @@ func TestFilePathFromTime(t *testing.T) { {"foo/bar/", infoFileSuffix, "foo/bar/fileset-1465501321123456789-info.db"}, } for _, input := range inputs { - require.Equal(t, input.expected, filesetPathFromTime(input.prefix, start, input.suffix)) + require.Equal(t, input.expected, filesetPathFromTimeLegacy(input.prefix, start, input.suffix)) } } @@ -482,7 +482,7 @@ func TestFileSetFilesBefore(t *testing.T) { shardDir := path.Join(dir, dataDirName, testNs1ID.String(), strconv.Itoa(int(shard))) for i := 0; i < len(res); i++ { ts := time.Unix(0, int64(i)) - require.Equal(t, filesetPathFromTime(shardDir, ts, infoFileSuffix), res[i]) + require.Equal(t, filesetPathFromTimeLegacy(shardDir, ts, infoFileSuffix), res[i]) } } @@ -1173,7 +1173,7 @@ func createDataFiles(t *testing.T, if isSnapshot { infoFilePath = filesetPathFromTimeAndIndex(shardDir, ts, 0, fileSuffix) } else { - infoFilePath = filesetPathFromTime(shardDir, ts, fileSuffix) + infoFilePath = filesetPathFromTimeLegacy(shardDir, ts, fileSuffix) } var contents []byte if fileSuffix == checkpointFileSuffix { @@ -1230,7 +1230,7 @@ func (filesets fileSetFileIdentifiers) create(t *testing.T, prefixDir string, fi var path string switch fileSetType { case persist.FileSetFlushType: - path = filesetPathFromTime(shardDir, blockStart, suffix) + path = filesetPathFromTimeLegacy(shardDir, blockStart, suffix) writeFile(t, path, nil) case persist.FileSetSnapshotType: path = filesetPathFromTimeAndIndex(shardDir, blockStart, 0, fileSuffix) @@ -1262,7 +1262,7 @@ func (filesets fileSetFileIdentifiers) create(t *testing.T, prefixDir string, fi } func createDataFile(t *testing.T, shardDir string, blockStart time.Time, suffix string, b []byte) { - filePath := filesetPathFromTime(shardDir, blockStart, suffix) + filePath := filesetPathFromTimeLegacy(shardDir, blockStart, suffix) 
createFile(t, filePath, b) } diff --git a/src/dbnode/persist/fs/fs_mock.go b/src/dbnode/persist/fs/fs_mock.go index e69dd50653..84fd159ba5 100644 --- a/src/dbnode/persist/fs/fs_mock.go +++ b/src/dbnode/persist/fs/fs_mock.go @@ -400,17 +400,17 @@ func (mr *MockDataFileSetSeekerMockRecorder) ConcurrentIDBloomFilter() *gomock.C } // Open mocks base method -func (m *MockDataFileSetSeeker) Open(arg0 ident.ID, arg1 uint32, arg2 time.Time, arg3 ReusableSeekerResources) error { +func (m *MockDataFileSetSeeker) Open(arg0 ident.ID, arg1 uint32, arg2 time.Time, arg3 int, arg4 ReusableSeekerResources) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Open", arg0, arg1, arg2, arg3) + ret := m.ctrl.Call(m, "Open", arg0, arg1, arg2, arg3, arg4) ret0, _ := ret[0].(error) return ret0 } // Open indicates an expected call of Open -func (mr *MockDataFileSetSeekerMockRecorder) Open(arg0, arg1, arg2, arg3 interface{}) *gomock.Call { +func (mr *MockDataFileSetSeekerMockRecorder) Open(arg0, arg1, arg2, arg3, arg4 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Open", reflect.TypeOf((*MockDataFileSetSeeker)(nil).Open), arg0, arg1, arg2, arg3) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Open", reflect.TypeOf((*MockDataFileSetSeeker)(nil).Open), arg0, arg1, arg2, arg3, arg4) } // Range mocks base method diff --git a/src/dbnode/persist/fs/index_lookup_prop_test.go b/src/dbnode/persist/fs/index_lookup_prop_test.go index 2c0e9f2ca5..55b185c9a5 100644 --- a/src/dbnode/persist/fs/index_lookup_prop_test.go +++ b/src/dbnode/persist/fs/index_lookup_prop_test.go @@ -107,7 +107,7 @@ func TestIndexLookupWriteRead(t *testing.T) { } // Read the summaries file into memory - summariesFilePath := filesetPathFromTime( + summariesFilePath := filesetPathFromTimeLegacy( shardDirPath, testWriterStart, summariesFileSuffix) summariesFile, err := os.Open(summariesFilePath) if err != nil { @@ -254,7 +254,7 @@ func genTagIdent() gopter.Gen { } func readIndexFileOffsets(shardDirPath string, numEntries int, start time.Time) (map[string]int64, error) { - indexFilePath := filesetPathFromTime(shardDirPath, start, indexFileSuffix) + indexFilePath := filesetPathFromTimeLegacy(shardDirPath, start, indexFileSuffix) buf, err := ioutil.ReadFile(indexFilePath) if err != nil { return nil, fmt.Errorf("err reading index file: %v, ", err) diff --git a/src/dbnode/persist/fs/merger.go b/src/dbnode/persist/fs/merger.go index 6b3a8e5c95..e9310f9903 100644 --- a/src/dbnode/persist/fs/merger.go +++ b/src/dbnode/persist/fs/merger.go @@ -97,13 +97,15 @@ func (m *merger) Merge( nsID = fileID.Namespace shard = fileID.Shard startTime = fileID.BlockStart + volume = fileID.VolumeIndex blockSize = nsOpts.RetentionOptions().BlockSize() blockStart = xtime.ToUnixNano(startTime) openOpts = DataReaderOpenOptions{ Identifier: FileSetFileIdentifier{ - Namespace: nsID, - Shard: shard, - BlockStart: startTime, + Namespace: nsID, + Shard: shard, + BlockStart: startTime, + VolumeIndex: volume, }, FileSetType: persist.FileSetFlushType, } diff --git a/src/dbnode/persist/fs/persist_manager_test.go b/src/dbnode/persist/fs/persist_manager_test.go index 015952c59b..dcdea6ce54 100644 --- a/src/dbnode/persist/fs/persist_manager_test.go +++ b/src/dbnode/persist/fs/persist_manager_test.go @@ -55,7 +55,7 @@ func TestPersistenceManagerPrepareDataFileExistsNoDelete(t *testing.T) { shard = uint32(0) blockStart = time.Unix(1000, 0) shardDir = createDataShardDir(t, pm.filePathPrefix, testNs1ID, shard) - checkpointFilePath = 
filesetPathFromTime(shardDir, blockStart, checkpointFileSuffix) + checkpointFilePath = filesetPathFromTimeLegacy(shardDir, blockStart, checkpointFileSuffix) checkpointFileBuf = make([]byte, CheckpointFileSizeBytes) ) createFile(t, checkpointFilePath, checkpointFileBuf) @@ -102,7 +102,7 @@ func TestPersistenceManagerPrepareDataFileExistsWithDelete(t *testing.T) { var ( shardDir = createDataShardDir(t, pm.filePathPrefix, testNs1ID, shard) - checkpointFilePath = filesetPathFromTime(shardDir, blockStart, checkpointFileSuffix) + checkpointFilePath = filesetPathFromTimeLegacy(shardDir, blockStart, checkpointFileSuffix) checkpointFileBuf = make([]byte, CheckpointFileSizeBytes) ) createFile(t, checkpointFilePath, checkpointFileBuf) diff --git a/src/dbnode/persist/fs/read.go b/src/dbnode/persist/fs/read.go index 863fb498a6..04815deb5f 100644 --- a/src/dbnode/persist/fs/read.go +++ b/src/dbnode/persist/fs/read.go @@ -146,9 +146,12 @@ func (r *reader) Open(opts DataReaderOpenOptions) error { dataFilepath string ) - isLegacy, err := isFirstVolumeLegacy(shardDir, blockStart, checkpointFileSuffix) - if err != nil { - return err + isLegacy := false + if volumeIndex == 0 { + isLegacy, err = isFirstVolumeLegacy(shardDir, blockStart, checkpointFileSuffix) + if err != nil { + return err + } } switch opts.FileSetType { case persist.FileSetSnapshotType: diff --git a/src/dbnode/persist/fs/read_test.go b/src/dbnode/persist/fs/read_test.go index 59790ec8fe..52c504ce74 100644 --- a/src/dbnode/persist/fs/read_test.go +++ b/src/dbnode/persist/fs/read_test.go @@ -271,7 +271,7 @@ func TestReadNoCheckpointFile(t *testing.T) { var ( shardDir = ShardDataDirPath(filePathPrefix, testNs1ID, shard) - checkpointFile = filesetPathFromTime(shardDir, testWriterStart, checkpointFileSuffix) + checkpointFile = filesetPathFromTimeLegacy(shardDir, testWriterStart, checkpointFileSuffix) ) exists, err := CompleteCheckpointFileExists(checkpointFile) require.NoError(t, err) @@ -316,7 +316,7 @@ func testReadOpen(t *testing.T, fileData map[string][]byte) { assert.NoError(t, w.Close()) for suffix, data := range fileData { - digestFile := filesetPathFromTime(shardDir, start, suffix) + digestFile := filesetPathFromTimeLegacy(shardDir, start, suffix) fd, err := os.OpenFile(digestFile, os.O_WRONLY|os.O_TRUNC, os.FileMode(0666)) require.NoError(t, err) _, err = fd.Write(data) diff --git a/src/dbnode/persist/fs/seek.go b/src/dbnode/persist/fs/seek.go index 7dc0561dfc..065a730295 100644 --- a/src/dbnode/persist/fs/seek.go +++ b/src/dbnode/persist/fs/seek.go @@ -145,6 +145,7 @@ func (s *seeker) Open( namespace ident.ID, shard uint32, blockStart time.Time, + volumeIndex int, resources ReusableSeekerResources, ) error { if s.isClone { @@ -152,16 +153,27 @@ func (s *seeker) Open( } shardDir := ShardDataDirPath(s.opts.filePathPrefix, namespace, shard) - var infoFd, digestFd, bloomFilterFd, summariesFd *os.File + var ( + infoFd, digestFd, bloomFilterFd, summariesFd *os.File + err error + isLegacy bool + ) + + if volumeIndex == 0 { + isLegacy, err = isFirstVolumeLegacy(shardDir, blockStart, checkpointFileSuffix) + if err != nil { + return err + } + } // Open necessary files if err := openFiles(os.Open, map[string]**os.File{ - filesetPathFromTime(shardDir, blockStart, infoFileSuffix): &infoFd, - filesetPathFromTime(shardDir, blockStart, indexFileSuffix): &s.indexFd, - filesetPathFromTime(shardDir, blockStart, dataFileSuffix): &s.dataFd, - filesetPathFromTime(shardDir, blockStart, digestFileSuffix): &digestFd, - filesetPathFromTime(shardDir, blockStart, 
bloomFilterFileSuffix): &bloomFilterFd, - filesetPathFromTime(shardDir, blockStart, summariesFileSuffix): &summariesFd, + dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, infoFileSuffix, isLegacy): &infoFd, + dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, indexFileSuffix, isLegacy): &s.indexFd, + dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, dataFileSuffix, isLegacy): &s.dataFd, + dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, digestFileSuffix, isLegacy): &digestFd, + dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, bloomFilterFileSuffix, isLegacy): &bloomFilterFd, + dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, summariesFileSuffix, isLegacy): &summariesFd, }); err != nil { return err } @@ -219,7 +231,7 @@ func (s *seeker) Open( s.Close() return fmt.Errorf( "index file digest for file: %s does not match the expected digest: %c", - filesetPathFromTime(shardDir, blockStart, indexFileSuffix), err, + filesetPathFromTimeLegacy(shardDir, blockStart, indexFileSuffix), err, ) } diff --git a/src/dbnode/persist/fs/seek_manager.go b/src/dbnode/persist/fs/seek_manager.go index 85b3650da6..664e00ec85 100644 --- a/src/dbnode/persist/fs/seek_manager.go +++ b/src/dbnode/persist/fs/seek_manager.go @@ -397,7 +397,9 @@ func (m *seekerManager) newOpenSeeker( shard uint32, blockStart time.Time, ) (DataFileSetSeeker, error) { - exists, err := DataFileSetExistsAt(m.filePathPrefix, m.namespace, shard, blockStart) + // TODO(juchan): get the actual volume here. + vol := 0 + exists, err := DataFileSetExists(m.filePathPrefix, m.namespace, shard, blockStart, vol) if err != nil { return nil, err } diff --git a/src/dbnode/persist/fs/seek_manager_test.go b/src/dbnode/persist/fs/seek_manager_test.go index eb2bbb3f83..ef04c05bfd 100644 --- a/src/dbnode/persist/fs/seek_manager_test.go +++ b/src/dbnode/persist/fs/seek_manager_test.go @@ -87,7 +87,7 @@ func TestSeekerManagerBorrowOpenSeekersLazy(t *testing.T) { blockStart time.Time, ) (DataFileSetSeeker, error) { mock := NewMockDataFileSetSeeker(ctrl) - mock.EXPECT().Open(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + mock.EXPECT().Open(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) mock.EXPECT().ConcurrentClone().Return(mock, nil) for i := 0; i < defaultFetchConcurrency; i++ { mock.EXPECT().Close().Return(nil) diff --git a/src/dbnode/persist/fs/seek_test.go b/src/dbnode/persist/fs/seek_test.go index 2d8b82b265..e57962e00f 100644 --- a/src/dbnode/persist/fs/seek_test.go +++ b/src/dbnode/persist/fs/seek_test.go @@ -64,7 +64,7 @@ func TestSeekEmptyIndex(t *testing.T) { resources := newTestReusableSeekerResources() s := newTestSeeker(filePathPrefix) - err = s.Open(testNs1ID, 0, testWriterStart, resources) + err = s.Open(testNs1ID, 0, testWriterStart, 0, resources) assert.NoError(t, err) _, err = s.SeekByID(ident.StringID("foo"), resources) assert.Error(t, err) @@ -104,7 +104,7 @@ func TestSeekDataUnexpectedSize(t *testing.T) { resources := newTestReusableSeekerResources() s := newTestSeeker(filePathPrefix) - err = s.Open(testNs1ID, 0, testWriterStart, resources) + err = s.Open(testNs1ID, 0, testWriterStart, 0, resources) assert.NoError(t, err) _, err = s.SeekByID(ident.StringID("foo"), resources) @@ -143,7 +143,7 @@ func TestSeekBadChecksum(t *testing.T) { resources := newTestReusableSeekerResources() s := newTestSeeker(filePathPrefix) - err = s.Open(testNs1ID, 0, testWriterStart, resources) + err = 
s.Open(testNs1ID, 0, testWriterStart, 0, resources) assert.NoError(t, err) _, err = s.SeekByID(ident.StringID("foo"), resources) @@ -193,7 +193,7 @@ func TestSeek(t *testing.T) { resources := newTestReusableSeekerResources() s := newTestSeeker(filePathPrefix) - err = s.Open(testNs1ID, 0, testWriterStart, resources) + err = s.Open(testNs1ID, 0, testWriterStart, 0, resources) assert.NoError(t, err) data, err := s.SeekByID(ident.StringID("foo3"), resources) @@ -261,7 +261,7 @@ func TestSeekIDNotExists(t *testing.T) { resources := newTestReusableSeekerResources() s := newTestSeeker(filePathPrefix) - err = s.Open(testNs1ID, 0, testWriterStart, resources) + err = s.Open(testNs1ID, 0, testWriterStart, 0, resources) assert.NoError(t, err) // Test errSeekIDNotFound when we scan far enough into the index file that @@ -323,7 +323,7 @@ func TestReuseSeeker(t *testing.T) { resources := newTestReusableSeekerResources() s := newTestSeeker(filePathPrefix) - err = s.Open(testNs1ID, 0, testWriterStart.Add(-time.Hour), resources) + err = s.Open(testNs1ID, 0, testWriterStart.Add(-time.Hour), 0, resources) assert.NoError(t, err) data, err := s.SeekByID(ident.StringID("foo"), resources) @@ -333,7 +333,7 @@ func TestReuseSeeker(t *testing.T) { defer data.DecRef() assert.Equal(t, []byte{1, 2, 1}, data.Bytes()) - err = s.Open(testNs1ID, 0, testWriterStart, resources) + err = s.Open(testNs1ID, 0, testWriterStart, 0, resources) assert.NoError(t, err) data, err = s.SeekByID(ident.StringID("foo"), resources) @@ -388,7 +388,7 @@ func TestCloneSeeker(t *testing.T) { resources := newTestReusableSeekerResources() s := newTestSeeker(filePathPrefix) - err = s.Open(testNs1ID, 0, testWriterStart.Add(-time.Hour), resources) + err = s.Open(testNs1ID, 0, testWriterStart.Add(-time.Hour), 0, resources) assert.NoError(t, err) clone, err := s.ConcurrentClone() diff --git a/src/dbnode/persist/fs/types.go b/src/dbnode/persist/fs/types.go index e4514927e1..5db2b78542 100644 --- a/src/dbnode/persist/fs/types.go +++ b/src/dbnode/persist/fs/types.go @@ -173,6 +173,7 @@ type DataFileSetSeeker interface { namespace ident.ID, shard uint32, start time.Time, + volume int, resources ReusableSeekerResources, ) error diff --git a/src/dbnode/storage/namespace_readers.go b/src/dbnode/storage/namespace_readers.go index b4d97eede2..d8d2820e75 100644 --- a/src/dbnode/storage/namespace_readers.go +++ b/src/dbnode/storage/namespace_readers.go @@ -77,11 +77,12 @@ type databaseNamespaceReaderManager interface { close() } -type fsFileSetExistsAtFn func( +type fsFileSetExistsFn func( prefix string, namespace ident.ID, shard uint32, blockStart time.Time, + volume int, ) (bool, error) type fsNewReaderFn func( @@ -92,8 +93,8 @@ type fsNewReaderFn func( type namespaceReaderManager struct { sync.Mutex - filesetExistsAtFn fsFileSetExistsAtFn - newReaderFn fsNewReaderFn + filesetExistsFn fsFileSetExistsFn + newReaderFn fsNewReaderFn namespace namespace.Metadata fsOpts fs.Options @@ -151,14 +152,14 @@ func newNamespaceReaderManager( opts Options, ) databaseNamespaceReaderManager { return &namespaceReaderManager{ - filesetExistsAtFn: fs.DataFileSetExistsAt, - newReaderFn: fs.NewReader, - namespace: namespace, - fsOpts: opts.CommitLogOptions().FilesystemOptions(), - bytesPool: opts.BytesPool(), - logger: opts.InstrumentOptions().Logger(), - openReaders: make(map[cachedOpenReaderKey]cachedReader), - metrics: newNamespaceReaderManagerMetrics(namespaceScope), + filesetExistsFn: fs.DataFileSetExists, + newReaderFn: fs.NewReader, + namespace: namespace, + fsOpts: 
opts.CommitLogOptions().FilesystemOptions(),
+		bytesPool:       opts.BytesPool(),
+		logger:          opts.InstrumentOptions().Logger(),
+		openReaders:     make(map[cachedOpenReaderKey]cachedReader),
+		metrics:         newNamespaceReaderManagerMetrics(namespaceScope),
 	}
 }
 
@@ -166,8 +167,10 @@ func (m *namespaceReaderManager) filesetExistsAt(
 	shard uint32,
 	blockStart time.Time,
 ) (bool, error) {
-	return m.filesetExistsAtFn(m.fsOpts.FilePathPrefix(),
-		m.namespace.ID(), shard, blockStart)
+	// TODO(juchan): get the actual volume here.
+	vol := 0
+	return m.filesetExistsFn(m.fsOpts.FilePathPrefix(),
+		m.namespace.ID(), shard, blockStart, vol)
 }
 
 type cachedReaderForKeyResult struct {
diff --git a/src/dbnode/storage/shard.go b/src/dbnode/storage/shard.go
index 475c3187b7..9cee76c0db 100644
--- a/src/dbnode/storage/shard.go
+++ b/src/dbnode/storage/shard.go
@@ -1902,9 +1902,9 @@ func (s *dbShard) WarmFlush(
 		// happen first before cold flushes happen.
 		VolumeIndex: 0,
 		// We explicitly set delete if exists to false here as we track which
-		// filesets exists at bootstrap time so we should never encounter a time
-		// when we attempt to flush and a fileset already exists unless there is
-		// racing competing processes.
+		// filesets exist at bootstrap time so we should never encounter a time
+		// when we attempt to flush and a fileset already exists unless there
+		// are racing competing processes.
 		DeleteIfExists: false,
 	}
 	prepared, err := flushPreparer.PrepareData(prepareOpts)
From 9f69ff49e8d426e71a1f107bca6cb5047c3dc30a Mon Sep 17 00:00:00 2001
From: Justin Chan
Date: Fri, 14 Jun 2019 15:53:40 -0400
Subject: [PATCH 04/12] Bug fixes

---
 src/dbnode/persist/fs/files.go          | 12 +++++++-----
 src/dbnode/persist/fs/read.go           | 16 +++++++++-------
 src/dbnode/persist/fs/read_test.go      |  4 ++--
 src/dbnode/storage/namespace_readers.go | 11 ++++++-----
 4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/src/dbnode/persist/fs/files.go b/src/dbnode/persist/fs/files.go
index 7789c35778..b36d93fd92 100644
--- a/src/dbnode/persist/fs/files.go
+++ b/src/dbnode/persist/fs/files.go
@@ -1304,14 +1304,17 @@ func DataFileSetExists(
 	blockStart time.Time,
 	volume int,
 ) (bool, error) {
+	// This function is in some cases the bottleneck, thus, we use different
+	// logic to handle this case specifically (don't use DataFiles(...)).
 	shardDir := ShardDataDirPath(filePathPrefix, namespace, shard)
+
 	// Check fileset with volume first to optimize for non-legacy use case.
 	checkpointPath := filesetPathFromTimeAndIndex(shardDir, blockStart, volume, checkpointFileSuffix)
-	if volume != 0 {
-		return CompleteCheckpointFileExists(checkpointPath)
+	exists, err := CompleteCheckpointFileExists(checkpointPath)
+	if err == nil && exists {
+		return true, nil
 	}
 
-	// Check legacy format next only if volume is 0.
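// A sketch of the probe order this fallback performs, with path shapes
// inferred from filesetPathFromTimeAndIndex and filesetPathFromTimeLegacy
// (placeholders, not literal paths):
//
//   1. <shardDir>/fileset-<blockStartUnixNanos>-<volume>-checkpoint.db
//   2. <shardDir>/fileset-<blockStartUnixNanos>-checkpoint.db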
checkpointPath = filesetPathFromTimeLegacy(shardDir, blockStart, checkpointFileSuffix) return CompleteCheckpointFileExists(checkpointPath) } @@ -1500,8 +1503,7 @@ func isFirstVolumeLegacy(prefix string, t time.Time, suffix string) (bool, error return true, nil } - return false, fmt.Errorf("first volume does not exist for prefix: %s, time: %v, suffix: %s", - prefix, t, suffix) + return false, ErrCheckpointFileNotFound } func dataFilesetPathFromTimeAndIndex( diff --git a/src/dbnode/persist/fs/read.go b/src/dbnode/persist/fs/read.go index 04815deb5f..ed986e6520 100644 --- a/src/dbnode/persist/fs/read.go +++ b/src/dbnode/persist/fs/read.go @@ -146,13 +146,6 @@ func (r *reader) Open(opts DataReaderOpenOptions) error { dataFilepath string ) - isLegacy := false - if volumeIndex == 0 { - isLegacy, err = isFirstVolumeLegacy(shardDir, blockStart, checkpointFileSuffix) - if err != nil { - return err - } - } switch opts.FileSetType { case persist.FileSetSnapshotType: shardDir = ShardSnapshotsDirPath(r.filePathPrefix, namespace, shard) @@ -164,6 +157,15 @@ func (r *reader) Open(opts DataReaderOpenOptions) error { dataFilepath = filesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, dataFileSuffix) case persist.FileSetFlushType: shardDir = ShardDataDirPath(r.filePathPrefix, namespace, shard) + + isLegacy := false + if volumeIndex == 0 { + isLegacy, err = isFirstVolumeLegacy(shardDir, blockStart, checkpointFileSuffix) + if err != nil { + return err + } + } + checkpointFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, checkpointFileSuffix, isLegacy) infoFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, infoFileSuffix, isLegacy) digestFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, digestFileSuffix, isLegacy) diff --git a/src/dbnode/persist/fs/read_test.go b/src/dbnode/persist/fs/read_test.go index 52c504ce74..573064d4d9 100644 --- a/src/dbnode/persist/fs/read_test.go +++ b/src/dbnode/persist/fs/read_test.go @@ -271,7 +271,7 @@ func TestReadNoCheckpointFile(t *testing.T) { var ( shardDir = ShardDataDirPath(filePathPrefix, testNs1ID, shard) - checkpointFile = filesetPathFromTimeLegacy(shardDir, testWriterStart, checkpointFileSuffix) + checkpointFile = dataFilesetPathFromTimeAndIndex(shardDir, testWriterStart, 0, checkpointFileSuffix, false) ) exists, err := CompleteCheckpointFileExists(checkpointFile) require.NoError(t, err) @@ -316,7 +316,7 @@ func testReadOpen(t *testing.T, fileData map[string][]byte) { assert.NoError(t, w.Close()) for suffix, data := range fileData { - digestFile := filesetPathFromTimeLegacy(shardDir, start, suffix) + digestFile := dataFilesetPathFromTimeAndIndex(shardDir, start, 0, suffix, false) fd, err := os.OpenFile(digestFile, os.O_WRONLY|os.O_TRUNC, os.FileMode(0666)) require.NoError(t, err) _, err = fd.Write(data) diff --git a/src/dbnode/storage/namespace_readers.go b/src/dbnode/storage/namespace_readers.go index d8d2820e75..b299726765 100644 --- a/src/dbnode/storage/namespace_readers.go +++ b/src/dbnode/storage/namespace_readers.go @@ -21,7 +21,6 @@ package storage import ( - "fmt" "sync" "time" @@ -256,15 +255,17 @@ func (m *namespaceReaderManager) get( // We have a closed reader from the cache (either a cached closed // reader or newly allocated, either way need to prepare it) reader := lookup.closedReader + // TODO(juchan): get the actual volume here. 
+ vol := 0 openOpts := fs.DataReaderOpenOptions{ Identifier: fs.FileSetFileIdentifier{ - Namespace: m.namespace.ID(), - Shard: shard, - BlockStart: blockStart, + Namespace: m.namespace.ID(), + Shard: shard, + BlockStart: blockStart, + VolumeIndex: vol, }, } if err := reader.Open(openOpts); err != nil { - fmt.Printf("j>> %+v\n", 2) return nil, err } From 6a195f7475c61a5eec01a8fc3297e308cf900ade Mon Sep 17 00:00:00 2001 From: Justin Chan Date: Fri, 14 Jun 2019 16:06:50 -0400 Subject: [PATCH 05/12] Rename function --- src/dbnode/integration/disk_cleanup_index_test.go | 2 +- src/dbnode/persist/fs/files.go | 8 ++++---- src/dbnode/persist/fs/files_test.go | 4 ++-- src/dbnode/storage/index.go | 2 +- src/dbnode/storage/shard.go | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/dbnode/integration/disk_cleanup_index_test.go b/src/dbnode/integration/disk_cleanup_index_test.go index 36b53ff855..d622a633b2 100644 --- a/src/dbnode/integration/disk_cleanup_index_test.go +++ b/src/dbnode/integration/disk_cleanup_index_test.go @@ -26,9 +26,9 @@ import ( "testing" "time" + "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/retention" - "github.com/m3db/m3/src/dbnode/namespace" xclock "github.com/m3db/m3/src/x/clock" "github.com/stretchr/testify/require" diff --git a/src/dbnode/persist/fs/files.go b/src/dbnode/persist/fs/files.go index b36d93fd92..96479971b7 100644 --- a/src/dbnode/persist/fs/files.go +++ b/src/dbnode/persist/fs/files.go @@ -980,8 +980,8 @@ func DeleteFileSetAt(filePathPrefix string, namespace ident.ID, shard uint32, bl return DeleteFiles(fileset.AbsoluteFilepaths) } -// DataFilePathsBefore returns all the flush data fileset paths whose timestamps are earlier than a given time. -func DataFilePathsBefore(filePathPrefix string, namespace ident.ID, shard uint32, t time.Time) ([]string, error) { +// DataFileSetsBefore returns all the flush data fileset paths whose timestamps are earlier than a given time. +func DataFileSetsBefore(filePathPrefix string, namespace ident.ID, shard uint32, t time.Time) ([]string, error) { matched, err := filesetFiles(filesetFilesSelector{ fileSetType: persist.FileSetFlushType, contentType: persist.FileSetDataContentType, @@ -996,8 +996,8 @@ func DataFilePathsBefore(filePathPrefix string, namespace ident.ID, shard uint32 return FilesBefore(matched.Filepaths(), t) } -// IndexFilePathsBefore returns all the flush index fileset paths whose timestamps are earlier than a given time. -func IndexFilePathsBefore(filePathPrefix string, namespace ident.ID, t time.Time) ([]string, error) { +// IndexFileSetsBefore returns all the flush index fileset paths whose timestamps are earlier than a given time. 
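// A minimal usage sketch from a caller's perspective (error handling elided;
// assumes the returned paths are fed to fs.DeleteFiles, matching how the
// storage package pairs its *FileSetsBefore helpers with deleteFilesFn):
//
//   paths, err := fs.IndexFileSetsBefore(filePathPrefix, nsID, earliestToRetain)
//   if err == nil {
//       err = fs.DeleteFiles(paths)
//   }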
+func IndexFileSetsBefore(filePathPrefix string, namespace ident.ID, t time.Time) ([]string, error) { matched, err := filesetFiles(filesetFilesSelector{ fileSetType: persist.FileSetFlushType, contentType: persist.FileSetIndexContentType, diff --git a/src/dbnode/persist/fs/files_test.go b/src/dbnode/persist/fs/files_test.go index 91db576886..b28243f1b9 100644 --- a/src/dbnode/persist/fs/files_test.go +++ b/src/dbnode/persist/fs/files_test.go @@ -475,7 +475,7 @@ func TestFileSetFilesBefore(t *testing.T) { cutoffIter := 8 cutoff := time.Unix(0, int64(cutoffIter)) - res, err := DataFilePathsBefore(dir, testNs1ID, shard, cutoff) + res, err := DataFileSetsBefore(dir, testNs1ID, shard, cutoff) require.NoError(t, err) require.Equal(t, cutoffIter, len(res)) @@ -1073,7 +1073,7 @@ func TestIndexFileSetsBefore(t *testing.T) { } files.create(t, dir) - results, err := IndexFilePathsBefore(dir, ns1, timeFor(3)) + results, err := IndexFileSetsBefore(dir, ns1, timeFor(3)) require.NoError(t, err) require.Len(t, results, 3) for _, res := range results { diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index c01764529a..4e0a60dfde 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -277,7 +277,7 @@ func newNamespaceIndexWithOptions( bufferFuture: nsMD.Options().RetentionOptions().BufferFuture(), coldWritesEnabled: nsMD.Options().ColdWritesEnabled(), - indexFilesetsBeforeFn: fs.IndexFilePathsBefore, + indexFilesetsBeforeFn: fs.IndexFileSetsBefore, deleteFilesFn: fs.DeleteFiles, newBlockFn: newBlockFn, diff --git a/src/dbnode/storage/shard.go b/src/dbnode/storage/shard.go index 9cee76c0db..ec161c7e95 100644 --- a/src/dbnode/storage/shard.go +++ b/src/dbnode/storage/shard.go @@ -269,7 +269,7 @@ func newDatabaseShard( newMergerFn: fs.NewMerger, newFSMergeWithMemFn: newFSMergeWithMem, filesetsFn: fs.DataFiles, - filesetPathsBeforeFn: fs.DataFilePathsBefore, + filesetPathsBeforeFn: fs.DataFileSetsBefore, deleteFilesFn: fs.DeleteFiles, snapshotFilesFn: fs.SnapshotFiles, sleepFn: time.Sleep, From 1516d46e499d82f8309a422fa62352fe94031838 Mon Sep 17 00:00:00 2001 From: Justin Chan Date: Fri, 14 Jun 2019 17:27:53 -0400 Subject: [PATCH 06/12] Fix more tests --- src/cmd/tools/dtest/util/seed/generator_test.go | 16 +++++----------- src/dbnode/persist/fs/files.go | 14 +++++++------- src/dbnode/persist/fs/index_lookup_prop_test.go | 6 +++--- .../bootstrapper/fs/source_data_test.go | 6 +++--- 4 files changed, 18 insertions(+), 24 deletions(-) diff --git a/src/cmd/tools/dtest/util/seed/generator_test.go b/src/cmd/tools/dtest/util/seed/generator_test.go index 364fb0da06..31a73196dc 100644 --- a/src/cmd/tools/dtest/util/seed/generator_test.go +++ b/src/cmd/tools/dtest/util/seed/generator_test.go @@ -128,18 +128,12 @@ func (t *fileInfoExtractor) visit(fPath string, f os.FileInfo, err error) error t.shards[uint32(shardNum)] = struct{}{} name := f.Name() - first := strings.Index(name, "-") - if first == -1 { - return fmt.Errorf("unable to find '-' in %v", name) + nameSplit := strings.Split(name, "-") + if len(nameSplit) < 2 { + return fmt.Errorf("unable to parse time from %v", name) } - last := strings.LastIndex(name, "-") - if last == -1 { - return fmt.Errorf("unable to find '-' in %v", name) - } - if first == last { - return fmt.Errorf("found only single '-' in %v", name) - } - num, parseErr := strconv.ParseInt(name[first+1:last], 10, 64) + + num, parseErr := strconv.ParseInt(nameSplit[1], 10, 64) if parseErr != nil { return err } diff --git a/src/dbnode/persist/fs/files.go 
b/src/dbnode/persist/fs/files.go index 96479971b7..be6594d674 100644 --- a/src/dbnode/persist/fs/files.go +++ b/src/dbnode/persist/fs/files.go @@ -633,13 +633,6 @@ func forEachInfoFile( t := matched[i].ID.BlockStart volume := matched[i].ID.VolumeIndex - isLegacy := false - if volume == 0 { - isLegacy, err = isFirstVolumeLegacy(dir, t, checkpointFileSuffix) - if err != nil { - continue - } - } var ( checkpointFilePath string digestsFilePath string @@ -649,6 +642,13 @@ func forEachInfoFile( case persist.FileSetFlushType: switch args.contentType { case persist.FileSetDataContentType: + isLegacy := false + if volume == 0 { + isLegacy, err = isFirstVolumeLegacy(dir, t, checkpointFileSuffix) + if err != nil { + continue + } + } checkpointFilePath = dataFilesetPathFromTimeAndIndex(dir, t, volume, checkpointFileSuffix, isLegacy) digestsFilePath = dataFilesetPathFromTimeAndIndex(dir, t, volume, digestFileSuffix, isLegacy) infoFilePath = dataFilesetPathFromTimeAndIndex(dir, t, volume, infoFileSuffix, isLegacy) diff --git a/src/dbnode/persist/fs/index_lookup_prop_test.go b/src/dbnode/persist/fs/index_lookup_prop_test.go index 55b185c9a5..e634381382 100644 --- a/src/dbnode/persist/fs/index_lookup_prop_test.go +++ b/src/dbnode/persist/fs/index_lookup_prop_test.go @@ -107,8 +107,8 @@ func TestIndexLookupWriteRead(t *testing.T) { } // Read the summaries file into memory - summariesFilePath := filesetPathFromTimeLegacy( - shardDirPath, testWriterStart, summariesFileSuffix) + summariesFilePath := dataFilesetPathFromTimeAndIndex( + shardDirPath, testWriterStart, 0, summariesFileSuffix, false) summariesFile, err := os.Open(summariesFilePath) if err != nil { return false, fmt.Errorf("err opening summaries file: %v, ", err) @@ -254,7 +254,7 @@ func genTagIdent() gopter.Gen { } func readIndexFileOffsets(shardDirPath string, numEntries int, start time.Time) (map[string]int64, error) { - indexFilePath := filesetPathFromTimeLegacy(shardDirPath, start, indexFileSuffix) + indexFilePath := dataFilesetPathFromTimeAndIndex(shardDirPath, start, 0, indexFileSuffix, false) buf, err := ioutil.ReadFile(indexFilePath) if err != nil { return nil, fmt.Errorf("err reading index file: %v, ", err) diff --git a/src/dbnode/storage/bootstrap/bootstrapper/fs/source_data_test.go b/src/dbnode/storage/bootstrap/bootstrapper/fs/source_data_test.go index cc3258c97c..95ca8f394e 100644 --- a/src/dbnode/storage/bootstrap/bootstrapper/fs/source_data_test.go +++ b/src/dbnode/storage/bootstrap/bootstrapper/fs/source_data_test.go @@ -107,19 +107,19 @@ func createTempDir(t *testing.T) string { func writeInfoFile(t *testing.T, prefix string, namespace ident.ID, shard uint32, start time.Time, data []byte) { shardDir := fs.ShardDataDirPath(prefix, namespace, shard) - filePath := path.Join(shardDir, fmt.Sprintf("fileset-%d-info.db", xtime.ToNanoseconds(start))) + filePath := path.Join(shardDir, fmt.Sprintf("fileset-%d-0-info.db", xtime.ToNanoseconds(start))) writeFile(t, filePath, data) } func writeDataFile(t *testing.T, prefix string, namespace ident.ID, shard uint32, start time.Time, data []byte) { shardDir := fs.ShardDataDirPath(prefix, namespace, shard) - filePath := path.Join(shardDir, fmt.Sprintf("fileset-%d-data.db", xtime.ToNanoseconds(start))) + filePath := path.Join(shardDir, fmt.Sprintf("fileset-%d-0-data.db", xtime.ToNanoseconds(start))) writeFile(t, filePath, data) } func writeDigestFile(t *testing.T, prefix string, namespace ident.ID, shard uint32, start time.Time, data []byte) { shardDir := fs.ShardDataDirPath(prefix, namespace, shard) 
- filePath := path.Join(shardDir, fmt.Sprintf("fileset-%d-digest.db", xtime.ToNanoseconds(start))) + filePath := path.Join(shardDir, fmt.Sprintf("fileset-%d-0-digest.db", xtime.ToNanoseconds(start))) writeFile(t, filePath, data) } From 83803bc78c645e31cb395e87563823c1c20dd301 Mon Sep 17 00:00:00 2001 From: Justin Chan Date: Tue, 18 Jun 2019 17:25:04 -0400 Subject: [PATCH 07/12] Redo filename parsing --- src/dbnode/persist/fs/files.go | 133 +++++++++++++++++------ src/dbnode/persist/fs/files_test.go | 8 +- src/dbnode/persist/fs/fs.go | 7 +- src/dbnode/persist/fs/merger.go | 4 +- src/dbnode/persist/fs/persist_manager.go | 4 +- 5 files changed, 111 insertions(+), 45 deletions(-) diff --git a/src/dbnode/persist/fs/files.go b/src/dbnode/persist/fs/files.go index be6594d674..05c2f80ea2 100644 --- a/src/dbnode/persist/fs/files.go +++ b/src/dbnode/persist/fs/files.go @@ -65,6 +65,7 @@ const ( // check for both indexed and non-indexed filenames. unindexedFilesetIndex = 0 + timeComponentPosition = 1 commitLogComponentPosition = 2 indexFileSetComponentPosition = 2 @@ -72,6 +73,8 @@ const ( numComponentsSnapshotMetadataCheckpointFile = 5 snapshotMetadataUUIDComponentPosition = 1 snapshotMetadataIndexComponentPosition = 2 + + errUnexpectedFilename = "unexpected filename: %s" ) var ( @@ -408,23 +411,62 @@ func (a fileSetFilesByTimeAndVolumeIndexAscending) Less(i, j int) bool { return ti.Equal(tj) && ii < ij } -func componentsAndTimeFromFileName(fname string) ([]string, time.Time, error) { - components := strings.Split(filepath.Base(fname), separator) - if len(components) < 3 { - return nil, timeZero, fmt.Errorf("unexpected file name %s", fname) +func intComponentFromFilename( + baseFilename string, + componentPos int, + separatorIdx [4]int, +) (int64, error) { + var start int + if componentPos > 0 { + start = separatorIdx[componentPos-1] + 1 + } + end := separatorIdx[componentPos] + if start > end || end > len(baseFilename)-1 || start < 0 { + return 0, fmt.Errorf(errUnexpectedFilename, baseFilename) } - nanoseconds, err := strconv.ParseInt(components[1], 10, 64) + num, err := strconv.ParseInt(baseFilename[start:end], 10, 64) if err != nil { - return nil, timeZero, err + return 0, fmt.Errorf(errUnexpectedFilename, baseFilename) } - return components, time.Unix(0, nanoseconds), nil + return num, nil +} + +func delimiterPositions(baseFilename string) ([4]int, int) { + var ( + delimPos [4]int + delimsFound int + ) + + for i := range baseFilename { + if r := baseFilename[i]; r == separatorRune || r == fileSuffixDelimeterRune { + delimPos[delimsFound] = i + delimsFound++ + + if delimsFound == len(delimPos) { + // Found the maximum expected number of separators. + break + } + } + } + + return delimPos, delimsFound } // TimeFromFileName extracts the block start time from file name. 
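// For example (filename borrowed from this package's tests):
//
//   t, err := TimeFromFileName("foo/bar/fileset-1465501321123456789-info.db")
//   // t == time.Unix(0, 1465501321123456789), err == nil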
func TimeFromFileName(fname string) (time.Time, error) { - _, t, err := componentsAndTimeFromFileName(fname) - return t, err + base := filepath.Base(fname) + + delims, delimsFound := delimiterPositions(base) + if delimsFound < 2 { + return timeZero, fmt.Errorf(errUnexpectedFilename, fname) + } + nanos, err := intComponentFromFilename(base, timeComponentPosition, delims) + if err != nil { + return timeZero, fmt.Errorf(errUnexpectedFilename, fname) + } + + return time.Unix(0, nanos), nil } // TimeAndIndexFromCommitlogFilename extracts the block start and index from @@ -435,30 +477,33 @@ func TimeAndIndexFromCommitlogFilename(fname string) (time.Time, int, error) { // TimeAndVolumeIndexFromDataFileSetFilename extracts the block start and volume // index from a data fileset file name that may or may not have an index. If the -// file name does not include an index, defaultFilesetIndex is returned as the +// file name does not include an index, unindexedFilesetIndex is returned as the // volume index. func TimeAndVolumeIndexFromDataFileSetFilename(fname string) (time.Time, int, error) { - components, t, err := componentsAndTimeFromFileName(fname) + base := filepath.Base(fname) + + delims, delimsFound := delimiterPositions(base) + if delimsFound < 3 { + return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) + } + + nanos, err := intComponentFromFilename(base, timeComponentPosition, delims) if err != nil { - return timeZero, 0, err + return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) } + unixNanos := time.Unix(0, nanos) - if len(components) == 3 { - // Three components mean that there is no index in the filename. - return t, unindexedFilesetIndex, nil + // Legacy filename with no volume index. + if delimsFound == 3 { + return unixNanos, unindexedFilesetIndex, nil } - if len(components) == 4 { - // Four components mean that there is an index in the filename. 
- str := strings.Replace(components[indexFileSetComponentPosition], fileSuffix, "", 1) - index, err := strconv.ParseInt(str, 10, 64) - if err != nil { - return timeZero, 0, err - } - return t, int(index), nil + volume, err := intComponentFromFilename(base, 2, delims) + if err != nil { + return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) } - return timeZero, 0, fmt.Errorf("malformed data fileset filename: %s", fname) + return unixNanos, int(volume), nil } // TimeAndVolumeIndexFromFileSetFilename extracts the block start and @@ -468,21 +513,25 @@ func TimeAndVolumeIndexFromFileSetFilename(fname string) (time.Time, int, error) } func timeAndIndexFromFileName(fname string, componentPosition int) (time.Time, int, error) { - components, t, err := componentsAndTimeFromFileName(fname) - if err != nil { - return timeZero, 0, err + base := filepath.Base(fname) + + delims, delimsFound := delimiterPositions(base) + if componentPosition > delimsFound { + return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) } - if componentPosition > len(components)-1 { - return timeZero, 0, fmt.Errorf("malformed filename: %s", fname) + nanos, err := intComponentFromFilename(base, 1, delims) + if err != nil { + return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) } + unixNanos := time.Unix(0, nanos) - str := strings.Replace(components[componentPosition], fileSuffix, "", 1) - index, err := strconv.ParseInt(str, 10, 64) + index, err := intComponentFromFilename(base, componentPosition, delims) if err != nil { - return timeZero, 0, err + return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) } - return t, int(index), nil + + return unixNanos, int(index), nil } // SnapshotTimeAndID returns the metadata for the snapshot. @@ -926,8 +975,17 @@ func FileSetAt(filePathPrefix string, namespace ident.ID, shard uint32, blockSta } matched.sortByTimeAndVolumeIndexAscending() - for _, fileset := range matched { + for i, fileset := range matched { if fileset.ID.BlockStart.Equal(blockStart) && fileset.ID.VolumeIndex == volume { + nextIdx := i + 1 + if nextIdx < len(matched) && matched[nextIdx].ID.BlockStart.Equal(blockStart) { + // Should never happen. + return FileSetFile{}, false, fmt.Errorf( + "found multiple fileset files for blockStart: %d", + blockStart.Unix(), + ) + } + if !fileset.HasCompleteCheckpointFile() { continue } @@ -1488,7 +1546,8 @@ func filesetPathFromTimeAndIndex(prefix string, t time.Time, index int, suffix s // isFirstVolumeLegacy returns whether the first volume of the provided type is // legacy, i.e. does not have a volume index in its filename. It only checks for // the existence of the file and the non-legacy version of the file, and returns -// and error if neither exist. +// an error if neither exist. Note that this function does not check for the +// volume's complete checkpoint file. func isFirstVolumeLegacy(prefix string, t time.Time, suffix string) (bool, error) { // Check non-legacy path first to optimize for newer files. path := filesetPathFromTimeAndIndex(prefix, t, 0, suffix) @@ -1506,6 +1565,10 @@ func isFirstVolumeLegacy(prefix string, t time.Time, suffix string) (bool, error return false, ErrCheckpointFileNotFound } +// Once we decide that we no longer want to support legacy (non-volume-indexed) +// filesets, we can remove this function and just use +// `filesetPathFromTimeAndIndex`. Getting code to compile and tests to pass +// after that should be a comprehensive way to remove dead code. 
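// For illustration, the two path shapes chosen between here look as follows
// (example timestamp, info file suffix, first volume):
//
//   isLegacy == true:  <prefix>/fileset-1465501321123456789-info.db
//   isLegacy == false: <prefix>/fileset-1465501321123456789-0-info.db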
func dataFilesetPathFromTimeAndIndex( prefix string, t time.Time, diff --git a/src/dbnode/persist/fs/files_test.go b/src/dbnode/persist/fs/files_test.go index b28243f1b9..f68fc968bc 100644 --- a/src/dbnode/persist/fs/files_test.go +++ b/src/dbnode/persist/fs/files_test.go @@ -215,7 +215,7 @@ func TestForEachInfoFile(t *testing.T) { func TestTimeFromFileName(t *testing.T) { _, err := TimeFromFileName("foo/bar") require.Error(t, err) - require.Equal(t, "unexpected file name foo/bar", err.Error()) + require.Equal(t, "unexpected filename: foo/bar", err.Error()) _, err = TimeFromFileName("foo/bar-baz") require.Error(t, err) @@ -239,7 +239,7 @@ func TestTimeFromFileName(t *testing.T) { func TestTimeAndIndexFromCommitlogFileName(t *testing.T) { _, _, err := TimeAndIndexFromCommitlogFilename("foo/bar") require.Error(t, err) - require.Equal(t, "unexpected file name foo/bar", err.Error()) + require.Equal(t, "unexpected filename: foo/bar", err.Error()) _, _, err = TimeAndIndexFromCommitlogFilename("foo/bar-baz") require.Error(t, err) @@ -264,7 +264,7 @@ func TestTimeAndIndexFromCommitlogFileName(t *testing.T) { func TestTimeAndVolumeIndexFromFileSetFilename(t *testing.T) { _, _, err := TimeAndVolumeIndexFromFileSetFilename("foo/bar") require.Error(t, err) - require.Equal(t, "unexpected file name foo/bar", err.Error()) + require.Equal(t, "unexpected filename: foo/bar", err.Error()) _, _, err = TimeAndVolumeIndexFromFileSetFilename("foo/bar-baz") require.Error(t, err) @@ -291,7 +291,7 @@ func TestTimeAndVolumeIndexFromFileSetFilename(t *testing.T) { func TestTimeAndVolumeIndexFromDataFileSetFilename(t *testing.T) { _, _, err := TimeAndVolumeIndexFromDataFileSetFilename("foo/bar") require.Error(t, err) - require.Equal(t, "unexpected file name foo/bar", err.Error()) + require.Equal(t, "unexpected filename: foo/bar", err.Error()) _, _, err = TimeAndVolumeIndexFromDataFileSetFilename("foo/bar-baz") require.Error(t, err) diff --git a/src/dbnode/persist/fs/fs.go b/src/dbnode/persist/fs/fs.go index a29a118393..ae3e727f6f 100644 --- a/src/dbnode/persist/fs/fs.go +++ b/src/dbnode/persist/fs/fs.go @@ -34,13 +34,16 @@ const ( filesetFilePrefix = "fileset" commitLogFilePrefix = "commitlog" segmentFileSetFilePrefix = "segment" - fileSuffix = ".db" + + fileSuffix = ".db" + fileSuffixDelimeterRune = '.' anyLowerCaseCharsPattern = "[a-z]*" anyNumbersPattern = "[0-9]*" anyLowerCaseCharsNumbersPattern = "[a-z0-9]*" - separator = "-" + separator = "-" + separatorRune = '-' infoFilePattern = filesetFilePrefix + separator + anyNumbersPattern + separator + infoFileSuffix + fileSuffix filesetFilePattern = filesetFilePrefix + separator + anyNumbersPattern + separator + anyLowerCaseCharsPattern + fileSuffix diff --git a/src/dbnode/persist/fs/merger.go b/src/dbnode/persist/fs/merger.go index e9310f9903..4e167d0fe9 100644 --- a/src/dbnode/persist/fs/merger.go +++ b/src/dbnode/persist/fs/merger.go @@ -192,10 +192,10 @@ func (m *merger) Merge( // First stage: loop through series on disk. for id, tagsIter, data, _, err := reader.Read(); err != io.EOF; id, tagsIter, data, _, err = reader.Read() { - idsToFinalize = append(idsToFinalize, id) if err != nil { return err } + idsToFinalize = append(idsToFinalize, id) // Reset BlockReaders. brs = brs[:0] @@ -220,10 +220,10 @@ func (m *merger) Merge( // are valid, and the IDs are valid for the duration of the file writing. 
tags, err := convert.TagsFromTagsIter(id, tagsIter, identPool) tagsIter.Close() - tagsToFinalize = append(tagsToFinalize, tags) if err != nil { return err } + tagsToFinalize = append(tagsToFinalize, tags) if err := persistIter(prepared.Persist, multiIter, startTime, id, tags, blockAllocSize, nsCtx.Schema, encoderPool); err != nil { return err diff --git a/src/dbnode/persist/fs/persist_manager.go b/src/dbnode/persist/fs/persist_manager.go index d7ce286125..ab343cec3c 100644 --- a/src/dbnode/persist/fs/persist_manager.go +++ b/src/dbnode/persist/fs/persist_manager.go @@ -424,8 +424,8 @@ func (pm *persistManager) PrepareData(opts persist.DataPrepareOptions) (persist. var volumeIndex int switch opts.FileSetType { case persist.FileSetFlushType: - // Use the volume index passed in. This ensures that this index is the - // same as the flush index. + // Use the volume index passed in. This ensures that the volume index is + // the same as the cold flush version. volumeIndex = opts.VolumeIndex case persist.FileSetSnapshotType: // Need to work out the volume index for the next snapshot. From ead04996ef13f402def5f6e768b54641fe4744ef Mon Sep 17 00:00:00 2001 From: Justin Chan Date: Tue, 18 Jun 2019 17:33:47 -0400 Subject: [PATCH 08/12] Regen mocks --- src/dbnode/storage/storage_mock.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/dbnode/storage/storage_mock.go b/src/dbnode/storage/storage_mock.go index 4c048812fb..4431f7bdb0 100644 --- a/src/dbnode/storage/storage_mock.go +++ b/src/dbnode/storage/storage_mock.go @@ -1789,6 +1789,20 @@ func (mr *MockdatabaseShardMockRecorder) CleanupExpiredFileSets(earliestToRetain return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CleanupExpiredFileSets", reflect.TypeOf((*MockdatabaseShard)(nil).CleanupExpiredFileSets), earliestToRetain) } +// CleanupCompactedFileSets mocks base method +func (m *MockdatabaseShard) CleanupCompactedFileSets() error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CleanupCompactedFileSets") + ret0, _ := ret[0].(error) + return ret0 +} + +// CleanupCompactedFileSets indicates an expected call of CleanupCompactedFileSets +func (mr *MockdatabaseShardMockRecorder) CleanupCompactedFileSets() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CleanupCompactedFileSets", reflect.TypeOf((*MockdatabaseShard)(nil).CleanupCompactedFileSets)) +} + // Repair mocks base method func (m *MockdatabaseShard) Repair(ctx context.Context, nsCtx namespace.Context, tr time0.Range, repairer databaseShardRepairer) (repair.MetadataComparisonResult, error) { m.ctrl.T.Helper() From 40f98f145da45950651b4f441c1513e4d2621b82 Mon Sep 17 00:00:00 2001 From: Justin Chan Date: Wed, 19 Jun 2019 10:17:22 -0400 Subject: [PATCH 09/12] Use block lease manager to open correct volume in seeker --- src/dbnode/persist/fs/files.go | 58 ++++++++++++++++----------- src/dbnode/persist/fs/seek_manager.go | 30 ++++++-------- 2 files changed, 48 insertions(+), 40 deletions(-) diff --git a/src/dbnode/persist/fs/files.go b/src/dbnode/persist/fs/files.go index 05c2f80ea2..c664a3f315 100644 --- a/src/dbnode/persist/fs/files.go +++ b/src/dbnode/persist/fs/files.go @@ -58,6 +58,10 @@ const ( snapshotDirName = "snapshots" commitLogsDirName = "commitlogs" + // The maximum number of delimeters ('-' or '.') that is expected in a + // (base) filename. + maxDelimNum = 4 + // The volume index assigned to (legacy) filesets that don't have a volume // number in their filename. 
// NOTE: Since this index is the same as the index for the first @@ -411,30 +415,14 @@ func (a fileSetFilesByTimeAndVolumeIndexAscending) Less(i, j int) bool { return ti.Equal(tj) && ii < ij } -func intComponentFromFilename( - baseFilename string, - componentPos int, - separatorIdx [4]int, -) (int64, error) { - var start int - if componentPos > 0 { - start = separatorIdx[componentPos-1] + 1 - } - end := separatorIdx[componentPos] - if start > end || end > len(baseFilename)-1 || start < 0 { - return 0, fmt.Errorf(errUnexpectedFilename, baseFilename) - } - - num, err := strconv.ParseInt(baseFilename[start:end], 10, 64) - if err != nil { - return 0, fmt.Errorf(errUnexpectedFilename, baseFilename) - } - return num, nil -} - -func delimiterPositions(baseFilename string) ([4]int, int) { +// Returns the positions of filename delimiters ('-' and '.') and the number of +// delimeters found, to be used in conjunction with the intComponentFromFilename +// function to extract filename components. This function is deliberately +// optimized for speed and lack of allocationsm, since filename parsing is known +// to account for a significant proportion of sytstem resources. +func delimiterPositions(baseFilename string) ([maxDelimNum]int, int) { var ( - delimPos [4]int + delimPos [maxDelimNum]int delimsFound int ) @@ -453,6 +441,30 @@ func delimiterPositions(baseFilename string) ([4]int, int) { return delimPos, delimsFound } +// Returns the the specified component of a filename given delimeter positions. +// Our only use cases involve extracting numeric components, so this function +// assumes this and returns the component as an int64. +func intComponentFromFilename( + baseFilename string, + componentPos int, + delimPos [maxDelimNum]int, +) (int64, error) { + var start int + if componentPos > 0 { + start = delimPos[componentPos-1] + 1 + } + end := delimPos[componentPos] + if start > end || end > len(baseFilename)-1 || start < 0 { + return 0, fmt.Errorf(errUnexpectedFilename, baseFilename) + } + + num, err := strconv.ParseInt(baseFilename[start:end], 10, 64) + if err != nil { + return 0, fmt.Errorf(errUnexpectedFilename, baseFilename) + } + return num, nil +} + // TimeFromFileName extracts the block start time from file name. func TimeFromFileName(fname string) (time.Time, error) { base := filepath.Base(fname) diff --git a/src/dbnode/persist/fs/seek_manager.go b/src/dbnode/persist/fs/seek_manager.go index 664e00ec85..b231f0719a 100644 --- a/src/dbnode/persist/fs/seek_manager.go +++ b/src/dbnode/persist/fs/seek_manager.go @@ -397,9 +397,17 @@ func (m *seekerManager) newOpenSeeker( shard uint32, blockStart time.Time, ) (DataFileSetSeeker, error) { - // TODO(juchan): get the actual volume here. - vol := 0 - exists, err := DataFileSetExists(m.filePathPrefix, m.namespace, shard, blockStart, vol) + blm := m.blockRetrieverOpts.BlockLeaseManager() + state, err := blm.OpenLatestLease(m, block.LeaseDescriptor{ + Namespace: m.namespace, + Shard: shard, + BlockStart: blockStart, + }) + if err != nil { + return nil, fmt.Errorf("err opening latest lease: %v", err) + } + + exists, err := DataFileSetExists(m.filePathPrefix, m.namespace, shard, blockStart, state.Volume) if err != nil { return nil, err } @@ -426,20 +434,8 @@ func (m *seekerManager) newOpenSeeker( // Set the unread buffer to reuse it amongst all seekers. 
seeker.setUnreadBuffer(m.unreadBuf.value)
 
-	var (
-		resources = m.getSeekerResources()
-		blm       = m.blockRetrieverOpts.BlockLeaseManager()
-	)
-	_, err = blm.OpenLatestLease(m, block.LeaseDescriptor{
-		Namespace:  m.namespace,
-		Shard:      shard,
-		BlockStart: blockStart,
-	})
-	if err != nil {
-		return nil, fmt.Errorf("err opening latest lease: %v", err)
-	}
-
-	err = seeker.Open(m.namespace, shard, blockStart, volume, resources)
+	resources := m.getSeekerResources()
+	err = seeker.Open(m.namespace, shard, blockStart, state.Volume, resources)
 	m.putSeekerResources(resources)
 	if err != nil {
 		return nil, err

From 00384ffe17f1ea71c25f6d03c6927d9c4331e795 Mon Sep 17 00:00:00 2001
From: Justin Chan
Date: Wed, 19 Jun 2019 12:02:37 -0400
Subject: [PATCH 10/12] Fix typos

---
 src/dbnode/persist/fs/files.go | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/dbnode/persist/fs/files.go b/src/dbnode/persist/fs/files.go
index c664a3f315..50ee3c6de7 100644
--- a/src/dbnode/persist/fs/files.go
+++ b/src/dbnode/persist/fs/files.go
@@ -418,8 +418,8 @@ func (a fileSetFilesByTimeAndVolumeIndexAscending) Less(i, j int) bool {
 // Returns the positions of filename delimiters ('-' and '.') and the number of
 // delimeters found, to be used in conjunction with the intComponentFromFilename
 // function to extract filename components. This function is deliberately
-// optimized for speed and lack of allocationsm, since filename parsing is known
-// to account for a significant proportion of sytstem resources.
+// optimized for speed and lack of allocations, since filename parsing is known
+// to account for a significant proportion of system resources.
 func delimiterPositions(baseFilename string) ([maxDelimNum]int, int) {
 	var (
 		delimPos    [maxDelimNum]int
@@ -441,9 +441,10 @@ func delimiterPositions(baseFilename string) ([maxDelimNum]int, int) {
 	return delimPos, delimsFound
 }
 
-// Returns the the specified component of a filename given delimeter positions.
-// Our only use cases involve extracting numeric components, so this function
-// assumes this and returns the component as an int64.
+// Returns the specified component of a filename, given the positions of
+// delimiters. Our only use cases for this involve extracting numeric
+// components, so this function assumes this and returns the component as an
+// int64.
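// A worked example with a hypothetical filename: for
// "fileset-1565213700000000000-0-data.db", delimiterPositions reports
// delimiters at indices 7, 27, 29, and 34, so component 1 is the block start
// in nanoseconds ("1565213700000000000") and component 2 is the volume
// index ("0").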
 func intComponentFromFilename(
 baseFilename string,
 componentPos int,

From 1bb67df8a8b34eedb02f145824a69c8781040dd8 Mon Sep 17 00:00:00 2001
From: Justin Chan
Date: Thu, 20 Jun 2019 15:14:25 -0400
Subject: [PATCH 11/12] Pass in volume to persist to merger

---
 src/dbnode/persist/fs/files.go       | 67 ++++++++++++++++------------
 src/dbnode/persist/fs/merger.go      |  3 +-
 src/dbnode/persist/fs/merger_test.go |  2 +-
 src/dbnode/persist/fs/types.go       |  1 +
 src/dbnode/storage/shard.go          | 25 +++++------
 src/dbnode/storage/shard_test.go     |  1 +
 6 files changed, 56 insertions(+), 43 deletions(-)

diff --git a/src/dbnode/persist/fs/files.go b/src/dbnode/persist/fs/files.go
index 50ee3c6de7..f5dc1d2d27 100644
--- a/src/dbnode/persist/fs/files.go
+++ b/src/dbnode/persist/fs/files.go
@@ -72,13 +72,14 @@ const (
 timeComponentPosition = 1
 commitLogComponentPosition = 2
 indexFileSetComponentPosition = 2
+ dataFileSetComponentPosition = 2
 
 numComponentsSnapshotMetadataFile = 4
 numComponentsSnapshotMetadataCheckpointFile = 5
 snapshotMetadataUUIDComponentPosition = 1
 snapshotMetadataIndexComponentPosition = 2
 
- errUnexpectedFilename = "unexpected filename: %s"
+ errUnexpectedFilenamePattern = "unexpected filename: %s"
 )
 
 var (
@@ -416,10 +417,11 @@ func (a fileSetFilesByTimeAndVolumeIndexAscending) Less(i, j int) bool {
 }
 
 // Returns the positions of filename delimiters ('-' and '.') and the number of
-// delimiters found, to be used in conjunction with the intComponentFromFilename
+// delimiters found, to be used in conjunction with the intComponentAtIndex
 // function to extract filename components. This function is deliberately
-// optimized for speed and lack of allocations, since filename parsing is known
-// to account for a significant proportion of system resources.
+// optimized for speed and lack of allocations, since allocation-heavy filename
+// parsing can quickly become a large source of allocations in the entire
+// system, especially when namespaces with long retentions are configured.
 func delimiterPositions(baseFilename string) ([maxDelimNum]int, int) {
 var (
 delimPos [maxDelimNum]int
@@ -445,23 +447,23 @@ func delimiterPositions(baseFilename string) ([maxDelimNum]int, int) {
 return delimPos, delimsFound
 }
 
 // Returns the specified component of a filename, given the positions of
 // delimiters. Our only use cases for this involve extracting numeric
 // components, so this function assumes this and returns the component as an
 // int64.
-func intComponentFromFilename(
+func intComponentAtIndex(
 baseFilename string,
 componentPos int,
 delimPos [maxDelimNum]int,
 ) (int64, error) {
- var start int
+ start := 0
 if componentPos > 0 {
 start = delimPos[componentPos-1] + 1
 }
 end := delimPos[componentPos]
 if start > end || end > len(baseFilename)-1 || start < 0 {
- return 0, fmt.Errorf(errUnexpectedFilename, baseFilename)
+ return 0, fmt.Errorf(errUnexpectedFilenamePattern, baseFilename)
 }
 
 num, err := strconv.ParseInt(baseFilename[start:end], 10, 64)
 if err != nil {
- return 0, fmt.Errorf(errUnexpectedFilename, baseFilename)
+ return 0, fmt.Errorf(errUnexpectedFilenamePattern, baseFilename)
 }
 return num, nil
 }
@@ -471,12 +473,15 @@ func TimeFromFileName(fname string) (time.Time, error) {
 base := filepath.Base(fname)
 
 delims, delimsFound := delimiterPositions(base)
- if delimsFound < 2 {
- return timeZero, fmt.Errorf(errUnexpectedFilename, fname)
+ // There technically only needs to be two delimiters here since the time
+ // component is in index 1. However, all DB files have a minimum of three
+ // delimiters, so check for that instead.
+ if delimsFound < 3 { + return timeZero, fmt.Errorf(errUnexpectedFilenamePattern, fname) } - nanos, err := intComponentFromFilename(base, timeComponentPosition, delims) + nanos, err := intComponentAtIndex(base, timeComponentPosition, delims) if err != nil { - return timeZero, fmt.Errorf(errUnexpectedFilename, fname) + return timeZero, fmt.Errorf(errUnexpectedFilenamePattern, fname) } return time.Unix(0, nanos), nil @@ -497,12 +502,12 @@ func TimeAndVolumeIndexFromDataFileSetFilename(fname string) (time.Time, int, er delims, delimsFound := delimiterPositions(base) if delimsFound < 3 { - return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) + return timeZero, 0, fmt.Errorf(errUnexpectedFilenamePattern, fname) } - nanos, err := intComponentFromFilename(base, timeComponentPosition, delims) + nanos, err := intComponentAtIndex(base, timeComponentPosition, delims) if err != nil { - return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) + return timeZero, 0, fmt.Errorf(errUnexpectedFilenamePattern, fname) } unixNanos := time.Unix(0, nanos) @@ -511,9 +516,9 @@ func TimeAndVolumeIndexFromDataFileSetFilename(fname string) (time.Time, int, er return unixNanos, unindexedFilesetIndex, nil } - volume, err := intComponentFromFilename(base, 2, delims) + volume, err := intComponentAtIndex(base, dataFileSetComponentPosition, delims) if err != nil { - return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) + return timeZero, 0, fmt.Errorf(errUnexpectedFilenamePattern, fname) } return unixNanos, int(volume), nil @@ -530,18 +535,18 @@ func timeAndIndexFromFileName(fname string, componentPosition int) (time.Time, i delims, delimsFound := delimiterPositions(base) if componentPosition > delimsFound { - return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) + return timeZero, 0, fmt.Errorf(errUnexpectedFilenamePattern, fname) } - nanos, err := intComponentFromFilename(base, 1, delims) + nanos, err := intComponentAtIndex(base, 1, delims) if err != nil { - return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) + return timeZero, 0, fmt.Errorf(errUnexpectedFilenamePattern, fname) } unixNanos := time.Unix(0, nanos) - index, err := intComponentFromFilename(base, componentPosition, delims) + index, err := intComponentAtIndex(base, componentPosition, delims) if err != nil { - return timeZero, 0, fmt.Errorf(errUnexpectedFilename, fname) + return timeZero, 0, fmt.Errorf(errUnexpectedFilenamePattern, fname) } return unixNanos, int(index), nil @@ -1375,8 +1380,12 @@ func DataFileSetExists( blockStart time.Time, volume int, ) (bool, error) { - // This function is in some cases the bottleneck, thus, we use different - // logic to handle this case specifically (don't use DataFiles(...)). + // This function can easily become a performance bottleneck if the + // implementation is slow or requires scanning directories with a large + // number of files in them (as is common if namespaces with long retentions + // are configured). As a result, instead of using existing helper functions, + // it implements an optimized code path that only involves checking if a few + // specific files exist and contain the correct contents. shardDir := ShardDataDirPath(filePathPrefix, namespace, shard) // Check fileset with volume first to optimize for non-legacy use case. @@ -1402,6 +1411,8 @@ func SnapshotFileSetExistsAt(prefix string, namespace ident.ID, shard uint32, bl return false, nil } + // LatestVolumeForBlock checks for a complete checkpoint file, so we don't + // need to recheck it here. 
 return true, nil
 }
 
@@ -1557,10 +1568,10 @@ func filesetPathFromTimeAndIndex(prefix string, t time.Time, index int, suffix s
 }
 
 // isFirstVolumeLegacy returns whether the first volume of the provided type is
-// legacy, i.e. does not have a volume index in its filename. It only checks for
-// the existence of the file and the non-legacy version of the file, and returns
-// an error if neither exist. Note that this function does not check for the
-// volume's complete checkpoint file.
+// legacy, i.e. does not have a volume index in its filename. Callers expect
+// either a legacy or a non-legacy file to exist, so this function returns an
+// error if neither exists. Note that this function does not check for the
+// volume's complete checkpoint file.
 func isFirstVolumeLegacy(prefix string, t time.Time, suffix string) (bool, error) {
 // Check non-legacy path first to optimize for newer files.
 path := filesetPathFromTimeAndIndex(prefix, t, 0, suffix)
diff --git a/src/dbnode/persist/fs/merger.go b/src/dbnode/persist/fs/merger.go
index 4e167d0fe9..a7f9f34901 100644
--- a/src/dbnode/persist/fs/merger.go
+++ b/src/dbnode/persist/fs/merger.go
@@ -82,6 +82,7 @@ func NewMerger(
 func (m *merger) Merge(
 fileID FileSetFileIdentifier,
 mergeWith MergeWith,
+ nextVolumeIndex int,
 flushPreparer persist.FlushPreparer,
 nsCtx namespace.Context,
 ) (err error) {
@@ -130,7 +131,7 @@ func (m *merger) Merge(
 NamespaceMetadata: nsMd,
 Shard: shard,
 BlockStart: startTime,
- VolumeIndex: fileID.VolumeIndex + 1,
+ VolumeIndex: nextVolumeIndex,
 FileSetType: persist.FileSetFlushType,
 DeleteIfExists: false,
 }
diff --git a/src/dbnode/persist/fs/merger_test.go b/src/dbnode/persist/fs/merger_test.go
index 48f6636c30..b1ec374638 100644
--- a/src/dbnode/persist/fs/merger_test.go
+++ b/src/dbnode/persist/fs/merger_test.go
@@ -453,7 +453,7 @@ func testMergeWith(
 BlockStart: startTime,
 }
 mergeWith := mockMergeWithFromData(t, ctrl, diskData, mergeTargetData)
- err := merger.Merge(fsID, mergeWith, preparer, nsCtx)
+ err := merger.Merge(fsID, mergeWith, 1, preparer, nsCtx)
 require.NoError(t, err)
 
 assertPersistedAsExpected(t, persisted, expectedData)
diff --git a/src/dbnode/persist/fs/types.go b/src/dbnode/persist/fs/types.go
index 5db2b78542..f0050cdea0 100644
--- a/src/dbnode/persist/fs/types.go
+++ b/src/dbnode/persist/fs/types.go
@@ -533,6 +533,7 @@ type Merger interface {
 Merge(
 fileID FileSetFileIdentifier,
 mergeWith MergeWith,
+ nextVolumeIndex int,
 flushPreparer persist.FlushPreparer,
 nsCtx namespace.Context,
 ) error
diff --git a/src/dbnode/storage/shard.go b/src/dbnode/storage/shard.go
index ec161c7e95..345c72e6bf 100644
--- a/src/dbnode/storage/shard.go
+++ b/src/dbnode/storage/shard.go
@@ -1903,8 +1903,8 @@ func (s *dbShard) WarmFlush(
 VolumeIndex: 0,
 // We explicitly set delete if exists to false here as we track which
 // filesets exist at bootstrap time so we should never encounter a time
- // when we attempt to flush and a fileset that already exists unless
- // there are racing competing processes.
+ // where a fileset already exists when we attempt to flush unless there
+ // is a bug in the code.
 DeleteIfExists: false,
 }
 prepared, err := flushPreparer.PrepareData(prepareOpts)
@@ -2011,18 +2011,17 @@ func (s *dbShard) ColdFlush(
 VolumeIndex: coldVersion,
 }
 
- err := merger.Merge(fsID, mergeWithMem, flushPreparer, nsCtx)
+ nextVersion := coldVersion + 1
+ err := merger.Merge(fsID, mergeWithMem, nextVersion, flushPreparer, nsCtx)
 if err != nil {
 multiErr = multiErr.Add(err)
 continue
 }
 
 // After writing the full block successfully, update the cold version
- // in the flush state.
- nextVersion := coldVersion + 1
-
- // Once this function completes block leasers will no longer be able to acquire
- // leases on previous volumes for the given namespace/shard/blockstart.
+ // in the flush state. Once this function completes, block leasers will
+ // no longer be able to acquire leases on previous volumes for the given
+ // namespace/shard/blockstart.
 s.setFlushStateColdVersion(startTime, nextVersion)
 
 // Notify all block leasers that a new volume for the namespace/shard/blockstart
@@ -2186,19 +2185,19 @@ func (s *dbShard) CleanupCompactedFileSets() error {
 filePathPrefix, s.namespace.ID(), s.ID(), err)
 }
 // Get a snapshot of all states here to prevent constantly getting/releasing
- // locks in a tight loop below.
+ // locks in a tight loop below. This snapshot won't become stale halfway
+ // through the loop because flushing and cleanup never happen in parallel.
 blockStates := s.BlockStatesSnapshot()
- // Filter without allocating by using same backing array.
- compacted := filesets[:0]
+ toDelete := fs.FileSetFilesSlice(make([]fs.FileSetFile, 0, len(filesets)))
 for _, datafile := range filesets {
 fileID := datafile.ID
 blockState := blockStates[xtime.ToUnixNano(fileID.BlockStart)]
 if fileID.VolumeIndex < blockState.ColdVersion {
- compacted = append(compacted, datafile)
+ toDelete = append(toDelete, datafile)
 }
 }
- return s.deleteFilesFn(compacted.Filepaths())
+ return s.deleteFilesFn(toDelete.Filepaths())
 }
 
 func (s *dbShard) Repair(
diff --git a/src/dbnode/storage/shard_test.go b/src/dbnode/storage/shard_test.go
index 28b0a6e6ff..9be40f9a03 100644
--- a/src/dbnode/storage/shard_test.go
+++ b/src/dbnode/storage/shard_test.go
@@ -425,6 +425,7 @@ type noopMerger struct{}
 func (m *noopMerger) Merge(
 fileID fs.FileSetFileIdentifier,
 mergeWith fs.MergeWith,
+ nextVersion int,
 flushPreparer persist.FlushPreparer,
 nsCtx namespace.Context,
 ) error {

From 01907017a479fbcadb493bc1277683cb0106acf0 Mon Sep 17 00:00:00 2001
From: Justin Chan
Date: Fri, 21 Jun 2019 12:12:19 -0400
Subject: [PATCH 12/12] Add changelog explaining backward-/forward-compatibility

---
 CHANGELOG.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d0cde64516..8c9d2b10b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,22 @@
 # Changelog

# 0.11.0 (pending)

## Migration Disclaimer

Version 0.11.0 of M3 includes further work on supporting writes at arbitrary times (within the retention period). While most of these changes are transparent to the user in terms of functionality and performance, we had to make a change to the naming format of the files that get persisted to disk (#1720). This change was required to handle multiple fileset volumes per block, which became necessary with the introduction of "cold flushes" (#1624).

The redesign is **backwards compatible** but not **forwards compatible**. This means that you should be able to upgrade your < 0.11.0 clusters to 0.11.0 with no issues, but you will not be able to downgrade without taking some additional steps.
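To make the naming change concrete: the only on-disk difference is a new volume index component between the block start and the file suffix. The sketch below is a simplified, illustrative version of the allocation-free `delimiterPositions` / `intComponentAtIndex` approach this change adds (not the exact production code); it classifies a data fileset filename and reports legacy names as volume `-1`, mirroring the `unindexedFilesetIndex` sentinel:

```go
package main

import (
	"fmt"
	"strconv"
)

// volumeFromName returns the volume index embedded in a data fileset
// filename, or -1 for legacy (pre-0.11.0) names that carry no volume
// component. Like the helpers in files.go, it records the positions of the
// '-' and '.' delimiters in a single pass and parses the bytes between two
// of them, avoiding string splitting and the allocations it causes.
func volumeFromName(base string) (int64, error) {
	var (
		delims      [4]int
		delimsFound int
	)
	for i := 0; i < len(base) && delimsFound < 4; i++ {
		if base[i] == '-' || base[i] == '.' {
			delims[delimsFound] = i
			delimsFound++
		}
	}
	if delimsFound < 4 {
		// Only three delimiters: a legacy fileset-<blockstart>-data.db
		// name with no volume component.
		return -1, nil
	}
	// Four delimiters: fileset-<blockstart>-<volume>-data.db; the volume
	// sits between the second and third delimiters.
	return strconv.ParseInt(base[delims[1]+1:delims[2]], 10, 64)
}

func main() {
	for _, name := range []string{
		"fileset-1257890400000000000-0-data.db", // new volumed format
		"fileset-1257890400000000000-data.db",   // original format
	} {
		vol, err := volumeFromName(name)
		fmt.Println(name, "->", vol, err)
	}
}
```

Files that report a real volume index (the first case above) are exactly the ones that step 2 of the rollback instructions below says to remove.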
### Troubleshooting and Rolling Back

If you run into any issues with the upgrade or need to downgrade to a previous version for any reason, follow these steps:

1. Stop the node that is having trouble with the upgrade or that you're trying to downgrade.
2. Remove the filesets that include a volume index in their filenames. For example, `fileset-1257890400000000000-0-data.db` is a filename in the new volumed format, and `fileset-1257890400000000000-data.db` is the corresponding filename in the original format.
3. Modify the `bootstrappers` config in the M3DB YAML file from `filesystem, commitlog, peers, uninitialized_topology` to `filesystem, peers, commitlog, uninitialized_topology`. This will force the node to bootstrap from its peers instead of the local snapshot and commitlog files it has on disk, which is important because otherwise, when the node restarts, it will think that it has already been bootstrapped.
4. Turn the node back on.

 # 0.10.2
 
 ## Performance