Merge #66876
66876: backupccl: buffer/sort returned files before flushing to remote r=dt a=dt

BACKUP has multiple workers sending export requests concurrently, so the
returned files may arrive out of order. Previously this would always
force the writer that flushes returned files out to the remote file to
close the file it had been writing and open a new one, as the contents
of any one file must be in-order. This adds a small queue to the sink:
files handed to the sink for writing to remote storage are first
buffered in this queue. Once the queue has accumulated enough files or
bytes, it is sorted and partially drained to the remote file. This
should increase the odds that files are added to the remote file
in-order and thus do not require closing and opening additional remote
files.

With the default number of workers (3) on a tpcc5k cluster running an
incremental backup, this was observed to reduce the number of files a
node wrote from ~50-70 to <10.

Release note: none.
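
For illustration, here is a minimal, self-contained sketch of the
buffer-sort-and-partially-drain idea described above. The file type, push
helper, and maxQueueFiles limit are hypothetical stand-ins for this demo, not
the actual backupccl code (which also enforces a byte-size threshold):

package main

import (
	"fmt"
	"sort"
)

// file is a hypothetical stand-in for a returned backup file: a span
// start key plus the size of the data to append to the remote file.
type file struct {
	key  string
	size int
}

// maxQueueFiles uses a tiny limit so the demo drains quickly; the real
// sink queues up to 24 files or a configurable byte-size budget.
const maxQueueFiles = 4

func main() {
	var queue []file
	queueSize := 0

	push := func(f file) {
		queue = append(queue, f)
		queueSize += f.size
		if len(queue) >= maxQueueFiles {
			// Sort buffered files by key, then drain the lowest-keyed half.
			sort.Slice(queue, func(i, j int) bool { return queue[i].key < queue[j].key })
			drain := len(queue) / 2
			if drain < 1 {
				drain = 1
			}
			for _, d := range queue[:drain] {
				fmt.Println("write", d.key) // the real sink appends to the remote file here
				queueSize -= d.size
			}
			// Shift the remainder down and slice off the drained tail.
			copy(queue, queue[drain:])
			queue = queue[:len(queue)-drain]
		}
	}

	// Concurrent export workers return files out of order.
	for _, k := range []string{"c", "a", "d", "b", "f", "e"} {
		push(file{key: k, size: 1})
	}
	fmt.Println("still buffered:", len(queue), "files,", queueSize, "bytes")
}

Feeding this queue the out-of-order keys c, a, d, b, f, e produces writes of
a, b, c, d in key order, showing how buffering absorbs the re-ordering that
would otherwise force closing and re-opening the remote file.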

Co-authored-by: David Taylor <[email protected]>
craig[bot] and dt committed Jun 28, 2021
2 parents ff9ea89 + ee6fe3a commit 9785a7b
Showing 1 changed file with 84 additions and 7 deletions.
91 changes: 84 additions & 7 deletions pkg/ccl/backupccl/backup_processor.go
@@ -12,6 +12,7 @@ import (
"context"
"fmt"
"io"
"sort"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
@@ -79,8 +80,18 @@ var (
16<<20,
settings.NonNegativeInt,
)
smallFileBuffer = settings.RegisterByteSizeSetting(
"bulkio.backup.merge_file_buffer_size",
"size limit used when buffering backup files before merging them",
16<<20,
settings.NonNegativeInt,
)
)

// maxSinkQueueFiles is how many replies we'll queue up before flushing to allow
// some re-ordering, unless we hit smallFileBuffer size first.
const maxSinkQueueFiles = 24

const backupProcessorName = "backupDataProcessor"

// TODO(pbardea): It would be nice if we could add some DistSQL processor tests
@@ -444,6 +455,7 @@ func runBackupProcessor(
enc: spec.Encryption,
targetFileSize: targetFileSize,
progCh: progCh,
settings: &flowCtx.Cfg.Settings.SV,
}

defaultStore, err := flowCtx.Cfg.ExternalStorage(ctx, defaultConf)
@@ -486,7 +498,7 @@ func runBackupProcessor(
sink = defaultSink
}

if err := sink.write(ctx, res); err != nil {
if err := sink.push(ctx, res); err != nil {
return err
}
}
@@ -507,24 +519,41 @@ type sstSinkConf struct {
targetFileSize int64
enc *roachpb.FileEncryptionOptions
id base.SQLInstanceID
settings *settings.Values
}

type sstSink struct {
dest cloud.ExternalStorage
conf sstSinkConf

queue []returnedSST
queueSize int

sst storage.SSTWriter
ctx context.Context
cancel func()
out io.WriteCloser
outName string

flushedFiles []BackupManifest_File
flushedSize int64
flushedRevStart hlc.Timestamp
completedSpans int32

stats struct {
files int
flushes int
oooFlushes int
sizeFlushes int
spanGrows int
}
}

func (s *sstSink) Close() error {
if log.V(1) && s.ctx != nil {
log.Infof(s.ctx, "backup sst sink recv'd %d files, wrote %d (%d due to size, %d due to re-ordering), %d recv files extended prior span",
s.stats.files, s.stats.flushes, s.stats.sizeFlushes, s.stats.oooFlushes, s.stats.spanGrows)
}
if s.cancel != nil {
s.cancel()
}
@@ -534,10 +563,54 @@ func (s *sstSink) Close() error {
return nil
}

// push pushes one returned backup file into the sink. Returned files can arrive
// out of order, but must be written to an underlying file in-order or else a
// new underlying file has to be opened. The queue allows buffering up files and
// sorting them before pushing them to the underlying file to try to avoid this.
// When the queue length or sum of the data sizes in it exceeds thresholds the
// queue is sorted and the first half is flushed.
func (s *sstSink) push(ctx context.Context, resp returnedSST) error {
s.queue = append(s.queue, resp)
s.queueSize += len(resp.sst)

if len(s.queue) >= maxSinkQueueFiles || s.queueSize >= int(smallFileBuffer.Get(s.conf.settings)) {
sort.Slice(s.queue, func(i, j int) bool { return s.queue[i].f.Span.Key.Compare(s.queue[j].f.Span.Key) < 0 })

// Drain the first half.
drain := len(s.queue) / 2
if drain < 1 {
drain = 1
}
for i := range s.queue[:drain] {
if err := s.write(ctx, s.queue[i]); err != nil {
return err
}
s.queueSize -= len(s.queue[i].sst)
}

// Shift down the remainder of the queue and slice off the tail.
copy(s.queue, s.queue[drain:])
s.queue = s.queue[:len(s.queue)-drain]
}
return nil
}

func (s *sstSink) flush(ctx context.Context) error {
for i := range s.queue {
if err := s.write(ctx, s.queue[i]); err != nil {
return err
}
}
s.queue = nil
return s.flushFile(ctx)
}

func (s *sstSink) flushFile(ctx context.Context) error {
if s.out == nil {
return nil
}
s.stats.flushes++

if err := s.sst.Finish(); err != nil {
return err
}
@@ -565,6 +638,7 @@ func (s *sstSink) flush(ctx context.Context) error {
}

s.flushedFiles = nil
s.flushedSize = 0
s.flushedRevStart.Reset()
s.completedSpans = 0

@@ -593,14 +667,15 @@ func (s *sstSink) open(ctx context.Context) error {
}

func (s *sstSink) write(ctx context.Context, resp returnedSST) error {
s.stats.files++

span := resp.f.Span

// If this span starts before the last buffered span ended, we need to flush
// since it overlaps but SSTWriter demands writes in-order.
// TODO(dt): consider buffering resp until _next_ `write` to allow minimal
// reordering of writes to avoid extra flushes.
if len(s.flushedFiles) > 0 && span.Key.Compare(s.flushedFiles[len(s.flushedFiles)-1].Span.EndKey) < 0 {
if err := s.flush(ctx); err != nil {
s.stats.oooFlushes++
if err := s.flushFile(ctx); err != nil {
return err
}
}
@@ -648,18 +723,20 @@ func (s *sstSink) write(ctx context.Context, resp returnedSST) error {
s.flushedFiles[l].StartTime.EqOrdering(resp.f.StartTime) {
s.flushedFiles[l].Span.EndKey = span.EndKey
s.flushedFiles[l].EntryCounts.add(resp.f.EntryCounts)
s.stats.spanGrows++
} else {
f := resp.f
f.Path = s.outName
s.flushedFiles = append(s.flushedFiles, f)
}
s.flushedRevStart.Forward(resp.revStart)
s.completedSpans += resp.completedSpans
s.flushedSize += int64(len(resp.sst))

// If our accumulated SST is now big enough, flush it.
// TODO(dt): use the compressed size instead.
if s.sst.DataSize > s.conf.targetFileSize {
if err := s.flush(ctx); err != nil {
if s.flushedSize > s.conf.targetFileSize {
s.stats.sizeFlushes++
if err := s.flushFile(ctx); err != nil {
return err
}
}
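As a usage note, when verbose logging is enabled (log.V(1)), Close emits the
stats collected above as a single summary line; with hypothetical counts
(the numbers below are made up for illustration) it would look like:

backup sst sink recv'd 120 files, wrote 9 (4 due to size, 2 due to re-ordering), 37 recv files extended prior span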
