[aggregator] Update flush times after each flush #3890

Status: Merged (30 commits, Nov 6, 2021). The changes shown below are from 27 of the 30 commits.

Commits
d16d7a5
[aggregator] Update flush times after each flush
arnikola Oct 29, 2021
d5e2dd8
Merge branch 'master' into arnikola/update-flush
arnikola Nov 4, 2021
b92ad82
Fix up tests, deprecate the flushTimesPersistEvery option
arnikola Nov 4, 2021
3eb513d
Update tests
arnikola Nov 4, 2021
50e6859
Excise remaining FlushTimesPersistEvery
arnikola Nov 4, 2021
24b9e81
Fix tests, lint
arnikola Nov 4, 2021
b0caeaf
Filtering through eager shutdown flag for back-compat
arnikola Nov 4, 2021
4c02198
Add timeout metric for alerting
arnikola Nov 4, 2021
8694a72
lint
arnikola Nov 4, 2021
3e2040b
Merge branch 'master' into arnikola/update-flush
arnikola Nov 4, 2021
c3b92b7
Always eager close aggregator
arnikola Nov 4, 2021
d9e5191
Merge branch 'arnikola/update-flush' of github.com:m3db/m3 into arnik…
arnikola Nov 4, 2021
6537fa7
Remove logging
arnikola Nov 4, 2021
7b42594
Re-add FlushTimesPersistEvery option
arnikola Nov 5, 2021
284a47c
PR response
arnikola Nov 5, 2021
932a5ca
Do not have a default defaultFlushTimesPersistEvery, to match config
arnikola Nov 5, 2021
ec4100f
Touchups for clarity/saner defaults
arnikola Nov 5, 2021
6164727
Re-add param (oops)
arnikola Nov 5, 2021
5040f33
responsible
arnikola Nov 5, 2021
cf0c3bb
Merge branch 'master' into arnikola/update-flush
arnikola Nov 5, 2021
aefd482
Close
arnikola Nov 5, 2021
40346ba
Merge branch 'arnikola/update-flush' of github.com:m3db/m3 into arnik…
arnikola Nov 5, 2021
c282209
Merge branch 'master' into arnikola/update-flush
arnikola Nov 5, 2021
9ba8ff0
Merge branch 'arnikola/update-flush' of github.com:m3db/m3 into arnik…
arnikola Nov 5, 2021
6152b61
response
arnikola Nov 5, 2021
666503f
wip
arnikola Nov 5, 2021
a29e989
Test cleanup
arnikola Nov 5, 2021
278d7d2
Response
arnikola Nov 5, 2021
8b480d3
Merge branch 'master' into arnikola/update-flush
arnikola Nov 5, 2021
86dc55d
Merge branch 'master' into arnikola/update-flush
arnikola Nov 6, 2021
Files changed
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,10 @@
# Changelog

# Upcoming

## Performance
- **M3Aggregator**: Rework close and remove the `flushTimesPersistEvery` semantics in leader flushing in favour of always persisting shard flush times on a successful flush, for optimized graceful failovers. ([#3890](https://github.com/m3db/m3/pull/3890))

# 1.3.0

## Features
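To make the M3Aggregator changelog entry above concrete, here is a minimal Go sketch of the described behaviour: instead of persisting shard flush times on a separate `flushTimesPersistEvery` timer, the leader persists them immediately after each successful flush, so a follower promoted during failover sees up-to-date flush times. The names used here (`flushTimesStore`, `memStore`, `leaderFlusher`, `StoreSync`) are illustrative assumptions, not the actual m3 aggregator API.

```go
package main

import (
	"fmt"
	"time"
)

// flushTimesStore stands in for the etcd-backed flush times manager; only the
// behaviour relevant to this sketch is modelled.
type flushTimesStore interface {
	StoreSync(times map[uint32]time.Time) error
}

// memStore is an in-memory stand-in so the example runs on its own.
type memStore struct{ latest map[uint32]time.Time }

func (m *memStore) StoreSync(times map[uint32]time.Time) error {
	m.latest = times
	return nil
}

type leaderFlusher struct {
	store flushTimesStore
	times map[uint32]time.Time // last successful flush time per shard
}

// flushShard flushes one shard and, on success, records and persists the new
// flush time right away instead of waiting for a periodic persist loop, so a
// follower promoted to leader avoids re-flushing data that was already flushed.
func (f *leaderFlusher) flushShard(shardID uint32, flush func() error) error {
	if err := flush(); err != nil {
		return err // do not advance or persist the flush time on failure
	}
	f.times[shardID] = time.Now()
	return f.store.StoreSync(f.times)
}

func main() {
	f := &leaderFlusher{store: &memStore{}, times: map[uint32]time.Time{}}
	if err := f.flushShard(0, func() error { return nil }); err != nil {
		panic(err)
	}
	fmt.Println("persisted flush times:", f.times)
}
```

The apparent trade-off is one extra persist call per flush, in exchange for persisted flush times that lag at most the flush currently in progress when a failover happens.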
3 changes: 1 addition & 2 deletions scripts/docker-integration-tests/aggregator/m3aggregator.yml
@@ -177,8 +177,7 @@ aggregator:
maxJitterPercent: 0.5
- flushInterval: 1h
maxJitterPercent: 0.25
numWorkersPerCPU: 0.5
flushTimesPersistEvery: 10s
numWorkersPerCPU: 0.5
maxBufferSize: 5m
forcedFlushWindowSize: 10s
flush:
Original file line number Diff line number Diff line change
@@ -205,8 +205,7 @@ aggregator:
maxJitterPercent: 0.5
- flushInterval: 1h
maxJitterPercent: 0.25
numWorkersPerCPU: 0.5
flushTimesPersistEvery: 10s
numWorkersPerCPU: 0.5
maxBufferSize: 5m
forcedFlushWindowSize: 10s
flush:
125 changes: 24 additions & 101 deletions src/aggregator/aggregator/aggregator.go
@@ -23,7 +23,6 @@ package aggregator
import (
"context"
"errors"
"fmt"
"math"
"strconv"
"sync"
@@ -94,20 +93,6 @@ type Aggregator interface {
Close() error
}

type tickShardFn func(
shard *aggregatorShard,
perShardTickDuration time.Duration,
doneCh <-chan struct{},
) tickResult

func tickShard(
shard *aggregatorShard,
perShardTickDuration time.Duration,
doneCh <-chan struct{},
) tickResult {
return shard.Tick(perShardTickDuration, doneCh)
}

// aggregator stores aggregations of different types of metrics (e.g., counter,
// timer, gauges) and periodically flushes them out.
type aggregator struct {
@@ -116,7 +101,6 @@ type aggregator struct {
opts Options
nowFn clock.NowFn
shardFn sharding.ShardFn
tickShardFn tickShardFn
checkInterval time.Duration
placementManager PlacementManager
flushTimesManager FlushTimesManager
@@ -135,8 +119,7 @@ type aggregator struct {
currPlacement placement.Placement
currNumShards atomic.Int32
state aggregatorState
doneCh chan struct{}
wg sync.WaitGroup
sleepFn sleepFn
shardsPendingClose atomic.Int32
metrics aggregatorMetrics
logger *zap.Logger
@@ -163,10 +146,9 @@ func NewAggregator(opts Options) Aggregator {
passthroughWriter: opts.PassthroughWriter(),
adminClient: opts.AdminClient(),
resignTimeout: opts.ResignTimeout(),
doneCh: make(chan struct{}),
sleepFn: time.Sleep,
metrics: newAggregatorMetrics(scope, timerOpts, opts.MaxAllowedForwardingDelayFn()),
logger: logger,
tickShardFn: tickShard,
}

return agg
@@ -190,19 +172,21 @@ func (agg *aggregator) Open() error {
return err
}
if agg.checkInterval > 0 {
agg.wg.Add(1)
go agg.tick()
}

agg.wg.Add(1)
// NB: placement tick watches the placement manager, and initializes a
// topology change if the placement is updated. This changes which shards this
// aggregator is responsible for, and initiates leader elections. In the
// scenario where a placement change is received when this aggregator is
// closed, it's fine to ignore the result of the placement update, as applying
// the change only affects the current aggregator that is being closed anyway.
go agg.placementTick()
agg.state = aggregatorOpen
return nil
}

func (agg *aggregator) placementTick() {
defer agg.wg.Done()

ticker := time.NewTicker(placementCheckInterval)
defer ticker.Stop()

@@ -212,8 +196,6 @@ func (agg *aggregator) placementTick() {
select {
case <-ticker.C:
case <-agg.placementManager.C():
case <-agg.doneCh:
return
}

placement, err := agg.placementManager.Placement()
@@ -402,53 +384,12 @@ func (agg *aggregator) Close() error {
}
agg.state = aggregatorClosed

var (
lastOpCompleted = time.Now()
closeLogger = agg.logger.With(zap.String("closing", "aggregator"))

logCloseOperation = func(op string) {
currTime := time.Now()
closeLogger.Debug(fmt.Sprintf("closed %s", op),
zap.String("took", currTime.Sub(lastOpCompleted).String()))
lastOpCompleted = currTime
}
)

closeLogger.Info("signaling aggregator done")
close(agg.doneCh)

// Waiting for the ticking goroutines to return.
// Doing this outside of agg.Lock to avoid potential deadlocks.
agg.Unlock()
agg.wg.Wait()
agg.Lock()

logCloseOperation("ticking wait groups")
for _, shardID := range agg.shardIDs {
agg.shards[shardID].Close()
}

logCloseOperation("aggregator shards")
if agg.shardSetOpen {
agg.closeShardSetWithLock()
}

logCloseOperation("flush shard sets")

agg.flushHandler.Close()
logCloseOperation("flush handler")

agg.passthroughWriter.Close()
logCloseOperation("passthrough writer")

if agg.adminClient != nil {
closeLogger.Info("closing admin client")
agg.adminClient.Close()
logCloseOperation("admin client")
}

closeLogger.Info("done")
return nil
// NB: closing the flush manager is the only really necessary step for
// gracefully closing an aggregator leader, as this will ensure that any
// currently running flush completes, and updates the shared shard flush
// times map in etcd, allowing the follower that will be promoted to leader
// to avoid re-computing and re-flushing this data.
return agg.flushManager.Close()
}

func (agg *aggregator) shardFor(id id.RawID) (*aggregatorShard, error) {
@@ -486,7 +427,8 @@ func (agg *aggregator) shardFor(id id.RawID) (*aggregatorShard, error) {
func (agg *aggregator) processPlacementWithLock(
newPlacement placement.Placement,
) error {
// If someone has already processed the placement ahead of us, do nothing.
// If someone has already processed the placement ahead of us, or if the
// aggregator was closed before the placement update started, do nothing.
if !agg.shouldProcessPlacementWithLock(newPlacement) {
return nil
}
@@ -533,6 +475,10 @@ func (agg *aggregator) processPlacementWithLock(
func (agg *aggregator) shouldProcessPlacementWithLock(
newPlacement placement.Placement,
) bool {
if agg.state == aggregatorClosed {
return false
}

// If there is no placement yet, or the placement has been updated,
// process this placement.
if agg.currPlacement == nil || agg.currPlacement != newPlacement {
@@ -757,15 +703,8 @@ func (agg *aggregator) closeShardsAsync(shards []*aggregatorShard) {
}

func (agg *aggregator) tick() {
defer agg.wg.Done()

for {
select {
case <-agg.doneCh:
return
default:
agg.tickInternal()
}
agg.tickInternal()
}
}

@@ -777,7 +716,7 @@ func (agg *aggregator) tickInternal() {
agg.metrics.shards.owned.Update(float64(numShards))
agg.metrics.shards.pendingClose.Update(float64(agg.shardsPendingClose.Load()))
if numShards == 0 {
agg.waitForInterval(agg.checkInterval)
agg.sleepFn(agg.checkInterval)
return
}
var (
@@ -786,29 +725,13 @@ func (agg *aggregator) tickInternal() {
tickResult tickResult
)
for _, shard := range ownedShards {
select {
// NB: if doneCh has been signaled, no need to continue ticking shards and
// it's valid to early abort ticking.
case <-agg.doneCh:
agg.logger.Info("recevied interrupt on tick; aborting")
return
default:
}

shardTickResult := agg.tickShardFn(shard, perShardTickDuration, agg.doneCh)
shardTickResult := shard.Tick(perShardTickDuration)
tickResult = tickResult.merge(shardTickResult)
}
tickDuration := agg.nowFn().Sub(start)
agg.metrics.tick.Report(tickResult, tickDuration)
if tickDuration < agg.checkInterval {
agg.waitForInterval(agg.checkInterval - tickDuration)
}
}

func (agg *aggregator) waitForInterval(wait time.Duration) {
select {
case <-agg.doneCh:
case <-time.After(wait):
agg.sleepFn(agg.checkInterval - tickDuration)
}
}

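The comment in the reworked `Close` above notes that closing the flush manager is the one step that really matters for a graceful leader shutdown: it lets any in-flight flush complete and persist the shared shard flush times before the process exits. Below is a rough, self-contained sketch of that shutdown pattern under the assumption that a single mutex guards each flush pass; `simpleFlushManager` and its methods are made-up names, not the real m3 `FlushManager`.

```go
package main

import (
	"sync"
	"time"
)

// simpleFlushManager is a toy stand-in for a flush manager: the mutex is held
// for the duration of each flush pass, so Close naturally waits for any
// in-flight flush to finish before marking the manager closed.
type simpleFlushManager struct {
	mu      sync.Mutex
	closed  bool
	persist func() error // e.g. write shard flush times to a shared store
}

// flushOnce runs a single flush pass and persists flush times on success.
func (m *simpleFlushManager) flushOnce(flush func() error) error {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.closed {
		return nil
	}
	if err := flush(); err != nil {
		return err
	}
	return m.persist()
}

// Close blocks on the same mutex as flushOnce, so it returns only after the
// currently running flush (if any) has completed and persisted its times, and
// no further flushes start afterwards.
func (m *simpleFlushManager) Close() error {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.closed = true
	return nil
}

func main() {
	m := &simpleFlushManager{persist: func() error { return nil }}
	done := make(chan struct{})
	go func() {
		defer close(done)
		_ = m.flushOnce(func() error { time.Sleep(10 * time.Millisecond); return nil })
	}()
	time.Sleep(time.Millisecond) // best-effort: give the flush a chance to start first
	_ = m.Close()                // returns once any in-flight flush is done
	<-done
}
```

The diff above drops the old step-by-step teardown (shards, flush handler, passthrough writer, admin client) along with its `doneCh`/`WaitGroup` plumbing, consistent with that comment.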
14 changes: 14 additions & 0 deletions src/aggregator/aggregator/aggregator_mock.go

Some generated files are not rendered by default.

38 changes: 0 additions & 38 deletions src/aggregator/aggregator/aggregator_test.go
@@ -841,44 +841,6 @@ func TestAggregatorTick(t *testing.T) {
require.NoError(t, agg.Close())
}

func TestAggregatorTickCancelled(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

flushTimesManager := NewMockFlushTimesManager(ctrl)
flushTimesManager.EXPECT().Reset().Return(nil).AnyTimes()
flushTimesManager.EXPECT().Open(gomock.Any()).Return(nil).AnyTimes()
flushTimesManager.EXPECT().Get().Return(nil, nil).AnyTimes()
flushTimesManager.EXPECT().Close().Return(nil).AnyTimes()

agg, _ := testAggregator(t, ctrl)
agg.flushTimesManager = flushTimesManager
require.NoError(t, agg.Open())

var (
tickedCh = make(chan struct{})
numTicked = 0
doneAfterTicks = 2
)

agg.tickShardFn = func(*aggregatorShard, time.Duration, <-chan struct{}) tickResult {
numTicked++
if doneAfterTicks == 2 {
close(tickedCh)
}

time.Sleep(time.Millisecond * 50)
return tickResult{}
}

go func() {
<-tickedCh
require.NoError(t, agg.Close())
}()

require.Equal(t, 2, doneAfterTicks)
}

func TestAggregatorShardSetNotOpenNilInstance(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()
28 changes: 12 additions & 16 deletions src/aggregator/aggregator/flush_mgr_options.go
@@ -31,16 +31,13 @@ import (
)

const (
defaultCheckEvery = time.Second
defaultJitterEnabled = true
defaultFlushTimesPersistEvery = 10 * time.Second
defaultMaxBufferSize = 5 * time.Minute
defaultForcedFlushWindowSize = 10 * time.Second
defaultCheckEvery = time.Second
defaultJitterEnabled = true
defaultMaxBufferSize = 5 * time.Minute
defaultForcedFlushWindowSize = 10 * time.Second
)

var (
defaultWorkerPoolSize = int(math.Max(float64(runtime.GOMAXPROCS(0)/8), 1.0))
)
var defaultWorkerPoolSize = int(math.Max(float64(runtime.GOMAXPROCS(0)/8), 1.0))

// FlushJitterFn determines the jitter based on the flush interval.
type FlushJitterFn func(flushInterval time.Duration) time.Duration
@@ -152,14 +149,13 @@ func NewFlushManagerOptions() FlushManagerOptions {
workerPool := sync.NewWorkerPool(defaultWorkerPoolSize)
workerPool.Init()
return &flushManagerOptions{
clockOpts: clock.NewOptions(),
instrumentOpts: instrument.NewOptions(),
checkEvery: defaultCheckEvery,
jitterEnabled: defaultJitterEnabled,
workerPool: workerPool,
flushTimesPersistEvery: defaultFlushTimesPersistEvery,
maxBufferSize: defaultMaxBufferSize,
forcedFlushWindowSize: defaultForcedFlushWindowSize,
clockOpts: clock.NewOptions(),
instrumentOpts: instrument.NewOptions(),
checkEvery: defaultCheckEvery,
jitterEnabled: defaultJitterEnabled,
workerPool: workerPool,
maxBufferSize: defaultMaxBufferSize,
forcedFlushWindowSize: defaultForcedFlushWindowSize,

bufferForPastTimedMetric: defaultTimedMetricBuffer,
}
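The `FlushJitterFn` type in this file maps a flush interval to a jitter duration. As a standalone illustration only (not the jitter function m3 actually ships), a jitter bounded by a maximum percentage of the interval, in the spirit of the `maxJitterPercent` settings in the m3aggregator.yml diffs earlier, could look like this:

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// FlushJitterFn mirrors the signature declared in flush_mgr_options.go: it
// maps a flush interval to a jitter duration.
type FlushJitterFn func(flushInterval time.Duration) time.Duration

// newMaxPercentJitterFn returns a FlushJitterFn that picks a uniformly random
// jitter in [0, maxJitterPercent*flushInterval).
func newMaxPercentJitterFn(maxJitterPercent float64, rnd *rand.Rand) FlushJitterFn {
	return func(flushInterval time.Duration) time.Duration {
		maxJitter := time.Duration(maxJitterPercent * float64(flushInterval))
		if maxJitter <= 0 {
			return 0
		}
		return time.Duration(rnd.Int63n(int64(maxJitter)))
	}
}

func main() {
	jitterFn := newMaxPercentJitterFn(0.25, rand.New(rand.NewSource(42)))
	fmt.Println("jitter for a 1h flush interval:", jitterFn(time.Hour))
}
```

Jitter of this sort spreads buckets that share a flush interval across the interval instead of flushing them all at the same instant.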