From 0421624ec905ab9fc44c38f1601e046884b76152 Mon Sep 17 00:00:00 2001
From: Oleg Afanasyev
Date: Wed, 19 Jul 2023 17:07:14 +0100
Subject: [PATCH] rangefeed: scheduler for rangefeed usage

This change adds a work scheduler that can enqueue events to be
processed by callbacks using a fixed goroutine pool. By using the
scheduler, components can eliminate excessive handover of data between
goroutines in per-range cases, reducing Go scheduler load and thus
lowering CPU usage and improving responsiveness.

Release note: None
---
 pkg/kv/kvserver/rangefeed/BUILD.bazel       |   4 +
 pkg/kv/kvserver/rangefeed/scheduler.go      | 514 ++++++++++++++++++++
 pkg/kv/kvserver/rangefeed/scheduler_test.go | 464 ++++++++++++++++++
 3 files changed, 982 insertions(+)
 create mode 100644 pkg/kv/kvserver/rangefeed/scheduler.go
 create mode 100644 pkg/kv/kvserver/rangefeed/scheduler_test.go

diff --git a/pkg/kv/kvserver/rangefeed/BUILD.bazel b/pkg/kv/kvserver/rangefeed/BUILD.bazel
index 1d421e146057..751da9878cb5 100644
--- a/pkg/kv/kvserver/rangefeed/BUILD.bazel
+++ b/pkg/kv/kvserver/rangefeed/BUILD.bazel
@@ -10,6 +10,7 @@ go_library(
         "processor.go",
         "registry.go",
         "resolved_timestamp.go",
+        "scheduler.go",
         "task.go",
     ],
     importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/rangefeed",
@@ -24,6 +25,7 @@ go_library(
         "//pkg/storage/enginepb",
         "//pkg/util/admission",
         "//pkg/util/bufalloc",
+        "//pkg/util/buildutil",
         "//pkg/util/envutil",
         "//pkg/util/future",
         "//pkg/util/hlc",
@@ -52,6 +54,7 @@ go_test(
         "processor_test.go",
         "registry_test.go",
         "resolved_timestamp_test.go",
+        "scheduler_test.go",
         "task_test.go",
     ],
     args = ["-test.timeout=895s"],
@@ -85,5 +88,6 @@ go_test(
         "@com_github_cockroachdb_pebble//vfs",
         "@com_github_stretchr_testify//assert",
         "@com_github_stretchr_testify//require",
+        "@org_golang_x_exp//slices",
     ],
 )
diff --git a/pkg/kv/kvserver/rangefeed/scheduler.go b/pkg/kv/kvserver/rangefeed/scheduler.go
new file mode 100644
index 000000000000..759ca7c4c7ad
--- /dev/null
+++ b/pkg/kv/kvserver/rangefeed/scheduler.go
@@ -0,0 +1,514 @@
+// Copyright 2023 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package rangefeed
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"sync"
+
+	"github.com/cockroachdb/cockroach/pkg/util/buildutil"
+	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/cockroachdb/cockroach/pkg/util/stop"
+	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
+	"github.com/cockroachdb/errors"
+)
+
+// Scheduler is used by rangefeed processors to schedule work on a pool of
+// workers instead of running individual goroutines per range. A store runs a
+// single scheduler for all of its rangefeeds.
+//
+// When a processor is started, it registers a callback with the scheduler.
+// After that, the processor can enqueue work for itself by telling the
+// scheduler which types of events it plans to process.
+//
+// The scheduler maintains a queue of processor IDs that have pending work and
+// notifies their callbacks in order, passing each one the union of event
+// types that were enqueued since its last notification.
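+//
+// For illustration only, a minimal usage sketch (the event type, ctx, and
+// stopper below are hypothetical and not part of this change):
+//
+//	s := NewScheduler(SchedulerConfig{Workers: 4})
+//	if err := s.Start(ctx, stopper); err != nil {
+//		return err
+//	}
+//	id, err := s.Register(func(e processorEventType) processorEventType {
+//		// Handle the union of all event types enqueued since the last call.
+//		return 0 // nothing left to process, don't re-enqueue
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	s.Enqueue(id, someEventType) // someEventType is caller-defined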
+
+// processorEventType is a mask of pending events for a processor. All event
+// types enqueued between two callback invocations are coalesced into a
+// single value.
+type processorEventType int
+
+const (
+	// queued is an internal event type that indicates that there's already
+	// pending work for a processor and that it is already scheduled for
+	// execution. When more event types come in, they are just added to the
+	// existing pending value.
+	queued processorEventType = 1 << iota
+	// stopped is an event that indicates that no more events will be
+	// scheduled for the processor. Once it is enqueued, all subsequent
+	// events are rejected. The processor should perform whatever cleanup it
+	// needs to do within the callback context when receiving this event.
+	stopped
+	// numProcessorEventTypes is the total number of event types.
+	numProcessorEventTypes int = iota
+)
+
+var eventNames = map[processorEventType]string{
+	queued:  "Queued",
+	stopped: "Stopped",
+}
+
+func (e processorEventType) String() string {
+	var evts []string
+	for i := 0; i < numProcessorEventTypes; i++ {
+		if eventType := processorEventType(1 << i); eventType&e != 0 {
+			evts = append(evts, eventNames[eventType])
+		}
+	}
+	return strings.Join(evts, " | ")
+}
+
+// enqueueBulkMaxChunk is the maximum number of events enqueued in one go
+// while holding the scheduler lock.
+const enqueueBulkMaxChunk = 100
+
+// Callback is a callback set by a processor to perform work. The event is a
+// combination of all event types scheduled since the last callback
+// invocation.
+//
+// Once the callback returns, the event types are considered processed. If a
+// processor decides not to process everything, it can return the remaining
+// types, which instructs the scheduler to re-enqueue the processor.
+//
+// This mechanism allows a processor to throttle processing if it has too
+// much pending data to process in one go, without blocking other processors.
+type Callback func(event processorEventType) (remaining processorEventType)
+
+// SchedulerConfig contains configurable scheduler parameters.
+type SchedulerConfig struct {
+	// Workers is the number of pool workers for the scheduler to use.
+	Workers int
+	// BulkChunkSize is the number of IDs enqueued in a single bulk enqueue
+	// operation. Chunking is done to avoid holding the lock for too long,
+	// which would interfere with other enqueue operations.
+	BulkChunkSize int
+}
+
+// Scheduler is a simple scheduler that allows work to be scheduled against a
+// number of processors. Each processor is represented by a unique ID and a
+// callback.
+//
+// Work is enqueued in the form of an event type using the processor ID. The
+// processor's callback is then called by a worker thread with all combined
+// pending events.
+//
+// Each event is represented as a bit mask, and multiple pending events may
+// be ORed together before being delivered to the processor.
+type Scheduler struct {
+	SchedulerConfig
+
+	mu struct {
+		syncutil.Mutex
+		nextID int64
+		procs  map[int64]Callback
+		status map[int64]processorEventType
+		queue  *idQueue
+		// No more new registrations allowed. Workers are winding down.
+		quiescing bool
+	}
+	cond *sync.Cond
+	wg   sync.WaitGroup
+}
+
+// NewScheduler instantiates an idle scheduler based on the provided config.
+// The scheduler needs to be started to become operational.
+func NewScheduler(cfg SchedulerConfig) *Scheduler {
+	if cfg.BulkChunkSize == 0 {
+		cfg.BulkChunkSize = enqueueBulkMaxChunk
+	}
+	s := &Scheduler{
+		SchedulerConfig: cfg,
+		wg:              sync.WaitGroup{},
+	}
+	s.mu.procs = make(map[int64]Callback)
+	s.mu.status = make(map[int64]processorEventType)
+	s.mu.queue = newIDQueue()
+	s.cond = sync.NewCond(&s.mu)
+	return s
+}
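+
+// For illustration only, a hedged sketch of a callback that throttles itself
+// by returning remaining work (eventData and drainSomeData are hypothetical
+// names, not part of this change):
+//
+//	cb := func(e processorEventType) processorEventType {
+//		if e&eventData != 0 && !drainSomeData() {
+//			// Not everything was drained; have the scheduler re-enqueue
+//			// us so that other processors get a turn.
+//			return eventData
+//		}
+//		return 0
+//	}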
+
+// Start the scheduler workers.
+func (s *Scheduler) Start(ctx context.Context, stopper *stop.Stopper) error {
+	for i := 0; i < s.Workers; i++ {
+		s.wg.Add(1)
+		workerID := i
+		if err := stopper.RunAsyncTask(ctx, fmt.Sprintf("rangefeed-scheduler-worker-%d", workerID),
+			func(ctx context.Context) {
+				log.VEventf(ctx, 3, "%d scheduler worker started", workerID)
+				defer s.wg.Done()
+				s.processEvents(ctx)
+				log.VEventf(ctx, 3, "%d scheduler worker finished", workerID)
+			}); err != nil {
+			s.wg.Done()
+			s.Stop()
+			return err
+		}
+	}
+	if err := stopper.RunAsyncTask(ctx, "terminate scheduler",
+		func(ctx context.Context) {
+			<-stopper.ShouldQuiesce()
+			log.VEvent(ctx, 2, "scheduler quiescing")
+			s.Stop()
+		}); err != nil {
+		s.Stop()
+		return err
+	}
+	return nil
+}
+
+// Register a callback to be able to schedule work. Returns an allocated
+// callback ID which should be used to send notifications to the callback.
+// Returns an error if the scheduler is stopped.
+func (s *Scheduler) Register(f Callback) (int64, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.mu.quiescing {
+		// Don't accept new registrations if quiesced.
+		return 0, errors.New("server stopping")
+	}
+	s.mu.nextID++
+	id := s.mu.nextID
+	s.mu.procs[id] = f
+	return id, nil
+}
+
+// Enqueue an event for an existing callback. The event is silently dropped
+// if no callback is registered for the ID or if the processor is stopping.
+// Note that a dropped event doesn't guarantee that the processor has already
+// handled its stopped event; that event may be either pending or processed.
+func (s *Scheduler) Enqueue(id int64, evt processorEventType) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if _, ok := s.mu.procs[id]; !ok {
+		return
+	}
+	newWork := s.enqueueInternalLocked(id, evt)
+	if newWork {
+		// Wake up a potentially waiting worker.
+		// We are allowed to do this while holding the cond's lock.
+		s.cond.Signal()
+	}
+}
+
+func (s *Scheduler) enqueueInternalLocked(id int64, evt processorEventType) bool {
+	pending := s.mu.status[id]
+	if pending&stopped != 0 {
+		return false
+	}
+	if pending == 0 {
+		// Enqueue if the processor was idle.
+		s.mu.queue.pushBack(id)
+	}
+	update := pending | evt | queued
+	if update != pending {
+		// Only update if the event actually changed.
+		s.mu.status[id] = update
+	}
+	return pending == 0
+}
+
+// EnqueueAll enqueues an event for all existing non-stopped IDs. Enqueueing
+// is done in chunks to avoid holding the lock for too long and interfering
+// with other enqueue operations.
+//
+// If an ID is not known or already stopped, it is ignored.
+func (s *Scheduler) EnqueueAll(ids []int64, evt processorEventType) {
+	scheduleChunk := func(chunk []int64) int {
+		s.mu.Lock()
+		defer s.mu.Unlock()
+		wake := 0
+		for _, id := range chunk {
+			if _, ok := s.mu.procs[id]; ok {
+				if newWork := s.enqueueInternalLocked(id, evt); newWork {
+					wake++
+				}
+			}
+		}
+		return wake
+	}
+	wake := 0
+	total := len(ids)
+	for first := 0; first < total; first += s.BulkChunkSize {
+		last := first + s.BulkChunkSize
+		if last > total {
+			last = total
+		}
+		added := scheduleChunk(ids[first:last])
+		wake += added
+	}
+	// Wake up potentially waiting workers. We wake all of them if we enqueued
+	// at least as many work items as there are workers; otherwise we signal
+	// once per new item.
+	if wake >= s.Workers {
+		s.cond.Broadcast()
+	} else {
+		for ; wake > 0; wake-- {
+			s.cond.Signal()
+		}
+	}
+}
+
+// StopProcessor instructs a processor to stop gracefully by sending it a
+// stopped event. Once stop is called, all subsequent Enqueue calls for this
+// ID will be ignored.
+func (s *Scheduler) StopProcessor(id int64) {
+	s.Enqueue(id, stopped)
+}
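+
+// For illustration only, a hedged sketch of bulk notification across many
+// registered processors (registered and sysEvent are hypothetical names, not
+// part of this change):
+//
+//	ids := make([]int64, 0, len(registered))
+//	for _, id := range registered {
+//		ids = append(ids, id)
+//	}
+//	// Internally chunked by BulkChunkSize, so the scheduler lock is released
+//	// between chunks rather than held for the whole slice.
+//	s.EnqueueAll(ids, sysEvent)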
+
+// processEvents is the main worker method of the scheduler pool. Each worker
+// should be launched in a separate goroutine and will loop until the
+// scheduler is stopped.
+func (s *Scheduler) processEvents(ctx context.Context) {
+	for {
+		var id int64
+		s.mu.Lock()
+		for {
+			if s.mu.quiescing {
+				s.mu.Unlock()
+				return
+			}
+			var ok bool
+			if id, ok = s.mu.queue.popFront(); ok {
+				break
+			}
+			s.cond.Wait()
+		}
+
+		cb := s.mu.procs[id]
+		e := s.mu.status[id]
+		// Keep the queued status and preserve stopped to block any more events.
+		s.mu.status[id] = queued | (e & stopped)
+		s.mu.Unlock()
+
+		procEventType := queued ^ e
+		remaining := cb(procEventType)
+
+		if remaining != 0 && buildutil.CrdbTestBuild {
+			if (remaining^procEventType)&remaining != 0 {
+				log.Fatalf(ctx,
+					"rangefeed processor attempted to reschedule event type %s that was not present in original event set %s",
+					remaining, procEventType)
+			}
+		}
+
+		if e&stopped != 0 {
+			if remaining != 0 {
+				log.VWarningf(ctx, 5,
+					"rangefeed processor %d didn't process all events on close", id)
+			}
+			// We'll keep the stopped state to avoid calling the stopped
+			// processor again on scheduler shutdown.
+			s.mu.Lock()
+			s.mu.status[id] = stopped
+			s.mu.Unlock()
+			continue
+		}
+
+		s.mu.Lock()
+		pendingStatus, ok := s.mu.status[id]
+		if !ok {
+			s.mu.Unlock()
+			continue
+		}
+		newStatus := pendingStatus | remaining
+		if newStatus == queued {
+			// If no events arrived, get rid of the id.
+			delete(s.mu.status, id)
+		} else {
+			// Since more events arrived during processing, reschedule.
+			s.mu.queue.pushBack(id)
+			// If remaining work was returned and not already planned, then
+			// update the pending status to reflect that.
+			if newStatus != pendingStatus {
+				s.mu.status[id] = newStatus
+			}
+		}
+		s.mu.Unlock()
+	}
+}
+
+// Unregister a processor. This removes the processor's callback and status
+// from the scheduler. If the processor is currently processing an event, it
+// will finish processing.
+// The processor won't receive a stopped event if one wasn't explicitly sent.
+// To make sure a processor performs cleanup, it is easier to send it a
+// stopped event first and let it remove its own registration during event
+// handling. Any attempt to enqueue events for the processor after this call
+// will be ignored.
+func (s *Scheduler) Unregister(id int64) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	delete(s.mu.procs, id)
+	delete(s.mu.status, id)
+}
+
+func (s *Scheduler) Stop() {
+	// Stop all processors.
+	s.mu.Lock()
+	if !s.mu.quiescing {
+		// The first Stop attempt triggers termination of all unfinished
+		// callbacks; this only needs to happen once.
+		s.mu.quiescing = true
+	}
+	s.mu.Unlock()
+	s.cond.Broadcast()
+	s.wg.Wait()
+
+	// Synchronously notify all non-stopped processors about the stop.
+	s.mu.Lock()
+	for id, p := range s.mu.procs {
+		pending := s.mu.status[id]
+		// Ignore processors that already processed their stopped event.
+		if pending == stopped {
+			continue
+		}
+		// Add the stopped event on top of what was pending and remove queued.
+		pending = (^queued & pending) | stopped
+		s.mu.Unlock()
+		p(pending)
+		s.mu.Lock()
+	}
+	s.mu.Unlock()
+}
+
+// ClientScheduler is a wrapper on top of a scheduler that can be passed to a
+// processor so that it can register itself, enqueue events, and terminate as
+// needed, without having to track its allocated ID.
+type ClientScheduler struct {
+	id int64
+	s  *Scheduler
+}
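+
+// For illustration only, a hedged sketch of how a processor might use a
+// ClientScheduler handed to it at construction time (proc and eventData are
+// hypothetical names, not part of this change):
+//
+//	cs := NewClientScheduler(s)
+//	if err := cs.Register(proc.process); err != nil {
+//		return err
+//	}
+//	cs.Schedule(eventData) // request a callback with eventData pending
+//	// ...
+//	cs.Stop() // graceful: the callback receives stopped and can clean up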
+
+// NewClientScheduler creates an instance of ClientScheduler. It delegates
+// all work to the underlying scheduler. Note that Register stores the
+// allocated callback ID in the ClientScheduler, so copies made before
+// registration won't observe it.
+func NewClientScheduler(s *Scheduler) ClientScheduler {
+	return ClientScheduler{
+		s: s,
+	}
+}
+
+// ID returns the underlying callback ID used to schedule work.
+func (cs *ClientScheduler) ID() int64 {
+	return cs.id
+}
+
+// Register registers the processing callback in the scheduler. An error is
+// returned if a callback was already registered for this ClientScheduler or
+// if the scheduler is already quiescing.
+func (cs *ClientScheduler) Register(cb Callback) error {
+	if cs.id != 0 {
+		return errors.Newf("callback is already registered with id %d", cs.id)
+	}
+	var err error
+	cs.id, err = cs.s.Register(cb)
+	return err
+}
+
+// Schedule schedules the callback for an event. The call is ignored if the
+// callback wasn't registered prior to this call.
+func (cs *ClientScheduler) Schedule(event processorEventType) {
+	cs.s.Enqueue(cs.id, event)
+}
+
+// Stop instructs the processor to stop gracefully by sending it a stopped
+// event. Once Stop is called, all subsequent Schedule calls will be ignored.
+func (cs *ClientScheduler) Stop() {
+	cs.s.StopProcessor(cs.id)
+}
+
+// Unregister removes the callback associated with this processor. No stopped
+// event will be scheduled. See Scheduler.Unregister for details.
+func (cs *ClientScheduler) Unregister() {
+	cs.s.Unregister(cs.id)
+}
+
+// idQueueChunkSize is the number of queue elements allocated at once to
+// amortize queue allocations.
+const idQueueChunkSize = 8000
+
+// idQueueChunk is a queue chunk of a fixed size which idQueue uses to extend
+// its storage. Chunks are kept in a pool to reduce allocations.
+type idQueueChunk struct {
+	data      [idQueueChunkSize]int64
+	nextChunk *idQueueChunk
+}
+
+var sharedIDQueueChunkSyncPool = sync.Pool{
+	New: func() interface{} {
+		return new(idQueueChunk)
+	},
+}
+
+func getPooledIDQueueChunk() *idQueueChunk {
+	return sharedIDQueueChunkSyncPool.Get().(*idQueueChunk)
+}
+
+func putPooledIDQueueChunk(e *idQueueChunk) {
+	// There's no need to clean up the chunk's data as it is an array of
+	// plain values.
+	e.nextChunk = nil
+	sharedIDQueueChunkSyncPool.Put(e)
+}
+
+// idQueue stores pending processor IDs. Internally, data is stored in
+// idQueueChunkSize-sized arrays that are added as needed and returned to the
+// shared chunk pool once the reader and writer finish working with them.
+type idQueue struct {
+	first, last *idQueueChunk
+	read, write int
+	size        int
+}
+
+func newIDQueue() *idQueue {
+	chunk := getPooledIDQueueChunk()
+	return &idQueue{
+		first: chunk,
+		last:  chunk,
+		read:  0,
+		size:  0,
+	}
+}
+
+func (q *idQueue) pushBack(id int64) {
+	if q.write == idQueueChunkSize {
+		nextChunk := getPooledIDQueueChunk()
+		q.last.nextChunk = nextChunk
+		q.last = nextChunk
+		q.write = 0
+	}
+	q.last.data[q.write] = id
+	q.write++
+	q.size++
+}
+
+func (q *idQueue) popFront() (int64, bool) {
+	if q.size == 0 {
+		return 0, false
+	}
+	if q.read == idQueueChunkSize {
+		removed := q.first
+		q.first = q.first.nextChunk
+		putPooledIDQueueChunk(removed)
+		q.read = 0
+	}
+	res := q.first.data[q.read]
+	q.read++
+	q.size--
+	return res, true
+}
+
+func (q *idQueue) Len() int {
+	return q.size
+}
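+
+// For illustration only, the queue's FIFO behavior (a hedged sketch, not
+// part of this change):
+//
+//	q := newIDQueue()
+//	q.pushBack(1)
+//	q.pushBack(2)
+//	id, ok := q.popFront() // id == 1, ok == true
+//	id, ok = q.popFront()  // id == 2, ok == true
+//	_, ok = q.popFront()   // ok == false: the queue is empty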
diff --git a/pkg/kv/kvserver/rangefeed/scheduler_test.go b/pkg/kv/kvserver/rangefeed/scheduler_test.go
new file mode 100644
index 000000000000..b9a3508afed4
--- /dev/null
+++ b/pkg/kv/kvserver/rangefeed/scheduler_test.go
@@ -0,0 +1,464 @@
+// Copyright 2023 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package rangefeed
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
+	"github.com/cockroachdb/cockroach/pkg/util/stop"
+	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/exp/slices"
+)
+
+func TestStopEmpty(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 1})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	s.Stop()
+
+	assertStopsWithinTimeout(t, s)
+}
+
+func TestStopNonEmpty(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 1})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	c := createAndRegisterConsumerOrFail(t, s)
+	s.StopProcessor(c.id)
+	assertStopsWithinTimeout(t, s)
+	c.requireStopped(t, time.Second*30)
+}
+
+type schedulerConsumer struct {
+	c  chan processorEventType
+	mu struct {
+		syncutil.RWMutex
+		wait    chan interface{}
+		waiting chan interface{}
+	}
+	reschedule chan processorEventType
+	flat       []processorEventType
+	sched      *Scheduler
+	id         int64
+}
+
+func createAndRegisterConsumerOrFail(t *testing.T, scheduler *Scheduler) *schedulerConsumer {
+	t.Helper()
+	c := &schedulerConsumer{
+		c:          make(chan processorEventType, 1000),
+		reschedule: make(chan processorEventType, 1),
+		sched:      scheduler,
+	}
+	id, err := c.sched.Register(c.process)
+	require.NoError(t, err, "failed to register processor")
+	c.id = id
+	return c
+}
+
+func (c *schedulerConsumer) process(ev processorEventType) processorEventType {
+	c.c <- ev
+	c.mu.RLock()
+	w, ww := c.mu.wait, c.mu.waiting
+	c.mu.RUnlock()
+	if w != nil {
+		close(ww)
+		<-w
+	}
+	select {
+	case r := <-c.reschedule:
+		// Tests don't try to reschedule and stop at the same time, so it's ok
+		// not to fall through.
+		return r
+	default:
+	}
+	if ev&stopped != 0 {
+		c.sched.Unregister(c.id)
+	}
+	return 0
+}
+
+func (c *schedulerConsumer) pause() {
+	c.mu.Lock()
+	c.mu.wait = make(chan interface{})
+	c.mu.waiting = make(chan interface{})
+	c.mu.Unlock()
+}
+
+func (c *schedulerConsumer) waitPaused() {
+	<-c.mu.waiting
+}
+
+// resume closes the waiter channel. Tests must track the pause state
+// themselves and must not call resume if pause was not issued.
+func (c *schedulerConsumer) resume() {
+	c.mu.Lock()
+	w := c.mu.wait
+	c.mu.wait, c.mu.waiting = nil, nil
+	c.mu.Unlock()
+	close(w)
+}
+
+func (c *schedulerConsumer) rescheduleNext(e processorEventType) {
+	c.reschedule <- e
+}
+
+func (c *schedulerConsumer) assertTill(
+	t *testing.T, timeout time.Duration, assert func(flat []processorEventType) bool,
+) bool {
+	t.Helper()
+	till := time.After(timeout)
+	for {
+		if assert(c.flat) {
+			return true
+		}
+		select {
+		case <-till:
+			return false
+		case e := <-c.c:
+			c.flat = append(c.flat, e)
+		}
+	}
+}
+
+func (c *schedulerConsumer) requireEvent(
+	t *testing.T, timeout time.Duration, event processorEventType, count ...int,
+) {
+	t.Helper()
+	min, max := 0, 0
+	l := len(count)
+	switch {
+	case l == 1:
+		min, max = count[0], count[0]
+	case l == 2:
+		min, max = count[0], count[1]
+	default:
+		t.Fatal("event count limits must be 1 (exact) or 2 [min, max]")
+	}
+	var lastHist []processorEventType
+	if !c.assertTill(t, timeout, func(flat []processorEventType) bool {
+		lastHist = flat
+		match := 0
+		for _, e := range lastHist {
+			if e&event != 0 {
+				match++
+			}
+		}
+		return match >= min && match <= max
+	}) {
+		t.Fatalf("failed to find event %08b between %d and %d times in history %08b", event, min, max,
+			lastHist)
+	}
+}
+
+func (c *schedulerConsumer) requireHistory(
+	t *testing.T, timeout time.Duration, history []processorEventType,
+) {
+	t.Helper()
+	var lastHist []processorEventType
+	if !c.assertTill(t, timeout, func(flat []processorEventType) bool {
+		lastHist = flat
+		return slices.Equal(history, lastHist)
+	}) {
+		t.Fatalf("expected history %08b found %08b", history, lastHist)
+	}
+}
+
+func (c *schedulerConsumer) requireStopped(t *testing.T, timeout time.Duration) {
+	t.Helper()
+	lastEvent := processorEventType(0)
+	if !c.assertTill(t, timeout, func(flat []processorEventType) bool {
+		t.Helper()
+		if len(flat) == 0 {
+			return false
+		}
+		lastEvent = flat[len(flat)-1]
+		return lastEvent&stopped != 0
+	}) {
+		t.Fatalf("failed to find stopped event at the end of history after %s, lastEvent=%08b", timeout,
+			lastEvent)
+	}
+}
+
+const (
+	te1 = 1 << 2
+	te2 = 1 << 3
+	te3 = 1 << 4
+)
+
+func TestDeliverEvents(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 1})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	c := createAndRegisterConsumerOrFail(t, s)
+	s.Enqueue(c.id, te1)
+	c.requireEvent(t, time.Second*30, te1, 1)
+	assertStopsWithinTimeout(t, s)
+}
+
+func TestNoParallel(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 2})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	c := createAndRegisterConsumerOrFail(t, s)
+	c.pause()
+	s.Enqueue(c.id, te1)
+	c.waitPaused()
+	s.Enqueue(c.id, te2)
+	c.resume()
+	c.requireHistory(t, time.Second*30, []processorEventType{te1, te2})
+	assertStopsWithinTimeout(t, s)
+}
+
+func TestProcessOtherWhilePaused(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 2})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	c1 := createAndRegisterConsumerOrFail(t, s)
+	c2 := createAndRegisterConsumerOrFail(t, s)
+	c1.pause()
+	s.Enqueue(c1.id, te1)
+	c1.waitPaused()
+	s.Enqueue(c2.id, te1)
+	c2.requireHistory(t, time.Second*30, []processorEventType{te1})
+	c1.resume()
+	c1.requireHistory(t, time.Second*30, []processorEventType{te1})
+	assertStopsWithinTimeout(t, s)
+	c1.requireStopped(t, time.Second*30)
+	c2.requireStopped(t, time.Second*30)
+}
+
+func TestEventsCombined(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 2})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	c := createAndRegisterConsumerOrFail(t, s)
+	c.pause()
+	s.Enqueue(c.id, te1)
+	c.waitPaused()
+	s.Enqueue(c.id, te2)
+	s.Enqueue(c.id, te3)
+	c.resume()
+	c.requireHistory(t, time.Second*30, []processorEventType{te1, te2 | te3})
+	assertStopsWithinTimeout(t, s)
+}
+
+func TestRescheduleEvent(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 2})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	c := createAndRegisterConsumerOrFail(t, s)
+	c.pause()
+	s.Enqueue(c.id, te1)
+	c.waitPaused()
+	s.Enqueue(c.id, te1)
+	c.resume()
+	c.requireHistory(t, time.Second*30, []processorEventType{te1, te1})
+	assertStopsWithinTimeout(t, s)
+}
+
+func TestClientScheduler(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 2})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	cs := NewClientScheduler(s)
+	// Manually create the consumer as we don't want it to register itself;
+	// we want to register it via the client scheduler.
+	c := &schedulerConsumer{
+		c:          make(chan processorEventType, 1000),
+		reschedule: make(chan processorEventType, 1),
+		sched:      s,
+		id:         1,
+	}
+	require.NoError(t, cs.Register(c.process), "failed to register consumer")
+	require.Error(t,
+		cs.Register(func(event processorEventType) (remaining processorEventType) { return 0 }),
+		"reregistration must fail")
+	c.pause()
+	cs.Schedule(te2)
+	c.waitPaused()
+	cs.Unregister()
+	c.resume()
+	c.requireHistory(t, time.Second*30, []processorEventType{te2})
+	assertStopsWithinTimeout(t, s)
+}
+
+func TestScheduleMultiple(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 2, BulkChunkSize: 2})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	const consumerNumber = 10
+	consumers := make([]*schedulerConsumer, consumerNumber)
+	ids := make([]int64, consumerNumber)
+	for i := 0; i < consumerNumber; i++ {
+		consumers[i] = createAndRegisterConsumerOrFail(t, s)
+		ids[i] = consumers[i].id
+	}
+	s.EnqueueAll(ids, te1)
+	for _, c := range consumers {
+		c.requireEvent(t, time.Second*30, te1, 1)
+	}
+	assertStopsWithinTimeout(t, s)
+}
+
+func TestPartialProcessing(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 1})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	c := createAndRegisterConsumerOrFail(t, s)
+	// Set the process response to trigger processing once again.
+	c.rescheduleNext(te1)
+	s.Enqueue(c.id, te1)
+	c.requireHistory(t, time.Second*30, []processorEventType{te1, te1})
+	assertStopsWithinTimeout(t, s)
+}
+
+func assertStopsWithinTimeout(t *testing.T, s *Scheduler) {
+	stopC := make(chan interface{})
+	go func() {
+		s.Stop()
+		close(stopC)
+	}()
+	select {
+	case <-stopC:
+	case <-time.After(30 * time.Second):
+		t.Fatalf("scheduler failed to stop after 30 seconds")
+	}
+}
+
+func TestUnregisterWithoutStop(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 1})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	c := createAndRegisterConsumerOrFail(t, s)
+	s.Enqueue(c.id, te1)
+	c.requireHistory(t, time.Second*30, []processorEventType{te1})
+	s.Unregister(c.id)
+	assertStopsWithinTimeout(t, s)
+	// Ensure that we didn't send a stop after the callback was removed.
+	c.requireHistory(t, time.Second*30, []processorEventType{te1})
+}
+
+func TestStartupFailure(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 1})
+	require.Error(t, s.Start(ctx, stopper), "started despite stopper stopped")
+}
+
+func TestSchedulerShutdown(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	s := NewScheduler(SchedulerConfig{Workers: 1})
+	require.NoError(t, s.Start(ctx, stopper), "failed to start")
+	c1 := createAndRegisterConsumerOrFail(t, s)
+	c2 := createAndRegisterConsumerOrFail(t, s)
+	s.StopProcessor(c2.id)
+	s.Stop()
+	// Ensure that the processors are not stopped twice.
+	c1.requireHistory(t, time.Second*30, []processorEventType{stopped})
+	c2.requireHistory(t, time.Second*30, []processorEventType{stopped})
+}
+
+func TestQueueReadWrite1By1(t *testing.T) {
+	q := newIDQueue()
+	val := int64(7)
+	for i := 0; i < idQueueChunkSize*3; i++ {
+		q.pushBack(val)
+		require.Equal(t, 1, q.Len(), "queue size")
+		v, ok := q.popFront()
+		require.True(t, ok, "value not found after writing")
+		require.Equal(t, val, v, "read different from write")
+		val = val*3 + 7
+	}
+	_, ok := q.popFront()
+	require.False(t, ok, "unexpected value after tail")
+}
+
+func TestQueueReadWriteFull(t *testing.T) {
+	q := newIDQueue()
+	val := int64(7)
+	for i := 0; i < idQueueChunkSize*3; i++ {
+		require.Equal(t, i, q.Len(), "queue size")
+		q.pushBack(val)
+		val = val*3 + 7
+	}
+	val = int64(7)
+	for i := 0; i < idQueueChunkSize*3; i++ {
+		require.Equal(t, idQueueChunkSize*3-i, q.Len(), "queue size")
+		v, ok := q.popFront()
+		require.True(t, ok, "value not found after writing")
+		require.Equal(t, val, v, "read different from write")
+		val = val*3 + 7
+	}
+	require.Equal(t, 0, q.Len(), "queue size")
+	_, ok := q.popFront()
+	require.False(t, ok, "unexpected value after tail")
+}
+
+func TestQueueReadEmpty(t *testing.T) {
+	q := newIDQueue()
+	_, ok := q.popFront()
+	require.False(t, ok, "unexpected value in empty queue")
+}