changefeed_processors.go
// Copyright 2018 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package changefeedccl

import (
"context"
"fmt"
"time"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdcutils"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvevent"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvfeed"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/schemafeed"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/log/logcrash"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/span"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
"github.com/cockroachdb/redact"
)
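// changeAggregator is a processor that runs a kv feed over its assigned spans,
// hands the resulting events to an event consumer that encodes and emits them
// to the sink, and periodically forwards resolved span progress to the
// changeFrontier.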
type changeAggregator struct {
execinfra.ProcessorBase
flowCtx *execinfra.FlowCtx
spec execinfrapb.ChangeAggregatorSpec
memAcc mon.BoundAccount
// cancel cancels the context passed to all resources created while starting
// this aggregator.
cancel func()
// errCh contains the return values of the kvfeed.
errCh chan error
// kvFeedDoneCh is closed when the kvfeed exits.
kvFeedDoneCh chan struct{}
// drainWatchCh is signaled if the job registry on this node is being
// drained, which is a proxy for the node being drained. If a drain occurs,
// it will be blocked until we allow it to proceed by calling drainDone().
// This gives the aggregator time to checkpoint before shutting down.
drainWatchCh <-chan struct{}
drainDone func()
// sink is the Sink to write rows to. Resolved timestamps are never written
// by changeAggregator.
sink EventSink
// changedRowBuf, if non-nil, contains changed rows to be emitted. Anything
// queued in `resolvedSpanBuf` is dependent on these having been emitted, so
// this one must be empty before moving on to that one.
changedRowBuf *encDatumRowBuffer
// resolvedSpanBuf contains resolved span updates to send to changeFrontier.
// If sink is a bufferSink, it must be emptied before these are sent.
resolvedSpanBuf encDatumRowBuffer
// lastPush records the time when we last pushed data to the coordinator.
lastPush time.Time
// shutdownCheckpointEmitted indicates whether the aggregator emitted checkpoint
// information during its shutdown.
shutdownCheckpointEmitted bool
// recentKVCount contains the number of emits since the last time a resolved
// span was forwarded to the frontier
recentKVCount uint64
// eventProducer produces the next event from the kv feed.
eventProducer kvevent.Reader
// eventConsumer consumes the event.
eventConsumer eventConsumer
lastHighWaterFlush time.Time // last time high watermark was checkpointed.
flushFrequency time.Duration // how often high watermark can be checkpointed.
lastSpanFlush time.Time // last time expensive, span based checkpoint was written.
// frontier keeps track of resolved timestamps for spans along with schema change
// boundary information.
frontier *schemaChangeFrontier
metrics *Metrics
sliMetrics *sliMetrics
sliMetricsID int64
closeTelemetryRecorder func()
knobs TestingKnobs
}
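// timestampLowerBoundOracle provides an inclusive lower bound on the timestamps
// of updates that have yet to be emitted; see inclusiveLowerBoundTS below.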
type timestampLowerBoundOracle interface {
inclusiveLowerBoundTS() hlc.Timestamp
}
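// changeAggregatorLowerBoundOracle implements timestampLowerBoundOracle using
// the aggregator's local span frontier.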
type changeAggregatorLowerBoundOracle struct {
sf *span.Frontier
initialInclusiveLowerBound hlc.Timestamp
}
// inclusiveLowerBoundTS is used to generate a representative timestamp to name
// cloudStorageSink files. This timestamp is either the statement time (in case this
// changefeed job hasn't yet seen any resolved timestamps) or the successor timestamp to
// the local span frontier. This convention is chosen to preserve CDC's ordering
// guarantees. See comment on cloudStorageSink for more details.
func (o *changeAggregatorLowerBoundOracle) inclusiveLowerBoundTS() hlc.Timestamp {
if frontier := o.sf.Frontier(); !frontier.IsEmpty() {
// We call `Next()` here on the frontier because this allows us
// to name files using a timestamp that is an inclusive lower bound
// on the timestamps of the updates contained within the file.
// Files being created at the point this method is called are guaranteed
// to contain row updates with timestamps strictly greater than the local
// span frontier timestamp.
return frontier.Next()
}
// This should only be returned in the case where the changefeed job hasn't yet
// seen a resolved timestamp.
return o.initialInclusiveLowerBound
}
var _ execinfra.Processor = &changeAggregator{}
var _ execinfra.RowSource = &changeAggregator{}
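// newChangeAggregatorProcessor constructs a changeAggregator, initializes its
// processor state, and derives its flush frequency from the changefeed options.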
func newChangeAggregatorProcessor(
ctx context.Context,
flowCtx *execinfra.FlowCtx,
processorID int32,
spec execinfrapb.ChangeAggregatorSpec,
post *execinfrapb.PostProcessSpec,
) (execinfra.Processor, error) {
memMonitor := execinfra.NewMonitor(ctx, flowCtx.Mon, "changeagg-mem")
ca := &changeAggregator{
flowCtx: flowCtx,
spec: spec,
memAcc: memMonitor.MakeBoundAccount(),
}
if err := ca.Init(
ctx,
ca,
post,
changefeedResultTypes,
flowCtx,
processorID,
memMonitor,
execinfra.ProcStateOpts{
TrailingMetaCallback: func() []execinfrapb.ProducerMetadata {
ca.close()
return nil
},
},
); err != nil {
return nil, err
}
opts := changefeedbase.MakeStatementOptions(ca.spec.Feed.Opts)
// MinCheckpointFrequency controls how frequently the changeAggregator flushes the sink
// and checkpoints the local frontier to changeFrontier. It is used as a rough
// approximation of how latency-sensitive the changefeed user is. For a high-latency
// sink, such as the cloud storage sink, where flushes can take much longer, it is often
// set to the sink's flush frequency so as not to negate the sink's batching config.
//
// If a user does not specify a 'min_checkpoint_frequency' duration, we instead default
// to 30s, which is hopefully long enough to account for most possible sink latencies we
// could see without falling too far behind.
//
// NB: As long as we periodically get new span-level resolved timestamps
// from the poller (which should always happen, even if the watched data is
// not changing), then this is sufficient and we don't have to do anything
// fancy with timers.
// TODO(casper): add test for OptMinCheckpointFrequency.
checkpointFreq, err := opts.GetMinCheckpointFrequency()
if err != nil {
return nil, err
}
if checkpointFreq != nil {
ca.flushFrequency = *checkpointFreq
} else {
ca.flushFrequency = changefeedbase.DefaultMinCheckpointFrequency
}
return ca, nil
}
// MustBeStreaming implements the execinfra.Processor interface.
func (ca *changeAggregator) MustBeStreaming() bool {
return true
}
// wrapMetricsController wraps the supplied metricsRecorder to emit metrics to telemetry.
// This method modifies ca.cancel().
func (ca *changeAggregator) wrapMetricsController(
ctx context.Context, recorder metricsRecorder,
) (metricsRecorder, error) {
job, err := ca.flowCtx.Cfg.JobRegistry.LoadJob(ctx, ca.spec.JobID)
if err != nil {
return ca.sliMetrics, err
}
recorderWithTelemetry, err := wrapMetricsRecorderWithTelemetry(ctx, job, ca.flowCtx.Cfg.Settings, recorder)
if err != nil {
return ca.sliMetrics, err
}
ca.closeTelemetryRecorder = recorderWithTelemetry.close
return recorderWithTelemetry, nil
}
// Start is part of the RowSource interface.
func (ca *changeAggregator) Start(ctx context.Context) {
// Derive a separate context so that we can shut down the poller.
ctx, ca.cancel = ca.flowCtx.Stopper().WithCancelOnQuiesce(ctx)
if ca.spec.JobID != 0 {
ctx = logtags.AddTag(ctx, "job", ca.spec.JobID)
}
ctx = ca.StartInternal(ctx, changeAggregatorProcName)
spans, err := ca.setupSpansAndFrontier()
feed := makeChangefeedConfigFromJobDetails(ca.spec.Feed)
opts := feed.Opts
if err != nil {
ca.MoveToDraining(err)
ca.cancel()
return
}
timestampOracle := &changeAggregatorLowerBoundOracle{
sf: ca.frontier.SpanFrontier(),
initialInclusiveLowerBound: feed.ScanTime,
}
if cfKnobs, ok := ca.flowCtx.TestingKnobs().Changefeed.(*TestingKnobs); ok {
ca.knobs = *cfKnobs
}
// The job registry has a set of metrics used to monitor the various jobs it
// runs. They're all stored as the `metric.Struct` interface because of
// dependency cycles.
ca.metrics = ca.flowCtx.Cfg.JobRegistry.MetricsStruct().Changefeed.(*Metrics)
scope, _ := opts.GetMetricScope()
ca.sliMetrics, err = ca.metrics.getSLIMetrics(scope)
if err != nil {
ca.MoveToDraining(err)
ca.cancel()
return
}
ca.sliMetricsID = ca.sliMetrics.claimId()
// TODO(jayant): add support for sinkless changefeeds using UUID
recorder := metricsRecorder(ca.sliMetrics)
if !ca.isSinkless() {
recorder, err = ca.wrapMetricsController(ctx, recorder)
if err != nil {
ca.MoveToDraining(err)
ca.cancel()
return
}
}
ca.sink, err = getEventSink(ctx, ca.flowCtx.Cfg, ca.spec.Feed, timestampOracle,
ca.spec.User(), ca.spec.JobID, recorder)
if err != nil {
err = changefeedbase.MarkRetryableError(err)
ca.MoveToDraining(err)
ca.cancel()
return
}
// This is the correct point to set up certain hooks depending on the sink
// type.
if b, ok := ca.sink.(*bufferSink); ok {
ca.changedRowBuf = &b.buf
}
// If the initial scan was disabled the highwater would've already been forwarded
needsInitialScan := ca.frontier.Frontier().IsEmpty()
// The "HighWater" of the KVFeed is the timestamp it will begin streaming
// change events from. When there's an initial scan, we want the scan to cover
// data up to the StatementTime and change events to begin from that point.
kvFeedHighWater := ca.frontier.Frontier()
if needsInitialScan {
kvFeedHighWater = ca.spec.Feed.StatementTime
}
// TODO(yevgeniy): Introduce separate changefeed monitor that's a parent
// for all changefeeds to control memory allocated to all changefeeds.
pool := ca.flowCtx.Cfg.BackfillerMonitor
if ca.knobs.MemMonitor != nil {
pool = ca.knobs.MemMonitor
}
limit := changefeedbase.PerChangefeedMemLimit.Get(&ca.flowCtx.Cfg.Settings.SV)
ca.eventProducer, ca.kvFeedDoneCh, ca.errCh, err = ca.startKVFeed(ctx, spans, kvFeedHighWater, needsInitialScan, feed, pool, limit)
if err != nil {
ca.MoveToDraining(err)
ca.cancel()
return
}
ca.sink = &errorWrapperSink{wrapped: ca.sink}
ca.eventConsumer, ca.sink, err = newEventConsumer(
ctx, ca.flowCtx.Cfg, ca.spec, feed, ca.frontier.SpanFrontier(), kvFeedHighWater,
ca.sink, ca.metrics, ca.sliMetrics, ca.knobs)
if err != nil {
ca.MoveToDraining(err)
ca.cancel()
return
}
// Init heartbeat timer.
ca.lastPush = timeutil.Now()
// Generate expensive checkpoints only after we have run for a while.
ca.lastSpanFlush = timeutil.Now()
if ca.knobs.OnDrain != nil {
ca.drainWatchCh = ca.knobs.OnDrain()
} else {
ca.drainWatchCh, ca.drainDone = ca.flowCtx.Cfg.JobRegistry.OnDrain()
}
}
// checkForNodeDrain returns an error if the node is draining.
func (ca *changeAggregator) checkForNodeDrain() error {
if ca.drainWatchCh == nil {
return errors.AssertionFailedf("cannot check for node drain if" +
" watch channel is nil")
}
select {
case <-ca.drainWatchCh:
return changefeedbase.ErrNodeDraining
default:
return nil
}
}
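// startKVFeed starts the kv feed as an async task and returns the reader side of
// its event buffer, a channel that is closed when the feed exits, and a channel
// that receives the feed's terminal error.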
func (ca *changeAggregator) startKVFeed(
ctx context.Context,
spans []roachpb.Span,
initialHighWater hlc.Timestamp,
needsInitialScan bool,
config ChangefeedConfig,
parentMemMon *mon.BytesMonitor,
memLimit int64,
) (kvevent.Reader, chan struct{}, chan error, error) {
cfg := ca.flowCtx.Cfg
kvFeedMemMon := mon.NewMonitorInheritWithLimit("kvFeed", memLimit, parentMemMon)
kvFeedMemMon.StartNoReserved(ctx, parentMemMon)
buf := kvevent.NewThrottlingBuffer(
kvevent.NewMemBuffer(kvFeedMemMon.MakeBoundAccount(), &cfg.Settings.SV, &ca.metrics.KVFeedMetrics),
cdcutils.NodeLevelThrottler(&cfg.Settings.SV, &ca.metrics.ThrottleMetrics))
// KVFeed takes ownership of the kvevent.Writer portion of the buffer, while
// we return the kvevent.Reader part to the caller.
kvfeedCfg, err := ca.makeKVFeedCfg(ctx, config, spans, buf, initialHighWater, needsInitialScan, kvFeedMemMon)
if err != nil {
return nil, nil, nil, err
}
errCh := make(chan error, 1)
doneCh := make(chan struct{})
// If RunAsyncTask immediately returns an error, the kvfeed was not run and
// will not run.
if err := ca.flowCtx.Stopper().RunAsyncTask(ctx, "changefeed-poller", func(ctx context.Context) {
defer close(doneCh)
defer kvFeedMemMon.Stop(ctx)
errCh <- kvfeed.Run(ctx, kvfeedCfg)
}); err != nil {
return nil, nil, nil, err
}
return buf, doneCh, errCh, nil
}
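// waitForKVFeedDone blocks until the kv feed goroutine has exited.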
func (ca *changeAggregator) waitForKVFeedDone() {
if ca.kvFeedDoneCh != nil {
<-ca.kvFeedDoneCh
}
}
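// checkKVFeedErr returns the kv feed's error, if it has produced one, without
// blocking.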
func (ca *changeAggregator) checkKVFeedErr() error {
select {
case err := <-ca.errCh:
return err
default:
return nil
}
}
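// makeKVFeedCfg builds the kvfeed.Config used to run the kv feed for this
// aggregator, including its schema feed and checkpoint state.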
func (ca *changeAggregator) makeKVFeedCfg(
ctx context.Context,
config ChangefeedConfig,
spans []roachpb.Span,
buf kvevent.Writer,
initialHighWater hlc.Timestamp,
needsInitialScan bool,
memMon *mon.BytesMonitor,
) (kvfeed.Config, error) {
schemaChange, err := config.Opts.GetSchemaChangeHandlingOptions()
if err != nil {
return kvfeed.Config{}, err
}
filters := config.Opts.GetFilters()
cfg := ca.flowCtx.Cfg
initialScanOnly := config.EndTime.EqOrdering(initialHighWater)
var sf schemafeed.SchemaFeed
if schemaChange.Policy == changefeedbase.OptSchemaChangePolicyIgnore || initialScanOnly {
sf = schemafeed.DoNothingSchemaFeed
} else {
sf = schemafeed.New(ctx, cfg, schemaChange.EventClass, AllTargets(ca.spec.Feed),
initialHighWater, &ca.metrics.SchemaFeedMetrics, config.Opts.GetCanHandle())
}
return kvfeed.Config{
Writer: buf,
Settings: cfg.Settings,
DB: cfg.DB.KV(),
Codec: cfg.Codec,
Clock: cfg.DB.KV().Clock(),
Spans: spans,
CheckpointSpans: ca.spec.Checkpoint.Spans,
CheckpointTimestamp: ca.spec.Checkpoint.Timestamp,
Targets: AllTargets(ca.spec.Feed),
Metrics: &ca.metrics.KVFeedMetrics,
OnBackfillCallback: ca.sliMetrics.getBackfillCallback(),
OnBackfillRangeCallback: ca.sliMetrics.getBackfillRangeCallback(),
MM: memMon,
InitialHighWater: initialHighWater,
EndTime: config.EndTime,
WithDiff: filters.WithDiff,
NeedsInitialScan: needsInitialScan,
SchemaChangeEvents: schemaChange.EventClass,
SchemaChangePolicy: schemaChange.Policy,
SchemaFeed: sf,
Knobs: ca.knobs.FeedKnobs,
UseMux: changefeedbase.UseMuxRangeFeed.Get(&cfg.Settings.SV),
RangeObserver: makeLaggingRangesObserver(ctx, cfg.Settings, ca.sliMetrics),
}, nil
}
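// makeLaggingRangesObserver returns a RangeObserver that periodically counts
// ranges whose resolved timestamps lag behind a configured threshold and
// reports that count via the lagging ranges SLI metric.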
func makeLaggingRangesObserver(
ctx context.Context, settings *cluster.Settings, sliMetrics *sliMetrics,
) func(fn kvcoord.ForEachRangeFn) {
updateLaggingRanges := sliMetrics.getLaggingRangesCallback()
return func(fn kvcoord.ForEachRangeFn) {
go func() {
log.Info(ctx, "starting changefeed lagging ranges observer")
// Reset metrics on shutdown.
defer func() {
updateLaggingRanges(0)
}()
for {
select {
case <-ctx.Done():
log.Infof(ctx, "changefeed lagging ranges observer shutting down: %s", ctx.Err())
return
case <-time.After(changefeedbase.LaggingRangesCheckFrequency.Get(&settings.SV)):
count := int64(0)
thresholdTS := timeutil.Now().Add(-1 * changefeedbase.LaggingRangesThreshold.Get(&settings.SV))
i := 0
err := fn(func(rfCtx kvcoord.RangeFeedContext, feed kvcoord.PartialRangeFeed) error {
// The resolved timestamp of a range is the timestamp up to which it has caught up.
// However, during catchup scans this is not set, so for catchup scans
// we consider the time the partial rangefeed was created to be its starting time.
ts := feed.Resolved
if ts.IsEmpty() {
ts = hlc.Timestamp{WallTime: feed.CreatedTime.UnixNano()}
}
i += 1
if ts.Less(hlc.Timestamp{WallTime: thresholdTS.UnixNano()}) {
count += 1
}
return nil
})
// We expect `fn` to only return errors which are returned by the callback above.
// Since the callback never returns an error, we expect no errors to occur.
if err != nil {
logcrash.ReportOrPanic(ctx, &settings.SV, "changefeed lagging ranges observer encountered error: %v", err)
}
updateLaggingRanges(count)
}
}
}()
}
}
// setupSpansAndFrontier is called on start to extract the spans for this changefeed
// as a slice and to create a span frontier with the initial resolved timestamps. This
// SpanFrontier only tracks the spans being watched on this node. There is a
// different SpanFrontier elsewhere for the entire changefeed. This object is
// used to filter out some previously emitted rows, and by the cloudStorageSink
// to name its output files in lexicographically monotonic fashion.
func (ca *changeAggregator) setupSpansAndFrontier() (spans []roachpb.Span, err error) {
var initialHighWater hlc.Timestamp
spans = make([]roachpb.Span, 0, len(ca.spec.Watches))
for _, watch := range ca.spec.Watches {
if initialHighWater.IsEmpty() || watch.InitialResolved.Less(initialHighWater) {
initialHighWater = watch.InitialResolved
}
spans = append(spans, watch.Span)
}
ca.frontier, err = makeSchemaChangeFrontier(initialHighWater, spans...)
if err != nil {
return nil, err
}
if initialHighWater.IsEmpty() {
// If we are performing initial scan, set frontier initialHighWater
// to the StatementTime -- this is the time we will be scanning spans.
// Spans that reach this time are eligible for checkpointing.
ca.frontier.initialHighWater = ca.spec.Feed.StatementTime
}
checkpointedSpanTs := ca.spec.Checkpoint.Timestamp
// Checkpoint records from 21.2 were used only for backfills and did not store
// the timestamp, since in a backfill it must either be the StatementTime for
// an initial backfill, or right after the high-water for schema backfills.
if checkpointedSpanTs.IsEmpty() {
if initialHighWater.IsEmpty() {
checkpointedSpanTs = ca.spec.Feed.StatementTime
} else {
checkpointedSpanTs = initialHighWater.Next()
}
}
// Checkpointed spans are spans that were above the highwater mark, and we
// must preserve that information in the frontier for future checkpointing.
for _, checkpointedSpan := range ca.spec.Checkpoint.Spans {
if _, err := ca.frontier.Forward(checkpointedSpan, checkpointedSpanTs); err != nil {
return nil, err
}
}
return spans, nil
}
// close has two purposes: to synchronize on the completion of the helper
// goroutines created by the Start method, and to clean up any resources used by
// the processor.
//
// Because this method may be called even if the processor never finished
// starting up, there is an excessive amount of nil checking. For example,
// (*changeAggregator) Start() may encounter an error and move the processor to
// draining before one of the fields below (ex. ca.drainDone) is set.
func (ca *changeAggregator) close() {
if ca.Closed {
return
}
ca.cancel()
// Wait for the poller to finish shutting down.
ca.waitForKVFeedDone()
if ca.drainDone != nil {
ca.drainDone()
}
if ca.eventConsumer != nil {
_ = ca.eventConsumer.Close() // context cancellation expected here.
}
if ca.closeTelemetryRecorder != nil {
ca.closeTelemetryRecorder()
}
if ca.sink != nil {
// Best effort: the context is often canceled by now, so we expect to see an error.
_ = ca.sink.Close()
}
ca.closeMetrics()
ca.memAcc.Close(ca.Ctx())
ca.MemMonitor.Stop(ca.Ctx())
ca.InternalClose()
}
var aggregatorHeartbeatFrequency = settings.RegisterDurationSetting(
settings.TenantWritable,
"changefeed.aggregator.heartbeat",
"changefeed aggregator will emit a heartbeat message to the coordinator with this frequency; 0 disables. "+
"The setting value should be <=1/2 of server.shutdown.jobs_wait period",
4*time.Second,
settings.NonNegativeDuration,
)
// Next is part of the RowSource interface.
func (ca *changeAggregator) Next() (rowenc.EncDatumRow, *execinfrapb.ProducerMetadata) {
shouldEmitHeartBeat := func() bool {
freq := aggregatorHeartbeatFrequency.Get(&ca.FlowCtx.Cfg.Settings.SV)
return freq > 0 && timeutil.Since(ca.lastPush) > freq
}
// helper to iterate frontier and return the list of changefeed frontier spans.
getFrontierSpans := func() (spans []execinfrapb.ChangefeedMeta_FrontierSpan) {
ca.frontier.Entries(func(r roachpb.Span, ts hlc.Timestamp) (done span.OpResult) {
spans = append(spans,
execinfrapb.ChangefeedMeta_FrontierSpan{
Span: r,
Timestamp: ts,
})
return span.ContinueMatch
})
return spans
}
for ca.State == execinfra.StateRunning {
if !ca.changedRowBuf.IsEmpty() {
ca.lastPush = timeutil.Now()
return ca.ProcessRowHelper(ca.changedRowBuf.Pop()), nil
} else if !ca.resolvedSpanBuf.IsEmpty() {
ca.lastPush = timeutil.Now()
return ca.ProcessRowHelper(ca.resolvedSpanBuf.Pop()), nil
} else if shouldEmitHeartBeat() {
// A heartbeat is simply an attempt to push a message to the coordinator.
// This mechanism allows the coordinator to propagate shutdown information to
// all aggregators -- that is, an inability to push will cause the aggregator
// to transition away from StateRunning.
ca.lastPush = timeutil.Now()
return nil, &execinfrapb.ProducerMetadata{
Changefeed: &execinfrapb.ChangefeedMeta{Heartbeat: true},
}
}
// As the last gasp before shutdown, transmit up-to-date frontier
// information to the coordinator. We expect to get this signal via the
// polling below before the drain actually occurs and starts tearing
// things down.
if err := ca.checkForNodeDrain(); err != nil {
nodeID, _ := ca.FlowCtx.Cfg.NodeID.OptionalNodeID()
meta := &execinfrapb.ChangefeedMeta{
DrainInfo: &execinfrapb.ChangefeedMeta_DrainInfo{NodeID: nodeID},
Checkpoint: getFrontierSpans(),
}
ca.AppendTrailingMeta(execinfrapb.ProducerMetadata{Changefeed: meta})
ca.shutdownCheckpointEmitted = true
ca.cancel()
ca.MoveToDraining(err)
break
}
if err := ca.tick(); err != nil {
var e kvevent.ErrBufferClosed
if errors.As(err, &e) {
// ErrBufferClosed is a signal that our kvfeed has exited as expected.
err = e.Unwrap()
if errors.Is(err, kvevent.ErrNormalRestartReason) {
err = nil
}
} else {
// If the poller errored first, that's the
// interesting one, so overwrite `err`.
if kvFeedErr := ca.checkKVFeedErr(); kvFeedErr != nil {
err = kvFeedErr
}
}
// Shut down the poller if it wasn't already.
ca.cancel()
ca.MoveToDraining(err)
break
}
}
if !ca.shutdownCheckpointEmitted {
// Aggregator shutdown may be initiated by the coordinator.
// Emit up-to-date frontier information in case the coordinator restarts.
ca.shutdownCheckpointEmitted = true
ca.AppendTrailingMeta(execinfrapb.ProducerMetadata{
Changefeed: &execinfrapb.ChangefeedMeta{Checkpoint: getFrontierSpans()},
})
}
return nil, ca.DrainHelper()
}
// tick is the workhorse behind Next(). It retrieves the next event from
// kvFeed, sends off this event to the event consumer, and flushes the sink
// if necessary.
func (ca *changeAggregator) tick() error {
event, err := ca.eventProducer.Get(ca.Ctx())
if err != nil {
return err
}
queuedNanos := timeutil.Since(event.BufferAddTimestamp()).Nanoseconds()
ca.metrics.QueueTimeNanos.Inc(queuedNanos)
switch event.Type() {
case kvevent.TypeKV:
// Keep track of SLI latency for non-backfill/rangefeed KV events.
if event.BackfillTimestamp().IsEmpty() {
ca.sliMetrics.AdmitLatency.RecordValue(timeutil.Since(event.Timestamp().GoTime()).Nanoseconds())
}
ca.recentKVCount++
return ca.eventConsumer.ConsumeEvent(ca.Ctx(), event)
case kvevent.TypeResolved:
a := event.DetachAlloc()
a.Release(ca.Ctx())
resolved := event.Resolved()
if ca.knobs.FilterSpanWithMutation != nil {
shouldFilter, err := ca.knobs.FilterSpanWithMutation(&resolved)
if err != nil {
return err
}
if shouldFilter {
return nil
}
}
return ca.noteResolvedSpan(resolved)
case kvevent.TypeFlush:
return ca.flushBufferedEvents()
}
return nil
}
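// flushBufferedEvents flushes the event consumer and then the sink.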
func (ca *changeAggregator) flushBufferedEvents() error {
if err := ca.eventConsumer.Flush(ca.Ctx()); err != nil {
return err
}
return ca.sink.Flush(ca.Ctx())
}
// noteResolvedSpan periodically flushes Frontier progress from the current
// changeAggregator node to the changeFrontier node to allow the changeFrontier
// to persist the overall changefeed's progress
func (ca *changeAggregator) noteResolvedSpan(resolved jobspb.ResolvedSpan) error {
if resolved.Timestamp.IsEmpty() {
// @0.0 resolved timestamps could come in from a rangefeed checkpoint.
// When a rangefeed starts running, it emits a @0.0 resolved timestamp.
// We don't care about those as far as checkpointing is concerned.
return nil
}
advanced, err := ca.frontier.ForwardResolvedSpan(resolved)
if err != nil {
return err
}
// The resolved sliMetric data backs the aggregator_progress metric
if advanced {
ca.sliMetrics.setResolved(ca.sliMetricsID, ca.frontier.Frontier())
}
forceFlush := resolved.BoundaryType != jobspb.ResolvedSpan_NONE
checkpointFrontier := advanced &&
(forceFlush || timeutil.Since(ca.lastHighWaterFlush) > ca.flushFrequency)
if checkpointFrontier {
defer func() {
ca.lastHighWaterFlush = timeutil.Now()
}()
return ca.flushFrontier()
}
// At a lower frequency we checkpoint specific spans in the job progress
// either in backfills or if the highwater mark is excessively lagging behind
checkpointSpans := ca.spec.JobID != 0 && /* enterprise changefeed */
(resolved.Timestamp.Equal(ca.frontier.BackfillTS()) ||
ca.frontier.hasLaggingSpans(ca.spec.Feed.StatementTime, &ca.flowCtx.Cfg.Settings.SV)) &&
canCheckpointSpans(&ca.flowCtx.Cfg.Settings.SV, ca.lastSpanFlush)
if checkpointSpans {
defer func() {
ca.lastSpanFlush = timeutil.Now()
}()
return ca.flushFrontier()
}
return nil
}
// flushFrontier flushes sink and emits resolved timestamp if needed.
func (ca *changeAggregator) flushFrontier() error {
// Make sure to flush the sink before forwarding resolved spans,
// otherwise, we could lose buffered messages and violate the
// at-least-once guarantee. This is also true for checkpointing the
// resolved spans in the job progress.
if err := ca.flushBufferedEvents(); err != nil {
return err
}
// Iterate frontier spans and build a list of spans to emit.
var batch jobspb.ResolvedSpans
ca.frontier.Entries(func(s roachpb.Span, ts hlc.Timestamp) span.OpResult {
boundaryType := jobspb.ResolvedSpan_NONE
if ca.frontier.boundaryTime.Equal(ts) {
boundaryType = ca.frontier.boundaryType
}
batch.ResolvedSpans = append(batch.ResolvedSpans, jobspb.ResolvedSpan{
Span: s,
Timestamp: ts,
BoundaryType: boundaryType,
})
return span.ContinueMatch
})
return ca.emitResolved(batch)
}
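// emitResolved marshals the batch of resolved spans into a progress update row
// and pushes it onto resolvedSpanBuf to be sent to the changeFrontier.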
func (ca *changeAggregator) emitResolved(batch jobspb.ResolvedSpans) error {
progressUpdate := jobspb.ResolvedSpans{
ResolvedSpans: batch.ResolvedSpans,
Stats: jobspb.ResolvedSpans_Stats{
RecentKvCount: ca.recentKVCount,
},
}
updateBytes, err := protoutil.Marshal(&progressUpdate)
if err != nil {
return err
}
ca.resolvedSpanBuf.Push(rowenc.EncDatumRow{
rowenc.EncDatum{Datum: tree.NewDBytes(tree.DBytes(updateBytes))},
rowenc.EncDatum{Datum: tree.DNull}, // topic
rowenc.EncDatum{Datum: tree.DNull}, // key
rowenc.EncDatum{Datum: tree.DNull}, // value
})
ca.metrics.ResolvedMessages.Inc(1)
ca.recentKVCount = 0
return nil
}
// closeMetrics de-registers the aggregator from the sliMetrics registry so that
// it's no longer considered by the aggregator_progress gauge
func (ca *changeAggregator) closeMetrics() {
ca.sliMetrics.closeId(ca.sliMetricsID)
}
// ConsumerClosed is part of the RowSource interface.
func (ca *changeAggregator) ConsumerClosed() {
// The consumer is done, Next() will not be called again.
ca.close()
}
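// isSinkless returns true if this changefeed has no associated job, i.e. it is
// a core/sinkless changefeed.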
func (ca *changeAggregator) isSinkless() bool {
return ca.spec.JobID == 0
}
const (
emitAllResolved = 0
emitNoResolved = -1
)
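// changeFrontier is a processor that receives resolved span updates from the
// changeAggregators, tracks the changefeed-wide frontier, periodically
// checkpoints progress to the job record, and emits resolved timestamps to the
// sink when configured.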
type changeFrontier struct {
execinfra.ProcessorBase
flowCtx *execinfra.FlowCtx
spec execinfrapb.ChangeFrontierSpec
memAcc mon.BoundAccount
a tree.DatumAlloc
// input returns rows from one or more changeAggregator processors
input execinfra.RowSource
// frontier contains the current resolved timestamp high-water for the tracked
// span set.
frontier *schemaChangeFrontier
// localState contains an in memory cache of progress updates.
// Used by core style changefeeds as well as regular changefeeds to make
// restarts more efficient with respect to duplicates.
// localState reflects up-to-date information *after* the checkpoint.
localState *cachedState
// encoder is the Encoder to use for resolved timestamp serialization.
encoder Encoder
// sink is the Sink to write resolved timestamps to. Rows are never written
// by changeFrontier.
sink ResolvedTimestampSink
// freqEmitResolved, if >= 0, is a lower bound on the duration between
// resolved timestamp emits.
freqEmitResolved time.Duration
// lastEmitResolved is the last time a resolved timestamp was emitted.
lastEmitResolved time.Time
// slowLogEveryN rate-limits the logging of slow spans
slowLogEveryN log.EveryN
// lastProtectedTimestampUpdate is the last time the protected timestamp
// record was updated to the frontier's highwater mark
lastProtectedTimestampUpdate time.Time
// js, if non-nil, is called to checkpoint the changefeed's
// progress in the corresponding system job entry.
js *jobState
// highWaterAtStart is the greater of the job high-water and the timestamp the
// CHANGEFEED statement was run at. It's used in an assertion that we never
// regress the job high-water.
highWaterAtStart hlc.Timestamp
// passthroughBuf, in some but not all flows, contains changed row data to
// pass through unchanged to the gateway node.
passthroughBuf encDatumRowBuffer
// resolvedBuf, if non-nil, contains rows indicating a changefeed-level
// resolved timestamp to be returned. It depends on everything in
// `passthroughBuf` being sent, so that one needs to be emptied first.
resolvedBuf *encDatumRowBuffer
// metrics are monitoring counters shared between all changefeeds.
metrics *Metrics
sliMetrics *sliMetrics
// metricsID and sliMetricsID uniquely identify the changefeed in the metrics
// map (a struct shared across all changefeeds on the node) and the sliMetrics
// map (shared between all feeds within the same scope on the node), respectively.
metricsID int
sliMetricsID int64
knobs TestingKnobs
}
const (
runStatusUpdateFrequency time.Duration = time.Minute
slowSpanMaxFrequency = 10 * time.Second
)
// jobState encapsulates changefeed job state.
type jobState struct {
// job is set for changefeeds other than core/sinkless changefeeds.
job *jobs.Job
settings *cluster.Settings
metrics *Metrics
ts timeutil.TimeSource
// The last time we updated job run status.
lastRunStatusUpdate time.Time
// The last time we updated job progress.
lastProgressUpdate time.Time
// How long a checkpoint (job progress update) is expected to take.
checkpointDuration time.Duration
// Flag set if we skip some updates due to rapid progress update requests.
progressUpdatesSkipped bool
}
// cachedState is a changefeed progress stored in memory.
// It is used to reduce the number of duplicate events emitted during retries.
type cachedState struct {
progress jobspb.Progress
// set of spans for this changefeed.
trackedSpans roachpb.Spans
// aggregatorFrontier contains the list of frontier spans
// emitted by aggregators when they are being shut down.
aggregatorFrontier []execinfrapb.ChangefeedMeta_FrontierSpan
// drainingNodes is the list of nodes that are draining.
drainingNodes []roachpb.NodeID
}
// SetHighwater implements the eval.ChangefeedState interface.
func (cs *cachedState) SetHighwater(frontier hlc.Timestamp) {
cs.progress.Progress = &jobspb.Progress_HighWater{
HighWater: &frontier,
}
}
// SetCheckpoint implements the eval.ChangefeedState interface.
func (cs *cachedState) SetCheckpoint(spans []roachpb.Span, timestamp hlc.Timestamp) {
changefeedProgress := cs.progress.Details.(*jobspb.Progress_Changefeed).Changefeed
changefeedProgress.Checkpoint = &jobspb.ChangefeedProgress_Checkpoint{
Spans: spans,
Timestamp: timestamp,
}
}
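// newJobState constructs a jobState for the given job using the supplied
// settings, metrics, and time source.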
func newJobState(
j *jobs.Job, st *cluster.Settings, metrics *Metrics, ts timeutil.TimeSource,
) *jobState {
return &jobState{
job: j,
settings: st,
metrics: metrics,
ts: ts,
lastProgressUpdate: ts.Now(),
}
}