ingester.go
package ingester
import (
"context"
"flag"
"fmt"
"io"
"math"
"net/http"
"os"
"path/filepath"
"strings"
"sync"
"time"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/gogo/status"
"github.com/oklog/ulid"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/model/exemplar"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/thanos-io/objstore"
"github.com/thanos-io/thanos/pkg/block/metadata"
"github.com/thanos-io/thanos/pkg/shipper"
"github.com/thanos-io/thanos/pkg/store/storepb"
"github.com/weaveworks/common/httpgrpc"
"go.uber.org/atomic"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc/codes"
"github.com/cortexproject/cortex/pkg/chunk/encoding"
"github.com/cortexproject/cortex/pkg/cortexpb"
"github.com/cortexproject/cortex/pkg/ingester/client"
"github.com/cortexproject/cortex/pkg/querysharding"
"github.com/cortexproject/cortex/pkg/ring"
"github.com/cortexproject/cortex/pkg/storage/bucket"
cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb"
"github.com/cortexproject/cortex/pkg/tenant"
"github.com/cortexproject/cortex/pkg/util"
"github.com/cortexproject/cortex/pkg/util/concurrency"
"github.com/cortexproject/cortex/pkg/util/extract"
logutil "github.com/cortexproject/cortex/pkg/util/log"
util_math "github.com/cortexproject/cortex/pkg/util/math"
"github.com/cortexproject/cortex/pkg/util/services"
"github.com/cortexproject/cortex/pkg/util/spanlogger"
"github.com/cortexproject/cortex/pkg/util/validation"
)
const (
// RingKey is the key under which we store the ingesters ring in the KVStore.
RingKey = "ring"
)
const (
errTSDBCreateIncompatibleState = "cannot create a new TSDB while the ingester is not in active state (current state: %s)"
errTSDBIngest = "err: %v. timestamp=%s, series=%s" // Using error.Wrap puts the message before the error and if the series is too long, its truncated.
errTSDBIngestExemplar = "err: %v. timestamp=%s, series=%s, exemplar=%s"
// Jitter applied to the idle timeout to prevent compaction in all ingesters concurrently.
compactionIdleTimeoutJitter = 0.25
instanceIngestionRateTickInterval = time.Second
// Number of timeseries to return in each batch of a QueryStream.
queryStreamBatchSize = 128
metadataStreamBatchSize = 128
// Discarded Metadata metric labels.
perUserMetadataLimit = "per_user_metadata_limit"
perMetricMetadataLimit = "per_metric_metadata_limit"
// Period at which to attempt purging metadata from memory.
metadataPurgePeriod = 5 * time.Minute
)
var (
errExemplarRef = errors.New("exemplars not ingested because series not already present")
errIngesterStopping = errors.New("ingester stopping")
)
// Config for an Ingester.
type Config struct {
LifecyclerConfig ring.LifecyclerConfig `yaml:"lifecycler"`
// Config for metadata purging.
MetadataRetainPeriod time.Duration `yaml:"metadata_retain_period"`
RateUpdatePeriod time.Duration `yaml:"rate_update_period"`
ActiveSeriesMetricsEnabled bool `yaml:"active_series_metrics_enabled"`
ActiveSeriesMetricsUpdatePeriod time.Duration `yaml:"active_series_metrics_update_period"`
ActiveSeriesMetricsIdleTimeout time.Duration `yaml:"active_series_metrics_idle_timeout"`
// Use blocks storage.
BlocksStorageConfig cortex_tsdb.BlocksStorageConfig `yaml:"-"`
// Injected at runtime and read from the distributor config, required
// to accurately apply global limits.
DistributorShardingStrategy string `yaml:"-"`
DistributorShardByAllLabels bool `yaml:"-"`
// Injected at runtime and read from querier config.
QueryStoreForLabels bool `yaml:"-"`
QueryIngestersWithin time.Duration `yaml:"-"`
DefaultLimits InstanceLimits `yaml:"instance_limits"`
InstanceLimitsFn func() *InstanceLimits `yaml:"-"`
IgnoreSeriesLimitForMetricNames string `yaml:"ignore_series_limit_for_metric_names"`
// For testing, you can override the address and ID of this ingester.
ingesterClientFactory func(addr string, cfg client.Config) (client.HealthAndIngesterClient, error)
}
// RegisterFlags adds the flags required to config this to the given FlagSet
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
cfg.LifecyclerConfig.RegisterFlags(f)
f.DurationVar(&cfg.MetadataRetainPeriod, "ingester.metadata-retain-period", 10*time.Minute, "Period at which metadata we have not seen will remain in memory before being deleted.")
f.DurationVar(&cfg.RateUpdatePeriod, "ingester.rate-update-period", 15*time.Second, "Period with which to update the per-user ingestion rates.")
f.BoolVar(&cfg.ActiveSeriesMetricsEnabled, "ingester.active-series-metrics-enabled", true, "Enable tracking of active series and export them as metrics.")
f.DurationVar(&cfg.ActiveSeriesMetricsUpdatePeriod, "ingester.active-series-metrics-update-period", 1*time.Minute, "How often to update active series metrics.")
f.DurationVar(&cfg.ActiveSeriesMetricsIdleTimeout, "ingester.active-series-metrics-idle-timeout", 10*time.Minute, "After what time a series is considered to be inactive.")
f.Float64Var(&cfg.DefaultLimits.MaxIngestionRate, "ingester.instance-limits.max-ingestion-rate", 0, "Max ingestion rate (samples/sec) that ingester will accept. This limit is per-ingester, not per-tenant. Additional push requests will be rejected. Current ingestion rate is computed as exponentially weighted moving average, updated every second. This limit only works when using blocks engine. 0 = unlimited.")
f.Int64Var(&cfg.DefaultLimits.MaxInMemoryTenants, "ingester.instance-limits.max-tenants", 0, "Max users that this ingester can hold. Requests from additional users will be rejected. This limit only works when using blocks engine. 0 = unlimited.")
f.Int64Var(&cfg.DefaultLimits.MaxInMemorySeries, "ingester.instance-limits.max-series", 0, "Max series that this ingester can hold (across all tenants). Requests to create additional series will be rejected. This limit only works when using blocks engine. 0 = unlimited.")
f.Int64Var(&cfg.DefaultLimits.MaxInflightPushRequests, "ingester.instance-limits.max-inflight-push-requests", 0, "Max inflight push requests that this ingester can handle (across all tenants). Additional requests will be rejected. 0 = unlimited.")
f.StringVar(&cfg.IgnoreSeriesLimitForMetricNames, "ingester.ignore-series-limit-for-metric-names", "", "Comma-separated list of metric names, for which -ingester.max-series-per-metric and -ingester.max-global-series-per-metric limits will be ignored. Does not affect max-series-per-user or max-global-series-per-metric limits.")
}
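// Illustrative flag usage (values are hypothetical): running an ingester with
// -ingester.instance-limits.max-series=1500000 and -ingester.instance-limits.max-tenants=100
// caps this single ingester at 1.5M in-memory series and 100 tenants, independent of any
// per-tenant limits configured elsewhere.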
func (cfg *Config) getIgnoreSeriesLimitForMetricNamesMap() map[string]struct{} {
if cfg.IgnoreSeriesLimitForMetricNames == "" {
return nil
}
result := map[string]struct{}{}
for _, s := range strings.Split(cfg.IgnoreSeriesLimitForMetricNames, ",") {
tr := strings.TrimSpace(s)
if tr != "" {
result[tr] = struct{}{}
}
}
if len(result) == 0 {
return nil
}
return result
}
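// For example (hypothetical input): with IgnoreSeriesLimitForMetricNames set to
// "up, node_cpu_seconds_total,", the returned map is {"up": {}, "node_cpu_seconds_total": {}};
// whitespace is trimmed and empty entries produced by stray commas are dropped.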
// Ingester deals with "in flight" chunks. Based on Prometheus 1.x
// MemorySeriesStorage.
type Ingester struct {
*services.BasicService
cfg Config
metrics *ingesterMetrics
logger log.Logger
lifecycler *ring.Lifecycler
limits *validation.Overrides
limiter *Limiter
subservicesWatcher *services.FailureWatcher
stoppedMtx sync.RWMutex // protects stopped
stopped bool // protected by stoppedMtx
// For storing metadata ingested.
usersMetadataMtx sync.RWMutex
usersMetadata map[string]*userMetricsMetadata
// Prometheus block storage
TSDBState TSDBState
// Rate of pushed samples. Only used by V2-ingester to limit global samples push rate.
ingestionRate *util_math.EwmaRate
inflightPushRequests atomic.Int64
}
// Shipper interface is used to have an easy way to mock it in tests.
type Shipper interface {
Sync(ctx context.Context) (uploaded int, err error)
}
type tsdbState int
const (
active tsdbState = iota // Pushes are allowed.
activeShipping // Pushes are allowed. Blocks shipping is in progress.
forceCompacting // TSDB is being force-compacted.
closing // Used while closing idle TSDB.
closed // Used to avoid setting closing back to active in closeAndDeleteIdleUsers method.
)
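// State changes go through casState below: compactHead moves active <-> forceCompacting around a
// forced head compaction, while the shipping and idle-close code paths (later in this file) use
// activeShipping and closing -> closed in the same way, so pushes are only rejected while a forced
// compaction or a close is in progress.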
// Describes result of TSDB-close check. String is used as metric label.
type tsdbCloseCheckResult string
const (
tsdbIdle tsdbCloseCheckResult = "idle" // Not reported via metrics. Metrics use tsdbIdleClosed on success.
tsdbShippingDisabled tsdbCloseCheckResult = "shipping_disabled"
tsdbNotIdle tsdbCloseCheckResult = "not_idle"
tsdbNotCompacted tsdbCloseCheckResult = "not_compacted"
tsdbNotShipped tsdbCloseCheckResult = "not_shipped"
tsdbCheckFailed tsdbCloseCheckResult = "check_failed"
tsdbCloseFailed tsdbCloseCheckResult = "close_failed"
tsdbNotActive tsdbCloseCheckResult = "not_active"
tsdbDataRemovalFailed tsdbCloseCheckResult = "data_removal_failed"
tsdbTenantMarkedForDeletion tsdbCloseCheckResult = "tenant_marked_for_deletion"
tsdbIdleClosed tsdbCloseCheckResult = "idle_closed" // Success.
)
func (r tsdbCloseCheckResult) shouldClose() bool {
return r == tsdbIdle || r == tsdbTenantMarkedForDeletion
}
type userTSDB struct {
db *tsdb.DB
userID string
activeSeries *ActiveSeries
seriesInMetric *metricCounter
limiter *Limiter
instanceSeriesCount *atomic.Int64 // Shared across all userTSDB instances created by ingester.
instanceLimitsFn func() *InstanceLimits
stateMtx sync.RWMutex
state tsdbState
pushesInFlight sync.WaitGroup // Increased with stateMtx read lock held, only if state == active or activeShipping.
// Used to detect idle TSDBs.
lastUpdate atomic.Int64
// Thanos shipper used to ship blocks to the storage.
shipper Shipper
// When a deletion marker is found for the tenant (checked before shipping),
// shipping stops and the TSDB is closed before the idle timeout is reached (if enabled).
deletionMarkFound atomic.Bool
// Unix timestamp of last deletion mark check.
lastDeletionMarkCheck atomic.Int64
// for statistics
ingestedAPISamples *util_math.EwmaRate
ingestedRuleSamples *util_math.EwmaRate
// Cached shipped blocks.
shippedBlocksMtx sync.Mutex
shippedBlocks map[ulid.ULID]struct{}
}
// Explicitly wrapping the tsdb.DB functions that we use.
func (u *userTSDB) Appender(ctx context.Context) storage.Appender {
return u.db.Appender(ctx)
}
func (u *userTSDB) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
return u.db.Querier(ctx, mint, maxt)
}
func (u *userTSDB) ChunkQuerier(ctx context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
return u.db.ChunkQuerier(ctx, mint, maxt)
}
func (u *userTSDB) ExemplarQuerier(ctx context.Context) (storage.ExemplarQuerier, error) {
return u.db.ExemplarQuerier(ctx)
}
func (u *userTSDB) Head() *tsdb.Head {
return u.db.Head()
}
func (u *userTSDB) Blocks() []*tsdb.Block {
return u.db.Blocks()
}
func (u *userTSDB) Close() error {
return u.db.Close()
}
func (u *userTSDB) Compact() error {
return u.db.Compact()
}
func (u *userTSDB) StartTime() (int64, error) {
return u.db.StartTime()
}
func (u *userTSDB) casState(from, to tsdbState) bool {
u.stateMtx.Lock()
defer u.stateMtx.Unlock()
if u.state != from {
return false
}
u.state = to
return true
}
// compactHead compacts the Head block at the specified block duration, avoiding a single huge block.
func (u *userTSDB) compactHead(blockDuration int64) error {
if !u.casState(active, forceCompacting) {
return errors.New("TSDB head cannot be compacted because it is not in active state (possibly being closed or blocks shipping in progress)")
}
defer u.casState(forceCompacting, active)
// Ingestion of samples in parallel with forced compaction can lead to overlapping blocks,
// and possible invalidation of the references returned from Appender.GetRef().
// So we wait for existing in-flight requests to finish. Future push requests would fail until compaction is over.
u.pushesInFlight.Wait()
h := u.Head()
minTime, maxTime := h.MinTime(), h.MaxTime()
for (minTime/blockDuration)*blockDuration != (maxTime/blockDuration)*blockDuration {
// Data in Head spans across multiple block ranges, so we break it into blocks here.
// Block max time is exclusive, so we do a -1 here.
blockMaxTime := ((minTime/blockDuration)+1)*blockDuration - 1
if err := u.db.CompactHead(tsdb.NewRangeHead(h, minTime, blockMaxTime)); err != nil {
return err
}
// Get current min/max times after compaction.
minTime, maxTime = h.MinTime(), h.MaxTime()
}
return u.db.CompactHead(tsdb.NewRangeHead(h, minTime, maxTime))
}
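// Worked example (illustrative numbers): with blockDuration = 2h and a head spanning
// 01:30 .. 05:45, the loop first compacts [01:30, 02:00), then [02:00, 04:00), and the
// final CompactHead call covers the remaining [04:00, 05:45] once min and max fall into
// the same 2h-aligned range.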
// PreCreation implements SeriesLifecycleCallback interface.
func (u *userTSDB) PreCreation(metric labels.Labels) error {
if u.limiter == nil {
return nil
}
// Verify ingester's global limit
gl := u.instanceLimitsFn()
if gl != nil && gl.MaxInMemorySeries > 0 {
if series := u.instanceSeriesCount.Load(); series >= gl.MaxInMemorySeries {
return errMaxSeriesLimitReached
}
}
// Total series limit.
if err := u.limiter.AssertMaxSeriesPerUser(u.userID, int(u.Head().NumSeries())); err != nil {
return err
}
// Series per metric name limit.
metricName, err := extract.MetricNameFromLabels(metric)
if err != nil {
return err
}
if err := u.seriesInMetric.canAddSeriesFor(u.userID, metricName); err != nil {
return err
}
return nil
}
// PostCreation implements SeriesLifecycleCallback interface.
func (u *userTSDB) PostCreation(metric labels.Labels) {
u.instanceSeriesCount.Inc()
metricName, err := extract.MetricNameFromLabels(metric)
if err != nil {
// This should never happen because it has already been checked in PreCreation().
return
}
u.seriesInMetric.increaseSeriesForMetric(metricName)
}
// PostDeletion implements SeriesLifecycleCallback interface.
func (u *userTSDB) PostDeletion(metrics ...labels.Labels) {
u.instanceSeriesCount.Sub(int64(len(metrics)))
for _, metric := range metrics {
metricName, err := extract.MetricNameFromLabels(metric)
if err != nil {
// This should never happen because it has already been checked in PreCreation().
continue
}
u.seriesInMetric.decreaseSeriesForMetric(metricName)
}
}
// blocksToDelete filters the input blocks and returns the blocks which are safe to be deleted from the ingester.
func (u *userTSDB) blocksToDelete(blocks []*tsdb.Block) map[ulid.ULID]struct{} {
if u.db == nil {
return nil
}
deletable := tsdb.DefaultBlocksToDelete(u.db)(blocks)
if u.shipper == nil {
return deletable
}
shippedBlocks := u.getCachedShippedBlocks()
result := map[ulid.ULID]struct{}{}
for shippedID := range shippedBlocks {
if _, ok := deletable[shippedID]; ok {
result[shippedID] = struct{}{}
}
}
return result
}
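// For instance, if TSDB marks blocks {A, B} as deletable but the shipper has only uploaded A,
// only A is returned here, so B stays on disk until it has been shipped (when shipping is
// disabled, the TSDB's own retention applies unchanged).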
// updateCachedShippedBlocks reads the shipper meta file and updates the cached shipped blocks.
func (u *userTSDB) updateCachedShippedBlocks() error {
shipperMeta, err := shipper.ReadMetaFile(u.db.Dir())
if os.IsNotExist(err) || os.IsNotExist(errors.Cause(err)) {
// If the meta file doesn't exist it means the shipper hasn't run yet.
shipperMeta = &shipper.Meta{}
} else if err != nil {
return err
}
// Build a map.
shippedBlocks := make(map[ulid.ULID]struct{}, len(shipperMeta.Uploaded))
for _, blockID := range shipperMeta.Uploaded {
shippedBlocks[blockID] = struct{}{}
}
// Cache it.
u.shippedBlocksMtx.Lock()
u.shippedBlocks = shippedBlocks
u.shippedBlocksMtx.Unlock()
return nil
}
// getCachedShippedBlocks returns the cached shipped blocks.
func (u *userTSDB) getCachedShippedBlocks() map[ulid.ULID]struct{} {
u.shippedBlocksMtx.Lock()
defer u.shippedBlocksMtx.Unlock()
// It's safe to directly return the map because it's never updated in-place.
return u.shippedBlocks
}
// getOldestUnshippedBlockTime returns the unix timestamp with milliseconds precision of the oldest
// TSDB block not shipped to the storage yet, or 0 if all blocks have been shipped.
func (u *userTSDB) getOldestUnshippedBlockTime() uint64 {
shippedBlocks := u.getCachedShippedBlocks()
oldestTs := uint64(0)
for _, b := range u.Blocks() {
if _, ok := shippedBlocks[b.Meta().ULID]; ok {
continue
}
if oldestTs == 0 || b.Meta().ULID.Time() < oldestTs {
oldestTs = b.Meta().ULID.Time()
}
}
return oldestTs
}
func (u *userTSDB) isIdle(now time.Time, idle time.Duration) bool {
lu := u.lastUpdate.Load()
return time.Unix(lu, 0).Add(idle).Before(now)
}
func (u *userTSDB) setLastUpdate(t time.Time) {
u.lastUpdate.Store(t.Unix())
}
// Checks if TSDB can be closed.
func (u *userTSDB) shouldCloseTSDB(idleTimeout time.Duration) tsdbCloseCheckResult {
if u.deletionMarkFound.Load() {
return tsdbTenantMarkedForDeletion
}
if !u.isIdle(time.Now(), idleTimeout) {
return tsdbNotIdle
}
// If head is not compacted, we cannot close this yet.
if u.Head().NumSeries() > 0 {
return tsdbNotCompacted
}
// Ensure that all blocks have been shipped.
if oldest := u.getOldestUnshippedBlockTime(); oldest > 0 {
return tsdbNotShipped
}
return tsdbIdle
}
// TSDBState holds data structures used by the TSDB storage engine
type TSDBState struct {
dbs map[string]*userTSDB // tsdb sharded by userID
bucket objstore.Bucket
// Value used by shipper as external label.
shipperIngesterID string
subservices *services.Manager
tsdbMetrics *tsdbMetrics
forceCompactTrigger chan requestWithUsersAndCallback
shipTrigger chan requestWithUsersAndCallback
// Timeout chosen for idle compactions.
compactionIdleTimeout time.Duration
// Number of series in memory, across all tenants.
seriesCount atomic.Int64
// Head compactions metrics.
compactionsTriggered prometheus.Counter
compactionsFailed prometheus.Counter
walReplayTime prometheus.Histogram
appenderAddDuration prometheus.Histogram
appenderCommitDuration prometheus.Histogram
idleTsdbChecks *prometheus.CounterVec
}
type requestWithUsersAndCallback struct {
users *util.AllowedTenants // if nil, all tenants are allowed.
callback chan<- struct{} // when compaction/shipping is finished, this channel is closed
}
func newTSDBState(bucketClient objstore.Bucket, registerer prometheus.Registerer) TSDBState {
idleTsdbChecks := promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
Name: "cortex_ingester_idle_tsdb_checks_total",
Help: "The total number of various results for idle TSDB checks.",
}, []string{"result"})
idleTsdbChecks.WithLabelValues(string(tsdbShippingDisabled))
idleTsdbChecks.WithLabelValues(string(tsdbNotIdle))
idleTsdbChecks.WithLabelValues(string(tsdbNotCompacted))
idleTsdbChecks.WithLabelValues(string(tsdbNotShipped))
idleTsdbChecks.WithLabelValues(string(tsdbCheckFailed))
idleTsdbChecks.WithLabelValues(string(tsdbCloseFailed))
idleTsdbChecks.WithLabelValues(string(tsdbNotActive))
idleTsdbChecks.WithLabelValues(string(tsdbDataRemovalFailed))
idleTsdbChecks.WithLabelValues(string(tsdbTenantMarkedForDeletion))
idleTsdbChecks.WithLabelValues(string(tsdbIdleClosed))
return TSDBState{
dbs: make(map[string]*userTSDB),
bucket: bucketClient,
tsdbMetrics: newTSDBMetrics(registerer),
forceCompactTrigger: make(chan requestWithUsersAndCallback),
shipTrigger: make(chan requestWithUsersAndCallback),
compactionsTriggered: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Name: "cortex_ingester_tsdb_compactions_triggered_total",
Help: "Total number of triggered compactions.",
}),
compactionsFailed: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Name: "cortex_ingester_tsdb_compactions_failed_total",
Help: "Total number of compactions that failed.",
}),
walReplayTime: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Name: "cortex_ingester_tsdb_wal_replay_duration_seconds",
Help: "The total time it takes to open and replay a TSDB WAL.",
Buckets: prometheus.DefBuckets,
}),
appenderAddDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Name: "cortex_ingester_tsdb_appender_add_duration_seconds",
Help: "The total time it takes for a push request to add samples to the TSDB appender.",
Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
}),
appenderCommitDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Name: "cortex_ingester_tsdb_appender_commit_duration_seconds",
Help: "The total time it takes for a push request to commit samples appended to TSDB.",
Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
}),
idleTsdbChecks: idleTsdbChecks,
}
}
// New returns a new Ingester that uses Cortex block storage instead of chunks storage.
func New(cfg Config, limits *validation.Overrides, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) {
defaultInstanceLimits = &cfg.DefaultLimits
if cfg.ingesterClientFactory == nil {
cfg.ingesterClientFactory = client.MakeIngesterClient
}
bucketClient, err := bucket.NewClient(context.Background(), cfg.BlocksStorageConfig.Bucket, "ingester", logger, registerer)
if err != nil {
return nil, errors.Wrap(err, "failed to create the bucket client")
}
i := &Ingester{
cfg: cfg,
limits: limits,
usersMetadata: map[string]*userMetricsMetadata{},
TSDBState: newTSDBState(bucketClient, registerer),
logger: logger,
ingestionRate: util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval),
}
i.metrics = newIngesterMetrics(registerer, false, cfg.ActiveSeriesMetricsEnabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests)
// Replace specific metrics which we can't track directly and instead need to read
// from the underlying system (i.e. TSDB).
if registerer != nil {
registerer.Unregister(i.metrics.memSeries)
promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
Name: "cortex_ingester_memory_series",
Help: "The current number of series in memory.",
}, i.getMemorySeriesMetric)
promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
Name: "cortex_ingester_oldest_unshipped_block_timestamp_seconds",
Help: "Unix timestamp of the oldest TSDB block not shipped to the storage yet. 0 if ingester has no blocks or all blocks have been shipped.",
}, i.getOldestUnshippedBlockMetric)
}
i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", RingKey, false, cfg.BlocksStorageConfig.TSDB.FlushBlocksOnShutdown, logger, prometheus.WrapRegistererWithPrefix("cortex_", registerer))
if err != nil {
return nil, err
}
i.subservicesWatcher = services.NewFailureWatcher()
i.subservicesWatcher.WatchService(i.lifecycler)
// Init the limiter and instantiate the user states which depend on it.
i.limiter = NewLimiter(
limits,
i.lifecycler,
cfg.DistributorShardingStrategy,
cfg.DistributorShardByAllLabels,
cfg.LifecyclerConfig.RingConfig.ReplicationFactor,
cfg.LifecyclerConfig.RingConfig.ZoneAwarenessEnabled)
i.TSDBState.shipperIngesterID = i.lifecycler.ID
// Apply positive jitter only to ensure that the minimum timeout is adhered to.
i.TSDBState.compactionIdleTimeout = util.DurationWithPositiveJitter(i.cfg.BlocksStorageConfig.TSDB.HeadCompactionIdleTimeout, compactionIdleTimeoutJitter)
level.Info(i.logger).Log("msg", "TSDB idle compaction timeout set", "timeout", i.TSDBState.compactionIdleTimeout)
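// As a rough sketch of the arithmetic (assuming DurationWithPositiveJitter adds up to
// jitter*timeout): with HeadCompactionIdleTimeout = 1h and compactionIdleTimeoutJitter = 0.25,
// the effective timeout is drawn from [1h, 1h15m).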
i.BasicService = services.NewBasicService(i.starting, i.updateLoop, i.stopping)
return i, nil
}
// NewForFlusher constructs a new Ingester to be used by flusher target.
// Compared to the 'New' method:
// - Always replays the WAL.
// - Does not start the lifecycler.
//
// This is a special version of the ingester used by the Flusher. It does not ingest anything; its only purpose is to react
// to the Flush method and flush all opened TSDBs when called.
func NewForFlusher(cfg Config, limits *validation.Overrides, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) {
bucketClient, err := bucket.NewClient(context.Background(), cfg.BlocksStorageConfig.Bucket, "ingester", logger, registerer)
if err != nil {
return nil, errors.Wrap(err, "failed to create the bucket client")
}
i := &Ingester{
cfg: cfg,
limits: limits,
TSDBState: newTSDBState(bucketClient, registerer),
logger: logger,
}
i.metrics = newIngesterMetrics(registerer, false, false, i.getInstanceLimits, nil, &i.inflightPushRequests)
i.TSDBState.shipperIngesterID = "flusher"
// This ingester will not start any subservices (lifecycler, compaction, shipping),
// and will only open TSDBs, wait for Flush to be called, and then close TSDBs again.
i.BasicService = services.NewIdleService(i.startingV2ForFlusher, i.stoppingV2ForFlusher)
return i, nil
}
func (i *Ingester) startingV2ForFlusher(ctx context.Context) error {
if err := i.openExistingTSDB(ctx); err != nil {
// Try to rollback and close opened TSDBs before halting the ingester.
i.closeAllTSDB()
return errors.Wrap(err, "opening existing TSDBs")
}
// Don't start any sub-services (lifecycler, compaction, shipper) at all.
return nil
}
func (i *Ingester) starting(ctx context.Context) error {
// Important: we want to keep lifecycler running until we ask it to stop, so we need to give it independent context
if err := i.lifecycler.StartAsync(context.Background()); err != nil {
return errors.Wrap(err, "failed to start lifecycler")
}
if err := i.lifecycler.AwaitRunning(ctx); err != nil {
return errors.Wrap(err, "failed to start lifecycler")
}
if err := i.openExistingTSDB(ctx); err != nil {
// Try to rollback and close opened TSDBs before halting the ingester.
i.closeAllTSDB()
return errors.Wrap(err, "opening existing TSDBs")
}
i.lifecycler.Join()
// let's start the rest of subservices via manager
servs := []services.Service(nil)
compactionService := services.NewBasicService(nil, i.compactionLoop, nil)
servs = append(servs, compactionService)
if i.cfg.BlocksStorageConfig.TSDB.IsBlocksShippingEnabled() {
shippingService := services.NewBasicService(nil, i.shipBlocksLoop, nil)
servs = append(servs, shippingService)
}
if i.cfg.BlocksStorageConfig.TSDB.CloseIdleTSDBTimeout > 0 {
interval := i.cfg.BlocksStorageConfig.TSDB.CloseIdleTSDBInterval
if interval == 0 {
interval = cortex_tsdb.DefaultCloseIdleTSDBInterval
}
closeIdleService := services.NewTimerService(interval, nil, i.closeAndDeleteIdleUserTSDBs, nil)
servs = append(servs, closeIdleService)
}
var err error
i.TSDBState.subservices, err = services.NewManager(servs...)
if err == nil {
err = services.StartManagerAndAwaitHealthy(ctx, i.TSDBState.subservices)
}
return errors.Wrap(err, "failed to start ingester components")
}
func (i *Ingester) stoppingV2ForFlusher(_ error) error {
if !i.cfg.BlocksStorageConfig.TSDB.KeepUserTSDBOpenOnShutdown {
i.closeAllTSDB()
}
return nil
}
// runs when ingester is stopping
func (i *Ingester) stopping(_ error) error {
// This will prevent us accepting any more samples
i.stopIncomingRequests()
// It's important to wait until shipper is finished,
// because the blocks transfer should start only once it's guaranteed
// there's no shipping on-going.
if err := services.StopManagerAndAwaitStopped(context.Background(), i.TSDBState.subservices); err != nil {
level.Warn(i.logger).Log("msg", "failed to stop ingester subservices", "err", err)
}
// Next initiate our graceful exit from the ring.
if err := services.StopAndAwaitTerminated(context.Background(), i.lifecycler); err != nil {
level.Warn(i.logger).Log("msg", "failed to stop ingester lifecycler", "err", err)
}
if !i.cfg.BlocksStorageConfig.TSDB.KeepUserTSDBOpenOnShutdown {
i.closeAllTSDB()
}
return nil
}
func (i *Ingester) updateLoop(ctx context.Context) error {
if limits := i.getInstanceLimits(); limits != nil && *limits != (InstanceLimits{}) {
// This check will not cover enabling instance limits in runtime, but it will do for now.
logutil.WarnExperimentalUse("ingester instance limits")
}
rateUpdateTicker := time.NewTicker(i.cfg.RateUpdatePeriod)
defer rateUpdateTicker.Stop()
ingestionRateTicker := time.NewTicker(instanceIngestionRateTickInterval)
defer ingestionRateTicker.Stop()
var activeSeriesTickerChan <-chan time.Time
if i.cfg.ActiveSeriesMetricsEnabled {
t := time.NewTicker(i.cfg.ActiveSeriesMetricsUpdatePeriod)
activeSeriesTickerChan = t.C
defer t.Stop()
}
// Similarly to the above, this is a hardcoded value.
metadataPurgeTicker := time.NewTicker(metadataPurgePeriod)
defer metadataPurgeTicker.Stop()
for {
select {
case <-metadataPurgeTicker.C:
i.purgeUserMetricsMetadata()
case <-ingestionRateTicker.C:
i.ingestionRate.Tick()
case <-rateUpdateTicker.C:
i.stoppedMtx.RLock()
for _, db := range i.TSDBState.dbs {
db.ingestedAPISamples.Tick()
db.ingestedRuleSamples.Tick()
}
i.stoppedMtx.RUnlock()
case <-activeSeriesTickerChan:
i.updateActiveSeries()
case <-ctx.Done():
return nil
case err := <-i.subservicesWatcher.Chan():
return errors.Wrap(err, "ingester subservice failed")
}
}
}
func (i *Ingester) updateActiveSeries() {
purgeTime := time.Now().Add(-i.cfg.ActiveSeriesMetricsIdleTimeout)
for _, userID := range i.getTSDBUsers() {
userDB := i.getTSDB(userID)
if userDB == nil {
continue
}
userDB.activeSeries.Purge(purgeTime)
i.metrics.activeSeriesPerUser.WithLabelValues(userID).Set(float64(userDB.activeSeries.Active()))
}
}
// ShutdownHandler triggers the following set of operations in order:
// - Change the state of ring to stop accepting writes.
// - Flush all the chunks.
func (i *Ingester) ShutdownHandler(w http.ResponseWriter, _ *http.Request) {
originalFlush := i.lifecycler.FlushOnShutdown()
// We want to flush the chunks if transfer fails irrespective of original flag.
i.lifecycler.SetFlushOnShutdown(true)
// In the case of an HTTP shutdown, we want to unregister no matter what.
originalUnregister := i.lifecycler.ShouldUnregisterOnShutdown()
i.lifecycler.SetUnregisterOnShutdown(true)
_ = services.StopAndAwaitTerminated(context.Background(), i)
// Set state back to original.
i.lifecycler.SetFlushOnShutdown(originalFlush)
i.lifecycler.SetUnregisterOnShutdown(originalUnregister)
w.WriteHeader(http.StatusNoContent)
}
// check that ingester has finished starting, i.e. it is in Running or Stopping state.
// Why Stopping? Because ingester still runs, even when it is transferring data out in Stopping state.
// Ingester handles this state on its own (via `stopped` flag).
func (i *Ingester) checkRunningOrStopping() error {
s := i.State()
if s == services.Running || s == services.Stopping {
return nil
}
return status.Error(codes.Unavailable, s.String())
}
// Using block store, the ingester is only available when it is in a Running state. The ingester is not available
// when stopping to prevent any read or writes to the TSDB after the ingester has closed them.
func (i *Ingester) checkRunning() error {
s := i.State()
if s == services.Running {
return nil
}
return status.Error(codes.Unavailable, s.String())
}
// GetRef() is an extra method added to the TSDB appender to let Cortex look up a series reference before calling Append().
type extendedAppender interface {
storage.Appender
storage.GetRef
}
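// A minimal sketch of how Push below uses this pair: GetRef(labels, hash) returns a non-zero
// reference (plus the appender's own copy of the labels) when the series already exists, so the
// hot path can call Append(ref, ...) without re-copying the label set; only on a cache miss does
// it copy the labels and call Append(0, ...) to create the series and obtain a fresh reference.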
// Push adds metrics to a block
func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*cortexpb.WriteResponse, error) {
if err := i.checkRunning(); err != nil {
return nil, err
}
// We will report *this* request in the error too.
inflight := i.inflightPushRequests.Inc()
defer i.inflightPushRequests.Dec()
gl := i.getInstanceLimits()
if gl != nil && gl.MaxInflightPushRequests > 0 {
if inflight > gl.MaxInflightPushRequests {
return nil, errTooManyInflightPushRequests
}
}
var firstPartialErr error
// NOTE: because we use `unsafe` in deserialisation, we must not
// retain anything from `req` past the call to ReuseSlice
defer cortexpb.ReuseSlice(req.Timeseries)
userID, err := tenant.TenantID(ctx)
if err != nil {
return nil, err
}
il := i.getInstanceLimits()
if il != nil && il.MaxIngestionRate > 0 {
if rate := i.ingestionRate.Rate(); rate >= il.MaxIngestionRate {
return nil, errMaxSamplesPushRateLimitReached
}
}
db, err := i.getOrCreateTSDB(userID, false)
if err != nil {
return nil, wrapWithUser(err, userID)
}
// Ensure the ingester shutdown procedure hasn't started
i.stoppedMtx.RLock()
if i.stopped {
i.stoppedMtx.RUnlock()
return nil, errIngesterStopping
}
i.stoppedMtx.RUnlock()
if err := db.acquireAppendLock(); err != nil {
return &cortexpb.WriteResponse{}, httpgrpc.Errorf(http.StatusServiceUnavailable, wrapWithUser(err, userID).Error())
}
defer db.releaseAppendLock()
// Metadata is a best-effort approach and we don't halt on errors, so
// process it before samples. Otherwise, we risk returning an error before ingestion.
ingestedMetadata := i.pushMetadata(ctx, userID, req.GetMetadata())
// Keep track of some stats which are tracked only if the samples will be
// successfully committed
var (
succeededSamplesCount = 0
failedSamplesCount = 0
succeededExemplarsCount = 0
failedExemplarsCount = 0
startAppend = time.Now()
sampleOutOfBoundsCount = 0
sampleOutOfOrderCount = 0
newValueForTimestampCount = 0
perUserSeriesLimitCount = 0
perMetricSeriesLimitCount = 0
updateFirstPartial = func(errFn func() error) {
if firstPartialErr == nil {
firstPartialErr = errFn()
}
}
)
// Walk the samples, appending them to the users database
app := db.Appender(ctx).(extendedAppender)
for _, ts := range req.Timeseries {
// The labels must be sorted (in our case, it's guaranteed a write request
// has sorted labels by the time it hits the ingester).
// Look up a reference for this series.
tsLabels := cortexpb.FromLabelAdaptersToLabels(ts.Labels)
tsLabelsHash := tsLabels.Hash()
ref, copiedLabels := app.GetRef(tsLabels, tsLabelsHash)
// To find out if any sample was added to this series, we keep the old value.
oldSucceededSamplesCount := succeededSamplesCount
for _, s := range ts.Samples {
var err error
// If the cached reference exists, we try to use it.
if ref != 0 {
if _, err = app.Append(ref, copiedLabels, s.TimestampMs, s.Value); err == nil {
succeededSamplesCount++
continue
}
} else {
// Copy the label set because both TSDB and the active series tracker may retain it.
copiedLabels = cortexpb.FromLabelAdaptersToLabelsWithCopy(ts.Labels)
// Retain the reference in case there are multiple samples for the series.
if ref, err = app.Append(0, copiedLabels, s.TimestampMs, s.Value); err == nil {
succeededSamplesCount++
continue
}
}
failedSamplesCount++
// Check if the error is a soft error we can proceed on. If so, we keep track