forked from prometheus/prometheus
-
Notifications
You must be signed in to change notification settings - Fork 0
/
wal.go
986 lines (866 loc) · 26 KB
/
wal.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"bufio"
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"os"
"path/filepath"
"sort"
"strconv"
"sync"
"time"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/golang/snappy"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/prometheus/tsdb/fileutil"
)
const (
DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB
pageSize = 32 * 1024 // 32KB
recordHeaderSize = 7
WblDirName = "wbl"
)
// The table gets initialized with sync.Once but may still cause a race
// with any other use of the crc32 package anywhere. Thus we initialize it
// before.
var castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
// page is an in memory buffer used to batch disk writes.
// Records bigger than the page size are split and flushed separately.
// A flush is triggered when a single records doesn't fit the page size or
// when the next record can't fit in the remaining free page space.
type page struct {
alloc int
flushed int
buf [pageSize]byte
}
func (p *page) remaining() int {
return pageSize - p.alloc
}
func (p *page) full() bool {
return pageSize-p.alloc < recordHeaderSize
}
func (p *page) reset() {
for i := range p.buf {
p.buf[i] = 0
}
p.alloc = 0
p.flushed = 0
}
// SegmentFile represents the underlying file used to store a segment.
type SegmentFile interface {
Stat() (os.FileInfo, error)
Sync() error
io.Writer
io.Reader
io.Closer
}
// Segment represents a segment file.
type Segment struct {
SegmentFile
dir string
i int
}
// Index returns the index of the segment.
func (s *Segment) Index() int {
return s.i
}
// Dir returns the directory of the segment.
func (s *Segment) Dir() string {
return s.dir
}
// CorruptionErr is an error that's returned when corruption is encountered.
type CorruptionErr struct {
Dir string
Segment int
Offset int64
Err error
}
func (e *CorruptionErr) Error() string {
if e.Segment < 0 {
return fmt.Sprintf("corruption after %d bytes: %s", e.Offset, e.Err)
}
return fmt.Sprintf("corruption in segment %s at %d: %s", SegmentName(e.Dir, e.Segment), e.Offset, e.Err)
}
// OpenWriteSegment opens segment k in dir. The returned segment is ready for new appends.
func OpenWriteSegment(logger log.Logger, dir string, k int) (*Segment, error) {
segName := SegmentName(dir, k)
f, err := os.OpenFile(segName, os.O_WRONLY|os.O_APPEND, 0o666)
if err != nil {
return nil, err
}
stat, err := f.Stat()
if err != nil {
f.Close()
return nil, err
}
// If the last page is torn, fill it with zeros.
// In case it was torn after all records were written successfully, this
// will just pad the page and everything will be fine.
// If it was torn mid-record, a full read (which the caller should do anyway
// to ensure integrity) will detect it as a corruption by the end.
if d := stat.Size() % pageSize; d != 0 {
level.Warn(logger).Log("msg", "Last page of the wal is torn, filling it with zeros", "segment", segName)
if _, err := f.Write(make([]byte, pageSize-d)); err != nil {
f.Close()
return nil, errors.Wrap(err, "zero-pad torn page")
}
}
return &Segment{SegmentFile: f, i: k, dir: dir}, nil
}
// CreateSegment creates a new segment k in dir.
func CreateSegment(dir string, k int) (*Segment, error) {
f, err := os.OpenFile(SegmentName(dir, k), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0o666)
if err != nil {
return nil, err
}
return &Segment{SegmentFile: f, i: k, dir: dir}, nil
}
// OpenReadSegment opens the segment with the given filename.
func OpenReadSegment(fn string) (*Segment, error) {
k, err := strconv.Atoi(filepath.Base(fn))
if err != nil {
return nil, errors.New("not a valid filename")
}
f, err := os.Open(fn)
if err != nil {
return nil, err
}
return &Segment{SegmentFile: f, i: k, dir: filepath.Dir(fn)}, nil
}
// WAL is a write ahead log that stores records in segment files.
// It must be read from start to end once before logging new data.
// If an error occurs during read, the repair procedure must be called
// before it's safe to do further writes.
//
// Segments are written to in pages of 32KB, with records possibly split
// across page boundaries.
// Records are never split across segments to allow full segments to be
// safely truncated. It also ensures that torn writes never corrupt records
// beyond the most recent segment.
type WAL struct {
dir string
logger log.Logger
segmentSize int
mtx sync.RWMutex
segment *Segment // Active segment.
donePages int // Pages written to the segment.
page *page // Active page.
stopc chan chan struct{}
actorc chan func()
closed bool // To allow calling Close() more than once without blocking.
compress bool
snappyBuf []byte
metrics *walMetrics
}
type walMetrics struct {
fsyncDuration prometheus.Summary
pageFlushes prometheus.Counter
pageCompletions prometheus.Counter
truncateFail prometheus.Counter
truncateTotal prometheus.Counter
currentSegment prometheus.Gauge
writesFailed prometheus.Counter
}
func newWALMetrics(r prometheus.Registerer) *walMetrics {
m := &walMetrics{}
m.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{
Name: "fsync_duration_seconds",
Help: "Duration of WAL fsync.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
})
m.pageFlushes = prometheus.NewCounter(prometheus.CounterOpts{
Name: "page_flushes_total",
Help: "Total number of page flushes.",
})
m.pageCompletions = prometheus.NewCounter(prometheus.CounterOpts{
Name: "completed_pages_total",
Help: "Total number of completed pages.",
})
m.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{
Name: "truncations_failed_total",
Help: "Total number of WAL truncations that failed.",
})
m.truncateTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "truncations_total",
Help: "Total number of WAL truncations attempted.",
})
m.currentSegment = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "segment_current",
Help: "WAL segment index that TSDB is currently writing to.",
})
m.writesFailed = prometheus.NewCounter(prometheus.CounterOpts{
Name: "writes_failed_total",
Help: "Total number of WAL writes that failed.",
})
if r != nil {
r.MustRegister(
m.fsyncDuration,
m.pageFlushes,
m.pageCompletions,
m.truncateFail,
m.truncateTotal,
m.currentSegment,
m.writesFailed,
)
}
return m
}
// New returns a new WAL over the given directory.
func New(logger log.Logger, reg prometheus.Registerer, dir string, compress bool) (*WAL, error) {
return NewSize(logger, reg, dir, DefaultSegmentSize, compress)
}
// NewSize returns a new WAL over the given directory.
// New segments are created with the specified size.
func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSize int, compress bool) (*WAL, error) {
if segmentSize%pageSize != 0 {
return nil, errors.New("invalid segment size")
}
if err := os.MkdirAll(dir, 0o777); err != nil {
return nil, errors.Wrap(err, "create dir")
}
if logger == nil {
logger = log.NewNopLogger()
}
w := &WAL{
dir: dir,
logger: logger,
segmentSize: segmentSize,
page: &page{},
actorc: make(chan func(), 100),
stopc: make(chan chan struct{}),
compress: compress,
}
prefix := "prometheus_tsdb_wal_"
if filepath.Base(dir) == WblDirName {
prefix = "prometheus_tsdb_out_of_order_wal_"
}
w.metrics = newWALMetrics(prometheus.WrapRegistererWithPrefix(prefix, reg))
_, last, err := Segments(w.Dir())
if err != nil {
return nil, errors.Wrap(err, "get segment range")
}
// Index of the Segment we want to open and write to.
writeSegmentIndex := 0
// If some segments already exist create one with a higher index than the last segment.
if last != -1 {
writeSegmentIndex = last + 1
}
segment, err := CreateSegment(w.Dir(), writeSegmentIndex)
if err != nil {
return nil, err
}
if err := w.setSegment(segment); err != nil {
return nil, err
}
go w.run()
return w, nil
}
// Open an existing WAL.
func Open(logger log.Logger, dir string) (*WAL, error) {
if logger == nil {
logger = log.NewNopLogger()
}
w := &WAL{
dir: dir,
logger: logger,
}
return w, nil
}
// CompressionEnabled returns if compression is enabled on this WAL.
func (w *WAL) CompressionEnabled() bool {
return w.compress
}
// Dir returns the directory of the WAL.
func (w *WAL) Dir() string {
return w.dir
}
func (w *WAL) run() {
Loop:
for {
select {
case f := <-w.actorc:
f()
case donec := <-w.stopc:
close(w.actorc)
defer close(donec)
break Loop
}
}
// Drain and process any remaining functions.
for f := range w.actorc {
f()
}
}
// Repair attempts to repair the WAL based on the error.
// It discards all data after the corruption.
func (w *WAL) Repair(origErr error) error {
// We could probably have a mode that only discards torn records right around
// the corruption to preserve as data much as possible.
// But that's not generally applicable if the records have any kind of causality.
// Maybe as an extra mode in the future if mid-WAL corruptions become
// a frequent concern.
err := errors.Cause(origErr) // So that we can pick up errors even if wrapped.
cerr, ok := err.(*CorruptionErr)
if !ok {
return errors.Wrap(origErr, "cannot handle error")
}
if cerr.Segment < 0 {
return errors.New("corruption error does not specify position")
}
level.Warn(w.logger).Log("msg", "Starting corruption repair",
"segment", cerr.Segment, "offset", cerr.Offset)
// All segments behind the corruption can no longer be used.
segs, err := listSegments(w.Dir())
if err != nil {
return errors.Wrap(err, "list segments")
}
level.Warn(w.logger).Log("msg", "Deleting all segments newer than corrupted segment", "segment", cerr.Segment)
for _, s := range segs {
if w.segment.i == s.index {
// The active segment needs to be removed,
// close it first (Windows!). Can be closed safely
// as we set the current segment to repaired file
// below.
if err := w.segment.Close(); err != nil {
return errors.Wrap(err, "close active segment")
}
}
if s.index <= cerr.Segment {
continue
}
if err := os.Remove(filepath.Join(w.Dir(), s.name)); err != nil {
return errors.Wrapf(err, "delete segment:%v", s.index)
}
}
// Regardless of the corruption offset, no record reaches into the previous segment.
// So we can safely repair the WAL by removing the segment and re-inserting all
// its records up to the corruption.
level.Warn(w.logger).Log("msg", "Rewrite corrupted segment", "segment", cerr.Segment)
fn := SegmentName(w.Dir(), cerr.Segment)
tmpfn := fn + ".repair"
if err := fileutil.Rename(fn, tmpfn); err != nil {
return err
}
// Create a clean segment and make it the active one.
s, err := CreateSegment(w.Dir(), cerr.Segment)
if err != nil {
return err
}
if err := w.setSegment(s); err != nil {
return err
}
f, err := os.Open(tmpfn)
if err != nil {
return errors.Wrap(err, "open segment")
}
defer f.Close()
r := NewReader(bufio.NewReader(f))
for r.Next() {
// Add records only up to the where the error was.
if r.Offset() >= cerr.Offset {
break
}
if err := w.Log(r.Record()); err != nil {
return errors.Wrap(err, "insert record")
}
}
// We expect an error here from r.Err(), so nothing to handle.
// We need to pad to the end of the last page in the repaired segment
if err := w.flushPage(true); err != nil {
return errors.Wrap(err, "flush page in repair")
}
// We explicitly close even when there is a defer for Windows to be
// able to delete it. The defer is in place to close it in-case there
// are errors above.
if err := f.Close(); err != nil {
return errors.Wrap(err, "close corrupted file")
}
if err := os.Remove(tmpfn); err != nil {
return errors.Wrap(err, "delete corrupted segment")
}
// Explicitly close the segment we just repaired to avoid issues with Windows.
s.Close()
// We always want to start writing to a new Segment rather than an existing
// Segment, which is handled by NewSize, but earlier in Repair we're deleting
// all segments that come after the corrupted Segment. Recreate a new Segment here.
s, err = CreateSegment(w.Dir(), cerr.Segment+1)
if err != nil {
return err
}
return w.setSegment(s)
}
// SegmentName builds a segment name for the directory.
func SegmentName(dir string, i int) string {
return filepath.Join(dir, fmt.Sprintf("%08d", i))
}
// NextSegment creates the next segment and closes the previous one asynchronously.
// It returns the file number of the new file.
func (w *WAL) NextSegment() (int, error) {
w.mtx.Lock()
defer w.mtx.Unlock()
return w.nextSegment(true)
}
// NextSegmentSync creates the next segment and closes the previous one in sync.
// It returns the file number of the new file.
func (w *WAL) NextSegmentSync() (int, error) {
w.mtx.Lock()
defer w.mtx.Unlock()
return w.nextSegment(false)
}
// nextSegment creates the next segment and closes the previous one.
// It returns the file number of the new file.
func (w *WAL) nextSegment(async bool) (int, error) {
if w.closed {
return 0, errors.New("wal is closed")
}
// Only flush the current page if it actually holds data.
if w.page.alloc > 0 {
if err := w.flushPage(true); err != nil {
return 0, err
}
}
next, err := CreateSegment(w.Dir(), w.segment.Index()+1)
if err != nil {
return 0, errors.Wrap(err, "create new segment file")
}
prev := w.segment
if err := w.setSegment(next); err != nil {
return 0, err
}
// Don't block further writes by fsyncing the last segment.
f := func() {
if err := w.fsync(prev); err != nil {
level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
}
if err := prev.Close(); err != nil {
level.Error(w.logger).Log("msg", "close previous segment", "err", err)
}
}
if async {
w.actorc <- f
} else {
f()
}
return next.Index(), nil
}
func (w *WAL) setSegment(segment *Segment) error {
w.segment = segment
// Correctly initialize donePages.
stat, err := segment.Stat()
if err != nil {
return err
}
w.donePages = int(stat.Size() / pageSize)
w.metrics.currentSegment.Set(float64(segment.Index()))
return nil
}
// flushPage writes the new contents of the page to disk. If no more records will fit into
// the page, the remaining bytes will be set to zero and a new page will be started.
// If clear is true, this is enforced regardless of how many bytes are left in the page.
func (w *WAL) flushPage(clear bool) error {
w.metrics.pageFlushes.Inc()
p := w.page
clear = clear || p.full()
// No more data will fit into the page or an implicit clear.
// Enqueue and clear it.
if clear {
p.alloc = pageSize // Write till end of page.
}
n, err := w.segment.Write(p.buf[p.flushed:p.alloc])
if err != nil {
p.flushed += n
return err
}
p.flushed += n
// We flushed an entire page, prepare a new one.
if clear {
p.reset()
w.donePages++
w.metrics.pageCompletions.Inc()
}
return nil
}
// First Byte of header format:
// [ 4 bits unallocated] [1 bit snappy compression flag] [ 3 bit record type ]
const (
snappyMask = 1 << 3
recTypeMask = snappyMask - 1
)
type recType uint8
const (
recPageTerm recType = 0 // Rest of page is empty.
recFull recType = 1 // Full record.
recFirst recType = 2 // First fragment of a record.
recMiddle recType = 3 // Middle fragments of a record.
recLast recType = 4 // Final fragment of a record.
)
func recTypeFromHeader(header byte) recType {
return recType(header & recTypeMask)
}
func (t recType) String() string {
switch t {
case recPageTerm:
return "zero"
case recFull:
return "full"
case recFirst:
return "first"
case recMiddle:
return "middle"
case recLast:
return "last"
default:
return "<invalid>"
}
}
func (w *WAL) pagesPerSegment() int {
return w.segmentSize / pageSize
}
// Log writes the records into the log.
// Multiple records can be passed at once to reduce writes and increase throughput.
func (w *WAL) Log(recs ...[]byte) error {
w.mtx.Lock()
defer w.mtx.Unlock()
// Callers could just implement their own list record format but adding
// a bit of extra logic here frees them from that overhead.
for i, r := range recs {
if err := w.log(r, i == len(recs)-1); err != nil {
w.metrics.writesFailed.Inc()
return err
}
}
return nil
}
// log writes rec to the log and forces a flush of the current page if:
// - the final record of a batch
// - the record is bigger than the page size
// - the current page is full.
func (w *WAL) log(rec []byte, final bool) error {
// When the last page flush failed the page will remain full.
// When the page is full, need to flush it before trying to add more records to it.
if w.page.full() {
if err := w.flushPage(true); err != nil {
return err
}
}
// Compress the record before calculating if a new segment is needed.
compressed := false
if w.compress &&
len(rec) > 0 &&
// If MaxEncodedLen is less than 0 the record is too large to be compressed.
snappy.MaxEncodedLen(len(rec)) >= 0 {
// The snappy library uses `len` to calculate if we need a new buffer.
// In order to allocate as few buffers as possible make the length
// equal to the capacity.
w.snappyBuf = w.snappyBuf[:cap(w.snappyBuf)]
w.snappyBuf = snappy.Encode(w.snappyBuf, rec)
if len(w.snappyBuf) < len(rec) {
rec = w.snappyBuf
compressed = true
}
}
// If the record is too big to fit within the active page in the current
// segment, terminate the active segment and advance to the next one.
// This ensures that records do not cross segment boundaries.
left := w.page.remaining() - recordHeaderSize // Free space in the active page.
left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages in the active segment.
if len(rec) > left {
if _, err := w.nextSegment(true); err != nil {
return err
}
}
// Populate as many pages as necessary to fit the record.
// Be careful to always do one pass to ensure we write zero-length records.
for i := 0; i == 0 || len(rec) > 0; i++ {
p := w.page
// Find how much of the record we can fit into the page.
var (
l = min(len(rec), (pageSize-p.alloc)-recordHeaderSize)
part = rec[:l]
buf = p.buf[p.alloc:]
typ recType
)
switch {
case i == 0 && len(part) == len(rec):
typ = recFull
case len(part) == len(rec):
typ = recLast
case i == 0:
typ = recFirst
default:
typ = recMiddle
}
if compressed {
typ |= snappyMask
}
buf[0] = byte(typ)
crc := crc32.Checksum(part, castagnoliTable)
binary.BigEndian.PutUint16(buf[1:], uint16(len(part)))
binary.BigEndian.PutUint32(buf[3:], crc)
copy(buf[recordHeaderSize:], part)
p.alloc += len(part) + recordHeaderSize
if w.page.full() {
if err := w.flushPage(true); err != nil {
// TODO When the flushing fails at this point and the record has not been
// fully written to the buffer, we end up with a corrupted WAL because some part of the
// record have been written to the buffer, while the rest of the record will be discarded.
return err
}
}
rec = rec[l:]
}
// If it's the final record of the batch and the page is not empty, flush it.
if final && w.page.alloc > 0 {
if err := w.flushPage(false); err != nil {
return err
}
}
return nil
}
// LastSegmentAndOffset returns the last segment number of the WAL
// and the offset in that file upto which the segment has been filled.
func (w *WAL) LastSegmentAndOffset() (seg, offset int, err error) {
w.mtx.Lock()
defer w.mtx.Unlock()
_, seg, err = Segments(w.Dir())
if err != nil {
return
}
offset = (w.donePages * pageSize) + w.page.alloc
return
}
// Truncate drops all segments before i.
func (w *WAL) Truncate(i int) (err error) {
w.metrics.truncateTotal.Inc()
defer func() {
if err != nil {
w.metrics.truncateFail.Inc()
}
}()
refs, err := listSegments(w.Dir())
if err != nil {
return err
}
for _, r := range refs {
if r.index >= i {
break
}
if err = os.Remove(filepath.Join(w.Dir(), r.name)); err != nil {
return err
}
}
return nil
}
func (w *WAL) fsync(f *Segment) error {
start := time.Now()
err := f.Sync()
w.metrics.fsyncDuration.Observe(time.Since(start).Seconds())
return err
}
// Sync forces a file sync on the current wal segment. This function is meant
// to be used only on tests due to different behaviour on Operating Systems
// like windows and linux
func (w *WAL) Sync() error {
return w.fsync(w.segment)
}
// Close flushes all writes and closes active segment.
func (w *WAL) Close() (err error) {
w.mtx.Lock()
defer w.mtx.Unlock()
if w.closed {
return errors.New("wal already closed")
}
if w.segment == nil {
w.closed = true
return nil
}
// Flush the last page and zero out all its remaining size.
// We must not flush an empty page as it would falsely signal
// the segment is done if we start writing to it again after opening.
if w.page.alloc > 0 {
if err := w.flushPage(true); err != nil {
return err
}
}
donec := make(chan struct{})
w.stopc <- donec
<-donec
if err = w.fsync(w.segment); err != nil {
level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
}
if err := w.segment.Close(); err != nil {
level.Error(w.logger).Log("msg", "close previous segment", "err", err)
}
w.closed = true
return nil
}
// Segments returns the range [first, n] of currently existing segments.
// If no segments are found, first and n are -1.
func Segments(walDir string) (first, last int, err error) {
refs, err := listSegments(walDir)
if err != nil {
return 0, 0, err
}
if len(refs) == 0 {
return -1, -1, nil
}
return refs[0].index, refs[len(refs)-1].index, nil
}
type segmentRef struct {
name string
index int
}
func listSegments(dir string) (refs []segmentRef, err error) {
files, err := os.ReadDir(dir)
if err != nil {
return nil, err
}
for _, f := range files {
fn := f.Name()
k, err := strconv.Atoi(fn)
if err != nil {
continue
}
refs = append(refs, segmentRef{name: fn, index: k})
}
sort.Slice(refs, func(i, j int) bool {
return refs[i].index < refs[j].index
})
for i := 0; i < len(refs)-1; i++ {
if refs[i].index+1 != refs[i+1].index {
return nil, errors.New("segments are not sequential")
}
}
return refs, nil
}
// SegmentRange groups segments by the directory and the first and last index it includes.
type SegmentRange struct {
Dir string
First, Last int
}
// NewSegmentsReader returns a new reader over all segments in the directory.
func NewSegmentsReader(dir string) (io.ReadCloser, error) {
return NewSegmentsRangeReader(SegmentRange{dir, -1, -1})
}
// NewSegmentsRangeReader returns a new reader over the given WAL segment ranges.
// If first or last are -1, the range is open on the respective end.
func NewSegmentsRangeReader(sr ...SegmentRange) (io.ReadCloser, error) {
var segs []*Segment
for _, sgmRange := range sr {
refs, err := listSegments(sgmRange.Dir)
if err != nil {
return nil, errors.Wrapf(err, "list segment in dir:%v", sgmRange.Dir)
}
for _, r := range refs {
if sgmRange.First >= 0 && r.index < sgmRange.First {
continue
}
if sgmRange.Last >= 0 && r.index > sgmRange.Last {
break
}
s, err := OpenReadSegment(filepath.Join(sgmRange.Dir, r.name))
if err != nil {
return nil, errors.Wrapf(err, "open segment:%v in dir:%v", r.name, sgmRange.Dir)
}
segs = append(segs, s)
}
}
return NewSegmentBufReader(segs...), nil
}
// segmentBufReader is a buffered reader that reads in multiples of pages.
// The main purpose is that we are able to track segment and offset for
// corruption reporting. We have to be careful not to increment curr too
// early, as it is used by Reader.Err() to tell Repair which segment is corrupt.
// As such we pad the end of non-page align segments with zeros.
type segmentBufReader struct {
buf *bufio.Reader
segs []*Segment
cur int // Index into segs.
off int // Offset of read data into current segment.
}
// nolint:revive // TODO: Consider exporting segmentBufReader
func NewSegmentBufReader(segs ...*Segment) *segmentBufReader {
if len(segs) == 0 {
return &segmentBufReader{}
}
return &segmentBufReader{
buf: bufio.NewReaderSize(segs[0], 16*pageSize),
segs: segs,
}
}
// nolint:revive
func NewSegmentBufReaderWithOffset(offset int, segs ...*Segment) (sbr *segmentBufReader, err error) {
if offset == 0 || len(segs) == 0 {
return NewSegmentBufReader(segs...), nil
}
sbr = &segmentBufReader{
buf: bufio.NewReaderSize(segs[0], 16*pageSize),
segs: segs,
}
if offset > 0 {
_, err = sbr.buf.Discard(offset)
}
return sbr, err
}
func (r *segmentBufReader) Close() (err error) {
for _, s := range r.segs {
if e := s.Close(); e != nil {
err = e
}
}
return err
}
// Read implements io.Reader.
func (r *segmentBufReader) Read(b []byte) (n int, err error) {
if len(r.segs) == 0 {
return 0, io.EOF
}
n, err = r.buf.Read(b)
r.off += n
// If we succeeded, or hit a non-EOF, we can stop.
if err == nil || err != io.EOF {
return n, err
}
// We hit EOF; fake out zero padding at the end of short segments, so we
// don't increment curr too early and report the wrong segment as corrupt.
if r.off%pageSize != 0 {
i := 0
for ; n+i < len(b) && (r.off+i)%pageSize != 0; i++ {
b[n+i] = 0
}
// Return early, even if we didn't fill b.
r.off += i
return n + i, nil
}
// There is no more deta left in the curr segment and there are no more
// segments left. Return EOF.
if r.cur+1 >= len(r.segs) {
return n, io.EOF
}
// Move to next segment.
r.cur++
r.off = 0
r.buf.Reset(r.segs[r.cur])
return n, nil
}
// Computing size of the WAL.
// We do this by adding the sizes of all the files under the WAL dir.
func (w *WAL) Size() (int64, error) {
return fileutil.DirSize(w.Dir())
}