-
Notifications
You must be signed in to change notification settings - Fork 466
/
version.go
1678 lines (1536 loc) · 59.7 KB
/
version.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package manifest
import (
"bytes"
stdcmp "cmp"
"fmt"
"slices"
"sort"
"strings"
"sync"
"sync/atomic"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/invariants"
"github.com/cockroachdb/pebble/sstable"
)
// Compare exports the base.Compare type, a three-way user-key comparison
// function, so that users of this package need not import base directly.
type Compare = base.Compare
// InternalKey exports the base.InternalKey type (user key plus trailer) so
// that users of this package need not import base directly.
type InternalKey = base.InternalKey
// TableInfo contains the common information for table related events.
type TableInfo struct {
	// FileNum is the internal DB identifier for the table.
	FileNum base.FileNum
	// Size is the size of the file in bytes.
	Size uint64
	// Smallest is the smallest internal key in the table.
	Smallest InternalKey
	// Largest is the largest internal key in the table.
	Largest InternalKey
	// SmallestSeqNum is the smallest sequence number in the table.
	SmallestSeqNum uint64
	// LargestSeqNum is the largest sequence number in the table.
	LargestSeqNum uint64
}
// TableStats contains statistics on a table used for compaction heuristics,
// and export via Metrics.
type TableStats struct {
	// NumEntries is the total number of entries in the table.
	NumEntries uint64
	// NumDeletions is the number of point and range deletion entries in the
	// table.
	NumDeletions uint64
	// NumRangeKeySets is the total number of range key sets in the table.
	//
	// NB: If there's a chance that the sstable contains any range key sets,
	// then NumRangeKeySets must be > 0.
	NumRangeKeySets uint64
	// PointDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's point deletions by compacting them.
	PointDeletionsBytesEstimate uint64
	// RangeDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's range deletions by compacting them. This
	// estimate is at data-block granularity and is not updated if compactions
	// beneath the table reduce the amount of reclaimable disk space. It also
	// does not account for overlapping data in L0 and ignores L0 sublevels,
	// but the error that introduces is expected to be small.
	//
	// Tables in the bottommost level of the LSM may have a nonzero estimate if
	// snapshots or move compactions prevented the elision of their range
	// tombstones. A table in the bottommost level that was ingested into L6
	// will have a zero estimate, because the file's sequence numbers indicate
	// that the tombstone cannot drop any data contained within the file itself.
	RangeDeletionsBytesEstimate uint64
	// ValueBlocksSize is the total size of value blocks and the value index
	// block.
	ValueBlocksSize uint64
}
// boundType represents the type of key (point or range) present as the smallest
// and largest keys.
type boundType uint8

const (
	// iota + 1 so that the zero value is distinguishable as "unset".
	boundTypePointKey boundType = iota + 1
	boundTypeRangeKey
)
// CompactionState is the compaction state of a file.
//
// The following shows the valid state transitions:
//
//	NotCompacting --> Compacting --> Compacted
//	      ^                |
//	      |                |
//	      +-------<--------+
//
// Input files to a compaction transition to Compacting when a compaction is
// picked. A file that has finished compacting typically transitions into the
// Compacted state, at which point it is effectively obsolete ("zombied") and
// will eventually be removed from the LSM. A file that has been move-compacted
// will transition from Compacting back into the NotCompacting state, signaling
// that the file may be selected for a subsequent compaction. A failed
// compaction will result in all input tables transitioning from Compacting to
// NotCompacting.
//
// This state is in-memory only. It is not persisted to the manifest.
type CompactionState uint8

// CompactionStates.
const (
	CompactionStateNotCompacting CompactionState = iota
	CompactionStateCompacting
	CompactionStateCompacted
)

// compactionStateNames maps each valid CompactionState to its display name.
// Indexed by the state's numeric value.
var compactionStateNames = [...]string{
	CompactionStateNotCompacting: "NotCompacting",
	CompactionStateCompacting:    "Compacting",
	CompactionStateCompacted:     "Compacted",
}

// String implements fmt.Stringer. It panics on a value outside the declared
// CompactionStates, matching the behavior expected of an invariant violation.
func (s CompactionState) String() string {
	if int(s) < len(compactionStateNames) {
		return compactionStateNames[s]
	}
	panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
}
// FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
// some version. FileMetadata does not contain the actual level of the sst,
// since such leveled-ssts can move across levels in different versions, while
// sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
// and virtual. Underlying both leveled-ssts is a backing-sst, for which the
// only state is FileBacking. A backing-sst is level-less. It is possible for a
// backing-sst to be referred to by a physical sst in one version and by one or
// more virtual ssts in one or more versions. A backing-sst becomes obsolete
// and can be deleted once it is no longer required by any physical or virtual
// sst in any version.
//
// We maintain some invariants:
//
//  1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
//     and there will be exactly one FileMetadata associated with the FileNum.
//
//  2. Within a version, a backing-sst is either only referred to by one
//     physical sst or one or more virtual ssts.
//
//  3. Once a backing-sst is referred to by a virtual sst in the latest version,
//     it cannot go back to being referred to by a physical sst in any future
//     version.
//
// Once a physical sst is no longer needed by any version, we will no longer
// maintain the file metadata associated with it. We will still maintain the
// FileBacking associated with the physical sst if the backing sst is required
// by any virtual ssts in any version.
type FileMetadata struct {
	// AllowedSeeks is used to determine if a file should be picked for
	// a read triggered compaction. It is decremented when read sampling
	// in pebble.Iterator after every positioning operation
	// that returns a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
	AllowedSeeks atomic.Int64
	// statsValid indicates if stats have been loaded for the table. The
	// TableStats structure is populated only if valid is true.
	statsValid atomic.Bool
	// FileBacking is the state which backs either a physical or virtual
	// sstables.
	FileBacking *FileBacking
	// InitAllowedSeeks is the initial value of allowed seeks. This is used
	// to re-set allowed seeks on a file once it hits 0.
	InitAllowedSeeks int64
	// FileNum is the file number.
	//
	// INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum.
	FileNum base.FileNum
	// Size is the size of the file, in bytes. Size is an approximate value for
	// virtual sstables.
	//
	// INVARIANTS:
	// - When !FileMetadata.Virtual, Size == FileBacking.Size.
	// - Size should be non-zero. Size 0 virtual sstables must not be created.
	Size uint64
	// File creation time in seconds since the epoch (1970-01-01 00:00:00
	// UTC). For ingested sstables, this corresponds to the time the file was
	// ingested. For virtual sstables, this corresponds to the wall clock time
	// when the FileMetadata for the virtual sstable was first created.
	CreationTime int64
	// Lower and upper bounds for the smallest and largest sequence numbers in
	// the table, across both point and range keys. For physical sstables, these
	// values are tight bounds. For virtual sstables, there is no guarantee that
	// there will be keys with SmallestSeqNum or LargestSeqNum within virtual
	// sstable bounds.
	SmallestSeqNum uint64
	LargestSeqNum  uint64
	// SmallestPointKey and LargestPointKey are the inclusive bounds for the
	// internal point keys stored in the table. This includes RANGEDELs, which
	// alter point keys.
	// NB: these fields should be set using ExtendPointKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestPointKey InternalKey
	LargestPointKey  InternalKey
	// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
	// internal range keys stored in the table.
	// NB: these fields should be set using ExtendRangeKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestRangeKey InternalKey
	LargestRangeKey  InternalKey
	// Smallest and Largest are the inclusive bounds for the internal keys stored
	// in the table, across both point and range keys.
	// NB: these fields are derived from their point and range key equivalents,
	// and are updated via the MaybeExtend{Point,Range}KeyBounds methods.
	Smallest InternalKey
	Largest  InternalKey
	// Stats describe table statistics. Protected by DB.mu.
	//
	// For virtual sstables, set stats upon virtual sstable creation as
	// asynchronous computation of stats is not currently supported.
	//
	// TODO(bananabrick): To support manifest replay for virtual sstables, we
	// probably need to compute virtual sstable stats asynchronously. Otherwise,
	// we'd have to write virtual sstable stats to the version edit.
	Stats TableStats
	// For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
	// pick L0 compactions. Only accurate for the most recent Version.
	SubLevel         int
	L0Index          int
	minIntervalIndex int
	maxIntervalIndex int
	// NB: the alignment of this struct is 8 bytes. We pack all the bools to
	// ensure an optimal packing.

	// IsIntraL0Compacting is set to True if this file is part of an intra-L0
	// compaction. When it's true, IsCompacting must also return true. If
	// Compacting is true and IsIntraL0Compacting is false for an L0 file, the
	// file must be part of a compaction to Lbase.
	IsIntraL0Compacting bool
	CompactionState     CompactionState
	// True if compaction of this file has been explicitly requested.
	// Previously, RocksDB and earlier versions of Pebble allowed this
	// flag to be set by a user table property collector. Some earlier
	// versions of Pebble respected this flag, while other more recent
	// versions ignored this flag.
	//
	// More recently this flag has been repurposed to facilitate the
	// compaction of 'atomic compaction units'. Files marked for
	// compaction are compacted in a rewrite compaction at the lowest
	// possible compaction priority.
	//
	// NB: A count of files marked for compaction is maintained on
	// Version, and compaction picking reads cached annotations
	// determined by this field.
	//
	// Protected by DB.mu.
	MarkedForCompaction bool
	// HasPointKeys tracks whether the table contains point keys (including
	// RANGEDELs). If a table contains only range deletions, HasPointKeys is
	// still true.
	HasPointKeys bool
	// HasRangeKeys tracks whether the table contains any range keys.
	HasRangeKeys bool
	// boundsSet tracks whether the overall bounds have been set.
	boundsSet bool
	// boundTypeSmallest and boundTypeLargest provide an indication as to which
	// key type (point or range) corresponds to the smallest and largest overall
	// table bounds.
	boundTypeSmallest, boundTypeLargest boundType
	// Virtual is true if the FileMetadata belongs to a virtual sstable.
	Virtual bool
	// PrefixReplacement is used for virtual files where the backing file has a
	// different prefix on its keys than the span in which it is being exposed.
	PrefixReplacement *sstable.PrefixReplacement
	SyntheticSuffix   sstable.SyntheticSuffix
}
// InternalKeyBounds returns the table's overall smallest and largest internal
// keys, in that order.
func (m *FileMetadata) InternalKeyBounds() (InternalKey, InternalKey) {
	lower, upper := m.Smallest, m.Largest
	return lower, upper
}
// SyntheticSeqNum returns a SyntheticSeqNum which is set when SmallestSeqNum
// equals LargestSeqNum; otherwise it returns sstable.NoSyntheticSeqNum.
func (m *FileMetadata) SyntheticSeqNum() sstable.SyntheticSeqNum {
	if m.SmallestSeqNum != m.LargestSeqNum {
		return sstable.NoSyntheticSeqNum
	}
	return sstable.SyntheticSeqNum(m.SmallestSeqNum)
}
// IterTransforms returns an sstable.IterTransforms populated with the
// synthetic sequence number, suffix, and (when applicable) prefix for this
// file's iterators.
func (m *FileMetadata) IterTransforms() sstable.IterTransforms {
	transforms := sstable.IterTransforms{
		SyntheticSeqNum: m.SyntheticSeqNum(),
		SyntheticSuffix: m.SyntheticSuffix,
	}
	// A synthetic prefix is only passed through when the prefix-replacement
	// iterator is not being used to perform the replacement.
	if pr := m.PrefixReplacement; pr != nil && !pr.UsePrefixReplacementIterator() {
		transforms.SyntheticPrefix = pr.SyntheticPrefix
	}
	return transforms
}
// PhysicalFileMeta is used by functions which want a guarantee that their input
// belongs to a physical sst and not a virtual sst.
//
// NB: This type should only be constructed by calling
// FileMetadata.PhysicalMeta.
type PhysicalFileMeta struct {
	*FileMetadata
}
// VirtualFileMeta is used by functions which want a guarantee that their input
// belongs to a virtual sst and not a physical sst.
//
// A VirtualFileMeta inherits all the same fields as a FileMetadata. These
// fields have additional invariants imposed on them, and/or slightly varying
// meanings:
//   - Smallest and Largest (and their counterparts
//     {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a
//     key at that exact bound. We make the effort to determine the next smallest
//     or largest key in an sstable after virtualizing it, to maintain this
//     tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
//     could mean that a rangedel or range key ends at that user key, or has been
//     truncated to that user key.
//   - One invariant is that if a rangedel or range key is truncated on its
//     upper bound, the virtual sstable *must* have a rangedel or range key
//     sentinel key as its upper bound. This is because truncation yields
//     an exclusive upper bound for the rangedel/rangekey, and if there are
//     any points at that exclusive upper bound within the same virtual
//     sstable, those could get uncovered by this truncation. We enforce this
//     invariant in calls to keyspan.Truncate.
//   - Size is an estimate of the size of the virtualized portion of this sstable.
//     The underlying file's size is stored in FileBacking.Size, though it could
//     also be estimated or could correspond to just the referenced portion of
//     a file (eg. if the file originated on another node).
//   - Size must be > 0.
//   - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables.
//     This means that all keys in the virtual sstable must have seqnums within
//     [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's
//     a key with a seqnum at either of the bounds. Calculating tight seqnum
//     bounds would be too expensive and deliver little value.
//
// NB: This type should only be constructed by calling FileMetadata.VirtualMeta.
type VirtualFileMeta struct {
	*FileMetadata
}
// VirtualReaderParams fills in the parameters necessary to create a virtual
// sstable reader. isShared indicates whether the backing file is shared across
// nodes; combined with a nonzero synthetic seqnum it marks the file as a
// shared ingest.
func (m VirtualFileMeta) VirtualReaderParams(isShared bool) sstable.VirtualReaderParams {
	var params sstable.VirtualReaderParams
	params.Lower = m.Smallest
	params.Upper = m.Largest
	params.FileNum = m.FileNum
	params.IsSharedIngested = isShared && m.SyntheticSeqNum() != 0
	params.Size = m.Size
	params.BackingSize = m.FileBacking.Size
	params.PrefixReplacement = m.PrefixReplacement
	return params
}
// PhysicalMeta should be the only source of creating the PhysicalFileMeta
// wrapper type. It panics if the receiver describes a virtual sstable.
func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta {
	if !m.Virtual {
		return PhysicalFileMeta{m}
	}
	panic("pebble: file metadata does not belong to a physical sstable")
}
// VirtualMeta should be the only source of creating the VirtualFileMeta wrapper
// type. It panics if the receiver describes a physical sstable.
func (m *FileMetadata) VirtualMeta() VirtualFileMeta {
	if m.Virtual {
		return VirtualFileMeta{m}
	}
	panic("pebble: file metadata does not belong to a virtual sstable")
}
// FileBacking either backs a single physical sstable, or one or more virtual
// sstables.
//
// See the comment above the FileMetadata type for sstable terminology.
type FileBacking struct {
	// DiskFileNum identifies the backing file on disk.
	DiskFileNum base.DiskFileNum
	// Size is the size of the backing file in bytes.
	Size uint64
	// refs is the reference count for the backing file, used to determine when
	// a backing file is obsolete and can be removed.
	//
	// The reference count is at least the number of distinct tables that use this
	// backing across all versions that have a non-zero reference count. The tables
	// in each version are maintained in a copy-on-write B-tree and each B-tree node
	// keeps a reference on the respective backings.
	//
	// In addition, a reference count is taken for every backing in the latest
	// version's VirtualBackings (necessary to support Protect/Unprotect).
	refs atomic.Int32
}
// MustHaveRefs asserts that the backing has a positive refcount, panicking
// with an assertion failure otherwise.
func (b *FileBacking) MustHaveRefs() {
	refs := b.refs.Load()
	if refs > 0 {
		return
	}
	panic(errors.AssertionFailedf("backing %s must have positive refcount (refs=%d)",
		b.DiskFileNum, refs))
}
// Ref increments the backing's reference count by one.
func (b *FileBacking) Ref() {
	b.refs.Add(1)
}
// Unref decrements the backing's reference count and returns the new count.
// Under invariants builds, a negative result panics, since it indicates an
// unbalanced Ref/Unref pairing.
func (b *FileBacking) Unref() int32 {
	remaining := b.refs.Add(-1)
	if invariants.Enabled && remaining < 0 {
		panic("pebble: invalid FileMetadata refcounting")
	}
	return remaining
}
// InitPhysicalBacking allocates and sets the FileBacking which is required by a
// physical sstable FileMetadata. It is a no-op when a backing is already set,
// and panics when called on a virtual sstable.
//
// Ensure that the state required by FileBacking, such as the FileNum, is
// already set on the FileMetadata before InitPhysicalBacking is called.
// Calling InitPhysicalBacking only after the relevant state has been set in the
// FileMetadata is not necessary in tests which don't rely on FileBacking.
func (m *FileMetadata) InitPhysicalBacking() {
	if m.Virtual {
		panic("pebble: virtual sstables should use a pre-existing FileBacking")
	}
	if m.FileBacking != nil {
		// A backing already exists; keep it.
		return
	}
	m.FileBacking = &FileBacking{
		DiskFileNum: base.PhysicalTableDiskFileNum(m.FileNum),
		Size:        m.Size,
	}
}
// InitProviderBacking creates a new FileBacking for a file backed by
// an objstorage.Provider. The backing's Size is always (re)set to size,
// even when a backing already exists. Panics on a non-virtual receiver.
func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum, size uint64) {
	if !m.Virtual {
		panic("pebble: provider-backed sstables must be virtual")
	}
	backing := m.FileBacking
	if backing == nil {
		backing = &FileBacking{DiskFileNum: fileNum}
		m.FileBacking = backing
	}
	backing.Size = size
}
// ValidateVirtual should be called once the FileMetadata for a virtual sstable
// is created to verify that the fields of the virtual sstable are sound.
// createdFrom is the metadata the virtual sstable was derived from; its seqnum
// bounds and backing must match the receiver's. Panics on any violation.
func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) {
	switch {
	case !m.Virtual:
		panic("pebble: invalid virtual sstable")
	case createdFrom.SmallestSeqNum != m.SmallestSeqNum:
		panic("pebble: invalid smallest sequence number for virtual sstable")
	case createdFrom.LargestSeqNum != m.LargestSeqNum:
		panic("pebble: invalid largest sequence number for virtual sstable")
	case createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking:
		panic("pebble: invalid physical sstable state for virtual sstable")
	case m.Size == 0:
		panic("pebble: virtual sstable size must be set upon creation")
	}
}
// SetCompactionState transitions this file's compaction state to the given
// state. Protected by DB.mu. Under invariants builds, an illegal transition
// (see the CompactionState state diagram) panics.
func (m *FileMetadata) SetCompactionState(to CompactionState) {
	if invariants.Enabled {
		valid := false
		switch m.CompactionState {
		case CompactionStateNotCompacting:
			// Only a compaction pick may move us out of NotCompacting.
			valid = to == CompactionStateCompacting
		case CompactionStateCompacting:
			// A compaction either finishes (Compacted) or is undone, e.g. a
			// move compaction or a failed compaction (NotCompacting).
			valid = to == CompactionStateCompacted || to == CompactionStateNotCompacting
		case CompactionStateCompacted:
			// Compacted is terminal.
		default:
			panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
		}
		if !valid {
			panic(errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to))
		}
	}
	m.CompactionState = to
}
// IsCompacting returns true if this file's compaction state is
// CompactionStateCompacting. Protected by DB.mu.
func (m *FileMetadata) IsCompacting() bool {
	return m.CompactionState == CompactionStateCompacting
}
// StatsValid returns true if the table stats have been populated. If StatsValid
// returns true, the Stats field may be read (with or without holding the
// database mutex).
func (m *FileMetadata) StatsValid() bool {
	return m.statsValid.Load()
}
// StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
// while populating TableStats and calling StatsMarkValid. Once stats are
// populated, they must not be mutated.
func (m *FileMetadata) StatsMarkValid() {
	m.statsValid.Store(true)
}
// ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
// and overall table bounds with the given smallest and largest keys. The
// smallest and largest bounds may not be extended if the table already has a
// bound that is smaller or larger, respectively. The receiver is returned.
// NB: calling this method should be preferred to manually setting the bounds by
// manipulating the fields directly, to maintain certain invariants.
func (m *FileMetadata) ExtendPointKeyBounds(
	cmp Compare, smallest, largest InternalKey,
) *FileMetadata {
	if m.HasPointKeys {
		// Widen each existing point-key bound only if the candidate is
		// strictly outside it.
		if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
			m.SmallestPointKey = smallest
		}
		if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
			m.LargestPointKey = largest
		}
	} else {
		// First point keys seen; adopt the bounds verbatim.
		m.SmallestPointKey, m.LargestPointKey = smallest, largest
		m.HasPointKeys = true
	}
	// Propagate the (possibly updated) point-key bounds into the overall
	// table bounds.
	m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
	return m
}
// ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
// and overall table bounds with the given smallest and largest keys. The
// smallest and largest bounds may not be extended if the table already has a
// bound that is smaller or larger, respectively. The receiver is returned.
// NB: calling this method should be preferred to manually setting the bounds by
// manipulating the fields directly, to maintain certain invariants.
func (m *FileMetadata) ExtendRangeKeyBounds(
	cmp Compare, smallest, largest InternalKey,
) *FileMetadata {
	if m.HasRangeKeys {
		// Widen each existing range-key bound only if the candidate is
		// strictly outside it.
		if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
			m.SmallestRangeKey = smallest
		}
		if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
			m.LargestRangeKey = largest
		}
	} else {
		// First range keys seen; adopt the bounds verbatim.
		m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
		m.HasRangeKeys = true
	}
	// Propagate the (possibly updated) range-key bounds into the overall
	// table bounds.
	m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
	return m
}
// extendOverallBounds attempts to extend the overall table lower and upper
// bounds. The given bounds may not be used if a lower or upper bound already
// exists that is smaller or larger than the given keys, respectively. The given
// boundType will be used if the bounds are updated.
func (m *FileMetadata) extendOverallBounds(
	cmp Compare, smallest, largest InternalKey, bTyp boundType,
) {
	if !m.boundsSet {
		// No bounds yet: adopt the provided ones wholesale.
		m.Smallest, m.Largest = smallest, largest
		m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
		m.boundsSet = true
		return
	}
	if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
		m.Smallest = smallest
		m.boundTypeSmallest = bTyp
	}
	if base.InternalCompare(cmp, largest, m.Largest) > 0 {
		m.Largest = largest
		m.boundTypeLargest = bTyp
	}
}
// Overlaps returns true if the file key range overlaps with the given user-key
// range [start, end]; exclusiveEnd indicates that end is an exclusive bound.
func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool {
	// File entirely before the range: its largest key is below start, or sits
	// exactly at start but is an exclusive sentinel.
	c := cmp(m.Largest.UserKey, start)
	if c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) {
		return false
	}
	// File entirely after the range: its smallest key is above end, or sits
	// exactly at an exclusive end.
	c = cmp(m.Smallest.UserKey, end)
	if c > 0 || (c == 0 && exclusiveEnd) {
		return false
	}
	return true
}
// ContainedWithinSpan returns true if the file key range completely overlaps
// with the given range ("end" is assumed to be exclusive).
func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool {
	if cmp(m.Smallest.UserKey, start) < 0 {
		// The file begins before start; not contained.
		return false
	}
	// The file's largest key must fall strictly below end, or equal end only
	// when it is an exclusive sentinel.
	upper := cmp(m.Largest.UserKey, end)
	return upper < 0 || (upper == 0 && m.Largest.IsExclusiveSentinel())
}
// ContainsKeyType returns whether or not the file contains keys of the provided
// type. Panics on an unrecognized KeyType.
func (m *FileMetadata) ContainsKeyType(kt KeyType) bool {
	if kt == KeyTypePointAndRange {
		return true
	}
	if kt == KeyTypePoint {
		return m.HasPointKeys
	}
	if kt == KeyTypeRange {
		return m.HasRangeKeys
	}
	panic("unrecognized key type")
}
// SmallestBound returns the file's smallest bound of the key type. It returns a
// false second return value if the file does not contain any keys of the key
// type. Panics on an unrecognized KeyType.
func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) {
	switch kt {
	case KeyTypePoint:
		return &m.SmallestPointKey, m.HasPointKeys
	case KeyTypeRange:
		return &m.SmallestRangeKey, m.HasRangeKeys
	case KeyTypePointAndRange:
		// The combined bound always exists.
		return &m.Smallest, true
	}
	panic("unrecognized key type")
}
// LargestBound returns the file's largest bound of the key type. It returns a
// false second return value if the file does not contain any keys of the key
// type. Panics on an unrecognized KeyType.
func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) {
	switch kt {
	case KeyTypePoint:
		return &m.LargestPointKey, m.HasPointKeys
	case KeyTypeRange:
		return &m.LargestRangeKey, m.HasRangeKeys
	case KeyTypePointAndRange:
		// The combined bound always exists.
		return &m.Largest, true
	}
	panic("unrecognized key type")
}
// Bit positions used by boundsMarker to encode bound metadata in a single
// byte.
const (
	// maskContainsPointKeys is set when the table contains any point keys.
	maskContainsPointKeys = 1 << 0
	// maskSmallest is set when the smallest overall bound is a point key.
	maskSmallest = 1 << 1
	// maskLargest is set when the largest overall bound is a point key.
	maskLargest = 1 << 2
)
// boundsMarker returns a marker byte whose bits encode the following
// information (in order from least significant bit):
//   - if the table contains point keys
//   - if the table's smallest key is a point key
//   - if the table's largest key is a point key
//
// A corruption error is returned when either bound type is unset.
func (m *FileMetadata) boundsMarker() (uint8, error) {
	var marker uint8
	if m.HasPointKeys {
		marker |= maskContainsPointKeys
	}
	switch m.boundTypeSmallest {
	case boundTypePointKey:
		marker |= maskSmallest
	case boundTypeRangeKey:
		// Range-key bound: the bit stays unset.
	default:
		return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
	}
	switch m.boundTypeLargest {
	case boundTypePointKey:
		marker |= maskLargest
	case boundTypeRangeKey:
		// Range-key bound: the bit stays unset.
	default:
		return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
	}
	return marker, nil
}
// String implements fmt.Stringer, printing the file number and the overall
// table bounds.
func (m *FileMetadata) String() string {
	return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
}
// DebugString returns a verbose representation of FileMetadata, typically for
// use in tests and debugging, returning the file number and the point, range
// and overall bounds for the table. When verbose is false, only the file
// number (plus backing file number for virtual sstables) and overall bounds
// are printed.
func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
	var sb bytes.Buffer
	if m.Virtual {
		// Virtual sstables also show their backing disk file number.
		fmt.Fprintf(&sb, "%s(%s):[%s-%s]",
			m.FileNum, m.FileBacking.DiskFileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
	} else {
		fmt.Fprintf(&sb, "%s:[%s-%s]",
			m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
	}
	if verbose {
		fmt.Fprintf(&sb, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
		if m.HasPointKeys {
			fmt.Fprintf(&sb, " points:[%s-%s]",
				m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
		}
		if m.HasRangeKeys {
			fmt.Fprintf(&sb, " ranges:[%s-%s]",
				m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
		}
		if m.Size != 0 {
			fmt.Fprintf(&sb, " size:%d", m.Size)
		}
	}
	return sb.String()
}
// ParseFileMetadataDebug parses a FileMetadata from its DebugString
// representation. Parse errors raised inside the debug parser are recovered
// and returned as an error rather than panicking.
func ParseFileMetadataDebug(s string) (_ *FileMetadata, err error) {
	defer func() {
		// The debug parser reports malformed input via panic; convert that
		// into a returned error.
		err = errors.CombineErrors(err, maybeRecover())
	}()
	// Input format:
	//  000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...]
	m := &FileMetadata{}
	p := makeDebugParser(s)
	m.FileNum = p.FileNum()
	var backingNum base.DiskFileNum
	// An optional parenthesized backing file number marks a virtual sstable,
	// e.g. 000001(000008):[...].
	if p.Peek() == "(" {
		p.Expect("(")
		backingNum = p.DiskFileNum()
		p.Expect(")")
	}
	p.Expect(":", "[")
	m.Smallest = p.InternalKey()
	p.Expect("-")
	m.Largest = p.InternalKey()
	p.Expect("]")
	// Remaining tokens are optional "field:value" sections in any order.
	for !p.Done() {
		field := p.Next()
		p.Expect(":")
		switch field {
		case "seqnums":
			p.Expect("[")
			m.SmallestSeqNum = p.Uint64()
			p.Expect("-")
			m.LargestSeqNum = p.Uint64()
			p.Expect("]")
		case "points":
			p.Expect("[")
			m.SmallestPointKey = p.InternalKey()
			p.Expect("-")
			m.LargestPointKey = p.InternalKey()
			m.HasPointKeys = true
			p.Expect("]")
		case "ranges":
			p.Expect("[")
			m.SmallestRangeKey = p.InternalKey()
			p.Expect("-")
			m.LargestRangeKey = p.InternalKey()
			m.HasRangeKeys = true
			p.Expect("]")
		case "size":
			m.Size = p.Uint64()
		default:
			p.Errf("unknown field %q", field)
		}
	}
	// By default, when the parser sees just the overall bounds, we set the point
	// keys. This preserves backwards compatibility with existing test cases that
	// specify only the overall bounds.
	if !m.HasPointKeys && !m.HasRangeKeys {
		m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
		m.HasPointKeys = true
	}
	// A present backing number implies a virtual sstable; otherwise the file
	// is physical and backs itself.
	if backingNum == 0 {
		m.InitPhysicalBacking()
	} else {
		m.Virtual = true
		m.InitProviderBacking(backingNum, 0 /* size */)
	}
	return m, nil
}
// Validate validates the metadata for consistency with itself, returning an
// error if inconsistent.
func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
	// Combined range and point key validation.
	// A file must contain at least one kind of key.
	if !m.HasPointKeys && !m.HasRangeKeys {
		return base.CorruptionErrorf("file %s has neither point nor range keys",
			errors.Safe(m.FileNum))
	}
	// The overall bounds must be well ordered.
	if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
			errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
			m.Largest.Pretty(formatKey))
	}
	if m.SmallestSeqNum > m.LargestSeqNum {
		return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
			errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
	}
	// Point key validation: the point key bounds must be well ordered and
	// contained within the overall bounds.
	if m.HasPointKeys {
		if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
				m.LargestPointKey.Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent point key bounds relative to overall bounds: "+
					"overall = [%s-%s], point keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
			)
		}
	}
	// Range key validation: same containment checks as for point keys.
	if m.HasRangeKeys {
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
				m.LargestRangeKey.Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent range key bounds relative to overall bounds: "+
					"overall = [%s-%s], range keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
			)
		}
	}
	// Ensure that FileMetadata.Init was called.
	if m.FileBacking == nil {
		return base.CorruptionErrorf("file metadata FileBacking not set")
	}
	// Prefix replacement rules only make sense for virtual files, and both
	// bounds must actually carry the synthetic prefix.
	if m.PrefixReplacement != nil {
		if !m.Virtual {
			return base.CorruptionErrorf("prefix replacement rule set with non-virtual file")
		}
		if !bytes.HasPrefix(m.Smallest.UserKey, m.PrefixReplacement.SyntheticPrefix) {
			return base.CorruptionErrorf("virtual file with prefix replacement rules has smallest key with a different prefix: %s", m.Smallest.Pretty(formatKey))
		}
		if !bytes.HasPrefix(m.Largest.UserKey, m.PrefixReplacement.SyntheticPrefix) {
			return base.CorruptionErrorf("virtual file with prefix replacement rules has largest key with a different prefix: %s", m.Largest.Pretty(formatKey))
		}
	}
	// Suffix replacement is likewise restricted to virtual files.
	if m.SyntheticSuffix != nil {
		if !m.Virtual {
			return base.CorruptionErrorf("suffix replacement rule set with non-virtual file")
		}
	}
	return nil
}
// TableInfo returns a subset of the FileMetadata state formatted as a
// TableInfo.
func (m *FileMetadata) TableInfo() TableInfo {
	// Copy only the fields TableInfo exposes; the remaining FileMetadata
	// state is omitted.
	var info TableInfo
	info.FileNum = m.FileNum
	info.Size = m.Size
	info.Smallest = m.Smallest
	info.Largest = m.Largest
	info.SmallestSeqNum = m.SmallestSeqNum
	info.LargestSeqNum = m.LargestSeqNum
	return info
}
// cmpSeqNum orders two files by (LargestSeqNum, SmallestSeqNum, FileNum),
// ascending. NB: This is the same ordering that RocksDB uses for L0 files.
func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
	switch {
	case m.LargestSeqNum != b.LargestSeqNum:
		// Sort first by largest sequence number.
		return stdcmp.Compare(m.LargestSeqNum, b.LargestSeqNum)
	case m.SmallestSeqNum != b.SmallestSeqNum:
		// Then by smallest sequence number.
		return stdcmp.Compare(m.SmallestSeqNum, b.SmallestSeqNum)
	default:
		// Break ties by file number, which is unique, yielding a total order.
		return stdcmp.Compare(m.FileNum, b.FileNum)
	}
}
// lessSeqNum reports whether m sorts before b under the ordering defined by
// cmpSeqNum.
func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
	return m.cmpSeqNum(b) < 0
}
// cmpSmallestKey orders two files by comparing their smallest internal keys,
// using cmp to compare user keys.
func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
	return base.InternalCompare(cmp, m.Smallest, b.Smallest)
}
// KeyRange returns the minimum smallest and maximum largest internalKey for
// all the FileMetadata in iters.
func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
	seeded := false
	// extend widens the accumulated bounds to include f's bounds; the first
	// file seen seeds the bounds directly.
	extend := func(f *FileMetadata) {
		if !seeded {
			seeded = true
			smallest, largest = f.Smallest, f.Largest
			return
		}
		if base.InternalCompare(ucmp, smallest, f.Smallest) >= 0 {
			smallest = f.Smallest
		}
		if base.InternalCompare(ucmp, largest, f.Largest) <= 0 {
			largest = f.Largest
		}
	}
	for _, it := range iters {
		for f := it.First(); f != nil; f = it.Next() {
			extend(f)
		}
	}
	return smallest, largest
}
// bySeqNum implements sort.Interface, ordering files by cmpSeqNum
// (largest seqnum, then smallest seqnum, then file number).
type bySeqNum []*FileMetadata

func (b bySeqNum) Len() int { return len(b) }
func (b bySeqNum) Less(i, j int) bool {
	return b[i].lessSeqNum(b[j])
}
func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

// SortBySeqNum sorts the specified files by increasing sequence number.
func SortBySeqNum(files []*FileMetadata) {
	sort.Sort(bySeqNum(files))
}
// bySmallest implements sort.Interface, ordering files by their smallest
// internal key under the supplied user-key comparison function.
type bySmallest struct {
	files []*FileMetadata
	cmp   Compare
}

func (b bySmallest) Len() int { return len(b.files) }
func (b bySmallest) Less(i, j int) bool {
	return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
}
func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }

// SortBySmallest sorts the specified files by smallest key using the supplied
// comparison function to order user keys.
func SortBySmallest(files []*FileMetadata, cmp Compare) {
	sort.Sort(bySmallest{files, cmp})
}
func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
startIter := iter.Clone()
{
startIterFile := startIter.SeekGE(cmp, start)
// SeekGE compares user keys. The user key `start` may be equal to the
// f.Largest because f.Largest is a range deletion sentinel, indicating
// that the user key `start` is NOT contained within the file f. If
// that's the case, we can narrow the overlapping bounds to exclude the
// file with the sentinel.
if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() &&