forked from cockroachdb/pebble
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mem_table.go
288 lines (260 loc) · 8.73 KB
/
mem_table.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package pebble
import (
"fmt"
"sync"
"sync/atomic"
"unsafe"
"github.com/cockroachdb/pebble/internal/arenaskl"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/rangedel"
)
// memTableEntrySize returns the upper bound on arena space consumed by an
// entry with the given key and value lengths. The key length is padded by 8
// bytes for the internal key trailer (seqnum + kind).
func memTableEntrySize(keyBytes, valueBytes int) uint32 {
	keySize := uint32(keyBytes) + 8 // account for the internal key trailer
	return arenaskl.MaxNodeSize(keySize, uint32(valueBytes))
}
// A memTable implements an in-memory layer of the LSM. A memTable is mutable,
// but append-only. Records are added, but never removed. Deletion is supported
// via tombstones, but it is up to higher level code (see Iterator) to support
// processing those tombstones.
//
// A memTable is implemented on top of a lock-free arena-backed skiplist. An
// arena is a fixed size contiguous chunk of memory (see
// Options.MemTableSize). A memTable's memory consumption is thus fixed at
// the time of creation (with the exception of the cached fragmented range
// tombstones). The arena-backed skiplist provides both forward and reverse
// links which makes forward and reverse iteration the same speed.
//
// A batch is "applied" to a memTable in a two step process: prepare(batch) ->
// apply(batch). memTable.prepare() is not thread-safe and must be called with
// external synchronization. Preparation reserves space in the memTable for the
// batch. Note that we pessimistically compute how much space a batch will
// consume in the memTable (see memTableEntrySize and
// Batch.memTableSize). Preparation is an O(1) operation. Applying a batch to
// the memTable can be performed concurrently with other apply
// operations. Applying a batch is an O(N logM) operation where N is the number
// of records in the batch and M is the number of records in the memtable. The
// commitPipeline serializes batch preparation, and allows batch application to
// proceed concurrently.
//
// It is safe to call get, apply, newIter, and newRangeDelIter concurrently.
type memTable struct {
	cmp   Compare
	equal Equal
	// skl holds point entries; rangeDelSkl holds range deletion tombstones.
	// Both are backed by the same fixed-size arena (see newMemTable).
	skl         arenaskl.Skiplist
	rangeDelSkl arenaskl.Skiplist
	// emptySize is the arena usage of a freshly created (empty) memtable.
	emptySize uint32
	// reserved is the number of arena bytes reserved by prepared batches.
	// Written only by prepare, which requires external synchronization.
	reserved uint32
	// refs is an atomic reference count; the memtable is ready to be flushed
	// once it drops to zero (see readyForFlush).
	refs int32
	// flushedCh is returned by flushed(); presumably closed by the flush
	// machinery once flushing completes — confirm at the close site (not
	// visible in this file).
	flushedCh chan struct{}
	// tombstones caches the fragmented range tombstones from rangeDelSkl.
	tombstones rangeTombstoneCache
	// logNum/logSize are exposed via logInfo; presumably the associated WAL
	// number and size — confirm against the caller that sets them.
	logNum  uint64
	logSize uint64
}
// newMemTable returns a new MemTable with a single reference held by the
// caller, sized according to o.MemTableSize.
func newMemTable(o *Options) *memTable {
	o = o.EnsureDefaults()
	m := new(memTable)
	m.cmp = o.Comparer.Compare
	m.equal = o.Comparer.Equal
	m.refs = 1
	m.flushedCh = make(chan struct{})
	// Both skiplists share one fixed-size arena; record its initial usage so
	// empty()/totalBytes() can subtract the skiplist header overhead.
	arena := arenaskl.NewArena(uint32(o.MemTableSize), 0)
	m.skl.Reset(arena, m.cmp)
	m.rangeDelSkl.Reset(arena, m.cmp)
	m.emptySize = arena.Size()
	return m
}
// ref atomically increments the memtable's reference count.
func (m *memTable) ref() {
	atomic.AddInt32(&m.refs, 1)
}
// unref atomically decrements the memtable's reference count, returning true
// when the count reaches zero. It panics if the count goes negative, which
// indicates an unref without a matching ref.
func (m *memTable) unref() bool {
	n := atomic.AddInt32(&m.refs, -1)
	if n < 0 {
		panic("pebble: inconsistent reference count")
	}
	return n == 0
}
// flushed returns a channel associated with the memtable's flush; presumably
// closed once the flush completes — confirm at the close site (not visible in
// this file).
func (m *memTable) flushed() chan struct{} {
	return m.flushedCh
}
// readyForFlush reports whether the reference count has dropped to zero,
// i.e. no prepared-but-unapplied batches remain.
func (m *memTable) readyForFlush() bool {
	return atomic.LoadInt32(&m.refs) == 0
}
// logInfo returns the log number and log size recorded on the memtable
// (presumably the associated WAL — confirm against the code that sets them).
func (m *memTable) logInfo() (uint64, uint64) {
	return m.logNum, m.logSize
}
// get returns the value for the given key, or ErrNotFound if the memtable
// does not contain the key or the newest entry for it is a deletion
// tombstone.
func (m *memTable) get(key []byte) ([]byte, error) {
	iter := m.skl.NewIter(nil, nil)
	ikey, val := iter.SeekGE(key)
	// SeekGE lands on the first entry >= key; it must exist and have exactly
	// the requested user key.
	if ikey == nil || !m.equal(key, ikey.UserKey) {
		return nil, ErrNotFound
	}
	if kind := ikey.Kind(); kind == InternalKeyKindDelete || kind == InternalKeyKindSingleDelete {
		// The newest entry for the key is a point tombstone.
		return nil, ErrNotFound
	}
	return val, nil
}
// prepare reserves space for the batch in the memtable and references the
// memtable, preventing it from being flushed until the batch is applied. Note
// that prepare is not thread-safe, while apply is. The caller must call
// unref() after the batch has been applied. Returns arenaskl.ErrArenaFull if
// the arena cannot accommodate the batch's (pessimistic) size estimate.
func (m *memTable) prepare(batch *Batch) error {
	arena := m.skl.Arena()
	if atomic.LoadInt32(&m.refs) == 1 {
		// No apply operations are in flight, so reset the reservation to the
		// actual arena usage, reclaiming the over-estimation that
		// memTableEntrySize made for previously applied batches.
		m.reserved = arena.Size()
	}
	if avail := arena.Capacity() - m.reserved; batch.memTableSize > avail {
		return arenaskl.ErrArenaFull
	}
	m.reserved += batch.memTableSize
	m.ref()
	return nil
}
// apply inserts the batch's records into the memtable, assigning sequence
// numbers starting at seqNum. Point entries go to skl, range deletions to
// rangeDelSkl, and LogData records are skipped (they still consume a sequence
// number). Safe to call concurrently with other apply operations.
func (m *memTable) apply(batch *Batch, seqNum uint64) error {
	var ins arenaskl.Inserter
	var rangeDelCount uint32
	startSeqNum := seqNum
	reader := batch.Reader()
	for {
		kind, ukey, value, ok := reader.Next()
		if !ok {
			break
		}
		ikey := base.MakeInternalKey(ukey, seqNum, kind)
		seqNum++
		switch kind {
		case InternalKeyKindRangeDelete:
			if err := m.rangeDelSkl.Add(ikey, value); err != nil {
				return err
			}
			rangeDelCount++
		case InternalKeyKindLogData:
			// WAL-only record: nothing is inserted into the memtable.
		default:
			if err := ins.Add(&m.skl, ikey, value); err != nil {
				return err
			}
		}
	}
	// Every batch record consumes one sequence number; a mismatch indicates a
	// corrupted batch count.
	if expected := startSeqNum + uint64(batch.Count()); seqNum != expected {
		panic(fmt.Sprintf("pebble: inconsistent batch count: %d vs %d",
			seqNum, expected))
	}
	if rangeDelCount != 0 {
		m.tombstones.invalidate(rangeDelCount)
	}
	return nil
}
// newIter returns an iterator over the point entries that is unpositioned
// (Iterator.Valid() will return false). The iterator can be positioned via a
// call to SeekGE, SeekLT, First or Last.
func (m *memTable) newIter(o *IterOptions) internalIterator {
	lower, upper := o.GetLowerBound(), o.GetUpperBound()
	return m.skl.NewIter(lower, upper)
}
// newFlushIter returns an iterator intended for flushing the memtable's point
// entries, accumulating the number of bytes read into *bytesFlushed. The
// iterator options are unused.
func (m *memTable) newFlushIter(_ *IterOptions, bytesFlushed *uint64) internalIterator {
	return m.skl.NewFlushIter(bytesFlushed)
}
// newRangeDelIter returns an iterator over the memtable's fragmented range
// tombstones, or nil if the memtable contains none.
func (m *memTable) newRangeDelIter(*IterOptions) internalIterator {
	if tombstones := m.tombstones.get(m); tombstones != nil {
		return rangedel.NewIter(m.cmp, tombstones)
	}
	return nil
}
// totalBytes returns the number of arena bytes consumed by entries, excluding
// the fixed overhead of an empty memtable.
func (m *memTable) totalBytes() uint64 {
	return uint64(m.skl.Size() - m.emptySize)
}
// close is a no-op: the arena-backed memtable holds no resources that need
// explicit release.
func (m *memTable) close() error {
	return nil
}
// empty returns whether the MemTable has no key/value pairs, i.e. the arena
// is still at its initial (empty) size.
func (m *memTable) empty() bool {
	return m.skl.Size() == m.emptySize
}
// A rangeTombstoneFrags holds a set of fragmented range tombstones generated
// at a particular "sequence number" for a memtable. Rather than use actual
// sequence numbers, this cache uses a count of the number of range tombstones
// in the memTable. Note that the count of range tombstones in a memTable only
// ever increases, which provides a monotonically increasing sequence.
type rangeTombstoneFrags struct {
	// count is the range-tombstone count this set was created for; compared
	// against newer counts in rangeTombstoneCache.invalidate.
	count uint32
	// once guards the lazy, one-time population of tombstones in get.
	once sync.Once
	// tombstones is the fragmented output, written exactly once under once.
	tombstones []rangedel.Tombstone
}
// get retrieves the fragmented tombstones, populating them on first use. Note
// that the populated tombstone fragments may be built from more than f.count
// memTable range tombstones, but that is ok for correctness. All we're
// requiring is that the memTable contains at least f.count range
// tombstones. This situation can occur if there are multiple concurrent
// additions of range tombstones and a concurrent reader. The reader can load
// a tombstoneFrags and populate it even though it has been invalidated
// (i.e. replaced with a newer tombstoneFrags).
func (f *rangeTombstoneFrags) get(m *memTable) []rangedel.Tombstone {
	f.once.Do(func() {
		// Feed every raw tombstone from the range-del skiplist through the
		// fragmenter; fragments are appended as they are emitted.
		fragmenter := &rangedel.Fragmenter{
			Cmp: m.cmp,
			Emit: func(fragmented []rangedel.Tombstone) {
				f.tombstones = append(f.tombstones, fragmented...)
			},
		}
		iter := m.rangeDelSkl.NewIter(nil, nil)
		for key, val := iter.First(); key != nil; key, val = iter.Next() {
			fragmenter.Add(*key, val)
		}
		fragmenter.Finish()
	})
	return f.tombstones
}
// A rangeTombstoneCache is used to cache a set of fragmented tombstones. The
// cache is invalidated whenever a tombstone is added to a memTable, and
// populated when empty when a range-del iterator is created.
type rangeTombstoneCache struct {
	// count is the cumulative number of range tombstones added to the
	// memtable; updated atomically.
	count uint32
	// frags atomically holds the current *rangeTombstoneFrags, or nil if no
	// invalidation has occurred yet.
	frags unsafe.Pointer
}
// Invalidate the current set of cached tombstones, indicating the number of
// tombstones that were added. Safe for concurrent use: the replacement is
// installed via a compare-and-swap loop.
func (c *rangeTombstoneCache) invalidate(count uint32) {
	// Bump the cache's logical "sequence number"; any cached frags built for
	// a smaller count are now stale.
	newCount := atomic.AddUint32(&c.count, count)
	var frags *rangeTombstoneFrags
	for {
		oldPtr := atomic.LoadPointer(&c.frags)
		if oldPtr != nil {
			oldFrags := (*rangeTombstoneFrags)(oldPtr)
			if oldFrags.count >= newCount {
				// Someone else invalidated the cache before us and their invalidation
				// subsumes ours.
				break
			}
		}
		// Lazily allocate the replacement (empty, unpopulated) frags and
		// reuse it across CAS retries.
		if frags == nil {
			frags = &rangeTombstoneFrags{count: newCount}
		}
		if atomic.CompareAndSwapPointer(&c.frags, oldPtr, unsafe.Pointer(frags)) {
			// We successfully invalidated the cache.
			break
		}
		// Someone else invalidated the cache. Loop and try again.
	}
}
// get returns the cached fragmented tombstones, lazily populating them if
// necessary. Returns nil if the cache has never been invalidated (no range
// tombstones have been added).
func (c *rangeTombstoneCache) get(m *memTable) []rangedel.Tombstone {
	ptr := atomic.LoadPointer(&c.frags)
	if ptr == nil {
		return nil
	}
	return (*rangeTombstoneFrags)(ptr).get(m)
}