-
Notifications
You must be signed in to change notification settings - Fork 471
/
table.go
428 lines (382 loc) · 14.6 KB
/
table.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
// Package sstable implements readers and writers of pebble tables.
//
// Tables are either opened for reading or created for writing but not both.
//
// A reader can create iterators, which allow seeking and next/prev
// iteration. There may be multiple key/value pairs that have the same key and
// different sequence numbers.
//
// A reader can be used concurrently. Multiple goroutines can call NewIter
// concurrently, and each iterator can run concurrently with other iterators.
// However, any particular iterator should not be used concurrently, and iterators
// should not be used once a reader is closed.
//
// A writer writes key/value pairs in increasing key order, and cannot be used
// concurrently. A table cannot be read until the writer has finished.
//
// Readers and writers can be created with various options. Passing a nil
// Options pointer is valid and means to use the default values.
//
// One such option is to define the 'less than' ordering for keys. The default
// Comparer uses the natural ordering consistent with bytes.Compare. The same
// ordering should be used for reading and writing a table.
//
// To return the value for a key:
//
// r := table.NewReader(file, options)
// defer r.Close()
// i := r.NewIter(nil, nil)
// defer i.Close()
// ikey, value := i.SeekGE(key)
// if options.Comparer.Compare(ikey.UserKey, key) != 0 {
// // not found
// } else {
// // value is the first record containing key
// }
//
// To count the number of entries in a table:
//
// i, n := r.NewIter(nil, nil), 0
// for key, value := i.First(); key != nil; key, value = i.Next() {
// n++
// }
// if err := i.Close(); err != nil {
// return 0, err
// }
// return n, nil
//
// To write a table with three entries:
//
// w := table.NewWriter(file, options)
// if err := w.Set([]byte("apple"), []byte("red")); err != nil {
// w.Close()
// return err
// }
// if err := w.Set([]byte("banana"), []byte("yellow")); err != nil {
// w.Close()
// return err
// }
// if err := w.Set([]byte("cherry"), []byte("red")); err != nil {
// w.Close()
// return err
// }
// return w.Close()
package sstable // import "github.com/cockroachdb/pebble/sstable"
import (
"encoding/binary"
"io"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/base"
)
/*
The table file format looks like:
<start_of_file>
[data block 0]
[data block 1]
...
[data block N-1]
[meta filter block] (optional)
[index block] (for single level index)
[meta rangedel block] (optional)
[meta range key block] (optional)
[meta properties block]
[metaindex block]
[footer]
<end_of_file>
A Reader eagerly loads the footer, metaindex block and meta properties block,
because the data contained in those blocks is needed on every read, and even
before reading. For example, the meta properties block is used to verify the
comparer and merger are compatible, and the metaindex block contains the
location of the meta properties (and other meta blocks). In situations where
file system locality matters, or one wants to minimize number of read
requests when eagerly loading these blocks, having these three as a suffix
of the file is convenient.
The interleaving of the index block(s) between the meta blocks is done to
match RocksDB/LevelDB behavior.
Each block consists of some data and a 5 byte trailer: a 1 byte block type and a
4 byte checksum. The checksum is computed over the compressed data and the first
byte of the trailer (i.e. the block type), and is serialized as little-endian.
The block type gives the per-block compression used; each block is compressed
independently. The checksum algorithm is described in the pebble/crc package.
Most blocks, other than the meta filter block, contain key/value pairs. The
remainder of this comment refers to the decompressed block, which has its 5 byte
trailer stripped. The decompressed block data consists of a sequence of such
key/value entries followed by a block suffix. Each key is encoded as a shared
prefix length and a remainder string. For example, if two adjacent keys are
"tweedledee" and "tweedledum", then the second key would be encoded as {8,
"um"}. The shared prefix length is varint encoded. The remainder string and the
value are encoded as a varint-encoded length followed by the literal contents.
To continue the example, suppose that the key "tweedledum" mapped to the value
"socks". The encoded key/value entry would be: "\x08\x02\x05umsocks".
Every block has a restart interval I. Every I'th key/value entry in that block
is called a restart point, and shares no key prefix with the previous entry.
Continuing the example above, if the key after "tweedledum" was "two", but was
part of a restart point, then that key would be encoded as {0, "two"} instead
of {2, "o"}. If a block has P restart points, then the block suffix consists
of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these
uint32 values are the block offsets of each restart point. The final uint32
value is P itself. Thus, when seeking for a particular key, one can use binary
search to find the largest restart point whose key is <= the key sought.
An index block is a block with N key/value entries. The i'th value is the
encoded block handle of the i'th data block. The i'th key is a separator for
i < N-1, and a successor for i == N-1. The separator between blocks i and i+1
is a key that is >= every key in block i and is < every key in block i+1. The
successor for the final block is a key that is >= every key in block N-1. The
index block restart interval is 1: every entry is a restart point.
A block handle is an offset, a length, and optional block properties (for data
blocks and first/lower level index blocks); the length does not include the 5
byte trailer. All numbers are varint-encoded, with no padding between the two
values. The maximum size of an encoded block handle without properties is 20
bytes. It is not advised to have properties that accumulate to be longer than
100 bytes.
Instead of a single index block, the sstable can have a two-level index (this
is used to prevent a single huge index block). A two-level index consists of a
sequence of lower-level index blocks with block handles for data blocks
followed by a single top-level index block with block handles for the
lower-level index blocks.
The metaindex block also contains block handles as values, with keys being
the names of the meta blocks.
*/
const (
	// blockTrailerLen is the length of the trailer appended to every block:
	// a 1-byte block type followed by a 4-byte checksum.
	blockTrailerLen = 5
	// A block handle without properties is two varint-encoded uint64s
	// (offset and length); each varint occupies at most 10 bytes.
	blockHandleMaxLenWithoutProperties = 10 + 10
	// blockHandleLikelyMaxLen can be used for pre-allocating buffers to
	// reduce memory copies. It is not guaranteed that a block handle will not
	// exceed this length.
	blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100

	// LevelDB footer: two block handles, padding, and an 8-byte magic.
	levelDBFooterLen   = 48
	levelDBMagic       = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
	levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)

	// RocksDB footer: 1-byte checksum type, two block handles (padded),
	// a 4-byte version, and an 8-byte magic.
	rocksDBFooterLen             = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8
	rocksDBMagic                 = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
	rocksDBMagicOffset           = rocksDBFooterLen - len(rocksDBMagic)
	rocksDBVersionOffset         = rocksDBMagicOffset - 4
	rocksDBExternalFormatVersion = 2

	// The Pebble magic string implies the same footer layout as RocksDBv2.
	pebbleDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳

	minFooterLen = levelDBFooterLen
	maxFooterLen = rocksDBFooterLen

	levelDBFormatVersion  = 0
	rocksDBFormatVersion2 = 2

	// Names of the meta blocks, used as keys in the metaindex block.
	metaRangeKeyName   = "pebble.range_key"
	metaPropertiesName = "rocksdb.properties"
	metaRangeDelName   = "rocksdb.range_del"
	metaRangeDelV2Name = "rocksdb.range_del2"

	// Index Types.
	// A space efficient index block that is optimized for binary-search-based
	// index.
	binarySearchIndex = 0
	// hashSearchIndex = 1
	// A two-level index implementation. Both levels are binary search indexes.
	twoLevelIndex = 2
	// binarySearchWithFirstKeyIndex = 3

	// RocksDB always includes this in the properties block. Since Pebble
	// doesn't use zstd compression, the string will always be the same.
	// This should be removed if we ever decide to diverge from the RocksDB
	// properties block.
	rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
)
// ChecksumType specifies the checksum used for blocks.
type ChecksumType byte

// The available checksum types. These values are part of the file format
// (the RocksDB/Pebble footer stores one as its first byte) and must not be
// changed.
const (
	ChecksumTypeNone     ChecksumType = 0
	ChecksumTypeCRC32c   ChecksumType = 1
	ChecksumTypeXXHash   ChecksumType = 2
	ChecksumTypeXXHash64 ChecksumType = 3
)
// String implements fmt.Stringer, returning a human-readable name for the
// checksum type. It panics on an unrecognized value.
func (t ChecksumType) String() string {
	switch t {
	case ChecksumTypeNone:
		return "none"
	case ChecksumTypeCRC32c:
		return "crc32c"
	case ChecksumTypeXXHash:
		return "xxhash"
	case ChecksumTypeXXHash64:
		return "xxhash64"
	}
	panic(errors.Newf("sstable: unknown checksum type: %d", t))
}
// blockType is the single byte stored in each block trailer, identifying the
// per-block compression format.
type blockType byte

const (
	// The block type gives the per-block compression format.
	// These constants are part of the file format and should not be changed.
	// They are different from the Compression constants because the latter
	// are designed so that the zero value of the Compression type means to
	// use the default compression (which is snappy).
	// Not all compression types listed here are supported.
	noCompressionBlockType     blockType = 0
	snappyCompressionBlockType blockType = 1
	zlibCompressionBlockType   blockType = 2
	bzip2CompressionBlockType  blockType = 3
	lz4CompressionBlockType    blockType = 4
	lz4hcCompressionBlockType  blockType = 5
	xpressCompressionBlockType blockType = 6
	zstdCompressionBlockType   blockType = 7
)
// String implements fmt.Stringer, returning a human-readable name for the
// block (compression) type. It panics on an unrecognized value.
//
// The cases use the named blockType constants rather than raw numbers so
// that this switch stays consistent with the constant declarations above
// (and with ChecksumType.String).
func (t blockType) String() string {
	switch t {
	case noCompressionBlockType:
		return "none"
	case snappyCompressionBlockType:
		return "snappy"
	case zlibCompressionBlockType:
		return "zlib"
	case bzip2CompressionBlockType:
		return "bzip2"
	case lz4CompressionBlockType:
		return "lz4"
	case lz4hcCompressionBlockType:
		return "lz4hc"
	case xpressCompressionBlockType:
		return "xpress"
	case zstdCompressionBlockType:
		return "zstd"
	default:
		panic(errors.Newf("sstable: unknown block type: %d", t))
	}
}
// legacy (LevelDB) footer format:
//    metaindex handle (varint64 offset, varint64 size)
//    index handle     (varint64 offset, varint64 size)
//    <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
//    table_magic_number (8 bytes)
// new (RocksDB) footer format:
//    checksum type (char, 1 byte)
//    metaindex handle (varint64 offset, varint64 size)
//    index handle     (varint64 offset, varint64 size)
//    <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
//    footer version (4 bytes)
//    table_magic_number (8 bytes)

// footer holds the parsed contents of a table footer.
type footer struct {
	// format is the table format, derived from the footer's magic number
	// and (for RocksDB/Pebble footers) its version field.
	format TableFormat
	// checksum is the checksum type used for blocks in this table. LevelDB
	// footers do not record one; CRC32c is implied.
	checksum ChecksumType
	// metaindexBH is the block handle of the metaindex block.
	metaindexBH BlockHandle
	// indexBH is the block handle of the (top-level) index block.
	indexBH BlockHandle
	// footerBH records the offset and length of the footer itself within
	// the file.
	footerBH BlockHandle
}
// readFooter reads and parses the footer at the end of f, determining the
// table format, the block checksum type, and the metaindex and index block
// handles. It returns a corruption error if the file is too small, the magic
// number is unrecognized, or a block handle is malformed.
func readFooter(f ReadableFile) (footer, error) {
	var footer footer
	stat, err := f.Stat()
	if err != nil {
		return footer, errors.Wrap(err, "pebble/table: invalid table (could not stat file)")
	}
	if stat.Size() < minFooterLen {
		return footer, base.CorruptionErrorf("pebble/table: invalid table (file size is too small)")
	}

	// Read the last maxFooterLen bytes of the file (or the whole file if it
	// is smaller than that).
	buf := make([]byte, maxFooterLen)
	off := stat.Size() - maxFooterLen
	if off < 0 {
		off = 0
	}
	n, err := f.ReadAt(buf, off)
	if err != nil && err != io.EOF {
		return footer, errors.Wrap(err, "pebble/table: invalid table (could not read footer)")
	}
	buf = buf[:n]

	// The final 8 bytes of the file are the magic number, which selects the
	// footer layout.
	switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) {
	case levelDBMagic:
		if len(buf) < levelDBFooterLen {
			return footer, base.CorruptionErrorf(
				"pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
		}
		footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen
		buf = buf[len(buf)-levelDBFooterLen:]
		footer.footerBH.Length = uint64(len(buf))
		footer.format = TableFormatLevelDB
		// LevelDB footers do not record a checksum type; CRC32c is implied.
		footer.checksum = ChecksumTypeCRC32c

	case rocksDBMagic, pebbleDBMagic:
		// NOTE: The Pebble magic string implies the same footer format as that used
		// by the RocksDBv2 table format.
		if len(buf) < rocksDBFooterLen {
			return footer, base.CorruptionErrorf("pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
		}
		footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen
		buf = buf[len(buf)-rocksDBFooterLen:]
		footer.footerBH.Length = uint64(len(buf))
		version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset])
		format, err := ParseTableFormat(magic, version)
		if err != nil {
			return footer, err
		}
		footer.format = format

		// The first byte of the footer is the checksum type.
		switch checksum := ChecksumType(buf[0]); checksum {
		case ChecksumTypeCRC32c:
			footer.checksum = ChecksumTypeCRC32c
		case ChecksumTypeXXHash64:
			footer.checksum = ChecksumTypeXXHash64
		default:
			// Report the checksum type actually read from the footer.
			// (Previously this printed footer.checksum, which is never
			// assigned on this path and was therefore always 0.)
			return footer, base.CorruptionErrorf("pebble/table: unsupported checksum type %d", errors.Safe(checksum))
		}
		buf = buf[1:]

	default:
		return footer, base.CorruptionErrorf("pebble/table: invalid table (bad magic number)")
	}

	{
		// Decode the metaindex and index block handles, validating that each
		// lies within the file.
		end := uint64(stat.Size())
		var n int
		footer.metaindexBH, n = decodeBlockHandle(buf)
		if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end {
			return footer, base.CorruptionErrorf("pebble/table: invalid table (bad metaindex block handle)")
		}
		buf = buf[n:]
		footer.indexBH, n = decodeBlockHandle(buf)
		if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end {
			return footer, base.CorruptionErrorf("pebble/table: invalid table (bad index block handle)")
		}
	}
	return footer, nil
}
// encode serializes f into buf (which must have capacity for the footer) and
// returns the encoded footer, sized to the format's footer length. The layout
// (LevelDB vs. RocksDB/Pebble) is selected by f.format; it panics on an
// unknown checksum type or an unspecified table format.
func (f footer) encode(buf []byte) []byte {
	magic, version := f.format.AsTuple()
	switch magic {
	case levelDBMagic:
		buf = buf[:levelDBFooterLen]
		// Zero the buffer so the padding bytes are deterministic.
		for i := range buf {
			buf[i] = 0
		}
		// Two block handles, padding, then the magic.
		n := encodeBlockHandle(buf[0:], f.metaindexBH)
		encodeBlockHandle(buf[n:], f.indexBH)
		copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic)

	case rocksDBMagic, pebbleDBMagic:
		buf = buf[:rocksDBFooterLen]
		for i := range buf {
			buf[i] = 0
		}
		// Byte 0 holds the checksum type; reject anything unrecognized.
		switch f.checksum {
		case ChecksumTypeNone, ChecksumTypeCRC32c, ChecksumTypeXXHash, ChecksumTypeXXHash64:
			buf[0] = byte(f.checksum)
		default:
			panic("unknown checksum type")
		}
		// Block handles follow the checksum byte; version and magic are
		// written at fixed offsets from the end.
		n := 1
		n += encodeBlockHandle(buf[n:], f.metaindexBH)
		encodeBlockHandle(buf[n:], f.indexBH)
		binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version)
		copy(buf[len(buf)-len(rocksDBMagic):], magic)

	default:
		panic("sstable: unspecified table format version")
	}
	return buf
}
// supportsTwoLevelIndex reports whether the table format can use a two-level
// index block layout. Of the known formats only LevelDB cannot; it panics on
// an unspecified format.
func supportsTwoLevelIndex(format TableFormat) bool {
	switch format {
	case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2:
		return true
	case TableFormatLevelDB:
		return false
	}
	panic("sstable: unspecified table format version")
}