Skip to content

Commit

Permalink
Merge pull request cockroachdb#8815 from a-robinson/rocksdb
Browse files Browse the repository at this point in the history
rocksdb: Bump version to 4.9 and fix broken interface usage
  • Loading branch information
a-robinson authored Aug 25, 2016
2 parents 514206e + a5ce7dd commit 75921d4
Show file tree
Hide file tree
Showing 4 changed files with 247 additions and 4 deletions.
2 changes: 1 addition & 1 deletion GLOCKFILE
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ github.com/chzyer/test bea8f082b6fd8382588bf6fdc6af9217078af151
github.com/client9/misspell dcb75ecbeec8f85e28b639c3a2f5b4c8a7b8a888
github.com/cockroachdb/c-jemalloc 42e6a32cd7a4dff9c70d80323681d46d046181ef
github.com/cockroachdb/c-protobuf 951f3e665896e7ba939fd1f2db9aeaae6ca988f8
github.com/cockroachdb/c-rocksdb ab57b321a358f4c93fb639331ac3e10911649492
github.com/cockroachdb/c-rocksdb ba8e37dfc825bedc3c0512f2c53dfd7cbbf8076d
github.com/cockroachdb/c-snappy d4e7b428fe7fc09e93573df3448567a62df8c9fa
github.com/cockroachdb/cmux b64f5908f4945f4b11ed4a0a9d3cc1e23350866d
github.com/cockroachdb/cockroach-go 2e4a60d41697eebb308b1def89f0abaf1c056137
Expand Down
5 changes: 2 additions & 3 deletions storage/engine/rocksdb/db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "db.h"
#include "encoding.h"
#include "eventlistener.h"
#include "options_builder.h"

#include <iostream>

Expand Down Expand Up @@ -1984,13 +1985,11 @@ DBStatus DBEngineAddFile(DBEngine* db, DBSlice path) {

// Bundles a rocksdb::SstFileWriter together with the rocksdb::Options it was
// constructed from.
//
// NOTE(review): this is a diff hunk rendered without +/- markers. It appears
// to list both the removed lines (the `ioptions` member, its initializer and
// the `rep(..., ioptions, ...)` initializer) and the added
// `rep(..., *o, ...)` initializer -- confirm against the commit before
// treating the text below as compilable code.
struct DBSstFileWriter {
// Holds the Options pointer handed to the constructor; being a unique_ptr, it
// deletes the Options when this struct is destroyed. Declared before `rep`,
// so it is destroyed after `rep`.
std::unique_ptr<rocksdb::Options> options;
rocksdb::ImmutableCFOptions ioptions;
// The wrapped SST file writer.
rocksdb::SstFileWriter rep;

// Takes ownership of `o` (stored in `options`) and constructs `rep` from
// default EnvOptions, the given options and the options' comparator.
// Presumably `o` must be non-null since it is dereferenced here -- verify
// against callers.
DBSstFileWriter(rocksdb::Options* o)
: options(o),
ioptions(*o),
rep(rocksdb::EnvOptions(), ioptions, o->comparator) {
rep(rocksdb::EnvOptions(), *o, o->comparator) {
}
virtual ~DBSstFileWriter() { }
};
Expand Down
214 changes: 214 additions & 0 deletions storage/engine/rocksdb/options_builder.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// TODO(a-robinson,pmattis): Clean up or remove this logic. For the sake of
// stability in the very short term, it's just been copied over verbatim
// from the rocksdb code that was removed from their repo in
// https://github.com/facebook/rocksdb/commit/b2973eaaebbaa2b410fd808dc2b1578ca8ab0c07
// I've asked for recommendations in the original code review at
// https://reviews.facebook.net/D59319

#include <math.h>
#include <cmath>
#include <algorithm>
#include "rocksdb/options.h"

namespace rocksdb {

namespace {

// Size ratio between adjacent levels. For now this is always 10.
constexpr int kBytesForLevelMultiplier = 10;
// Number of bytes in one megabyte (2^20).
constexpr size_t kBytesForOneMb = static_cast<size_t>(1) << 20;

// Picks the compaction style (level or universal) for the given workload
// hints.
//
//   write_buffer_size:   size of a single write buffer, in bytes.
//   read_amp_threshold:  read amplification the caller is comfortable with.
//   write_amp_threshold: write amplification the caller is comfortable with.
//   target_db_size:      estimated total DB size, in bytes.
//
// Returns kCompactionStyleLevel or kCompactionStyleUniversal. When
// ROCKSDB_LITE is defined the answer is always kCompactionStyleLevel.
CompactionStyle PickCompactionStyle(size_t write_buffer_size,
                                    int read_amp_threshold,
                                    int write_amp_threshold,
                                    uint64_t target_db_size) {
#ifndef ROCKSDB_LITE
  // Estimate read amplification and write amplification of the two compaction
  // styles. If there is a hard limit that forces a choice, make the choice.
  // Otherwise, calculate a score based on the thresholds and the expected
  // values of the two styles, weighing reads 4X more important than writes.

  // How many write buffers fit in the target DB (integer division). Both the
  // divisor and the quotient are clamped to at least 1: a zero buffer size
  // would divide by zero, and a zero quotient would feed log(0) == -inf into
  // the int conversions below, which is undefined behavior.
  const uint64_t buffers_in_db = std::max<uint64_t>(
      1, target_db_size / std::max<uint64_t>(1, write_buffer_size));

  int expected_levels = static_cast<int>(ceil(
      std::log(buffers_in_db) / std::log(kBytesForLevelMultiplier)));

  int expected_max_files_universal =
      static_cast<int>(ceil(log2(buffers_in_db)));

  const int kEstimatedLevel0FilesInLevelStyle = 2;
  // Estimate write amplification:
  // (1) 1 for every L0 file
  // (2) 2 for L1
  // (3) kBytesForLevelMultiplier for the last level. It's really hard to
  //     predict.
  // (4) kBytesForLevelMultiplier for other levels.
  int expected_write_amp_level = kEstimatedLevel0FilesInLevelStyle + 2
      + (expected_levels - 2) * kBytesForLevelMultiplier
      + kBytesForLevelMultiplier;
  int expected_read_amp_level =
      kEstimatedLevel0FilesInLevelStyle + expected_levels;

  // Hard limits: universal cannot meet the read threshold, or level cannot
  // meet the write threshold.
  int max_read_amp_uni = expected_max_files_universal;
  if (read_amp_threshold <= max_read_amp_uni) {
    return kCompactionStyleLevel;
  } else if (write_amp_threshold <= expected_write_amp_level) {
    return kCompactionStyleUniversal;
  }

  const double kReadWriteWeight = 4;

  // Score of each style: headroom relative to the thresholds, with the read
  // term weighted by kReadWriteWeight. The higher score wins.
  double level_ratio =
      static_cast<double>(read_amp_threshold) / expected_read_amp_level *
          kReadWriteWeight +
      static_cast<double>(write_amp_threshold) / expected_write_amp_level;

  int expected_write_amp_uni = expected_max_files_universal / 2 + 2;
  int expected_read_amp_uni = expected_max_files_universal / 2 + 1;

  double uni_ratio =
      static_cast<double>(read_amp_threshold) / expected_read_amp_uni *
          kReadWriteWeight +
      static_cast<double>(write_amp_threshold) / expected_write_amp_uni;

  if (level_ratio > uni_ratio) {
    return kCompactionStyleLevel;
  } else {
    return kCompactionStyleUniversal;
  }
#else
  return kCompactionStyleLevel;
#endif  // !ROCKSDB_LITE
}

// Picks the write buffer (mem table) size and count for a total memory
// budget and stores them in *options.
//
//   total_write_buffer_limit: total memory budget for write buffers, in bytes.
//   options: receives write_buffer_size, max_write_buffer_number and
//            min_write_buffer_number_to_merge.
void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) {
  const size_t kMaxWriteBufferSize = 128 * kBytesForOneMb;
  const size_t kMinWriteBufferSize = 4 * kBytesForOneMb;

  // Try to pick up a buffer size between 4MB and 128MB.
  // And try to pick 4 as the total number of write buffers.
  size_t write_buffer_size = total_write_buffer_limit / 4;
  if (write_buffer_size > kMaxWriteBufferSize) {
    write_buffer_size = kMaxWriteBufferSize;
  } else if (write_buffer_size < kMinWriteBufferSize) {
    write_buffer_size =
        std::min(kMinWriteBufferSize, total_write_buffer_limit / 2);
  }

  if (write_buffer_size == 0) {
    // A budget below 2 bytes yields 0 above; fall back to the smallest
    // multiple of 1MB so the division below cannot divide by zero.
    write_buffer_size = kBytesForOneMb;
  } else if (write_buffer_size % kBytesForOneMb != 0) {
    // Round up to a multiple of 1MB.
    write_buffer_size =
        (write_buffer_size / kBytesForOneMb + 1) * kBytesForOneMb;
  }

  options->write_buffer_size = write_buffer_size;
  // Rounding up can make a single buffer larger than the whole budget, in
  // which case the quotient is 0; never report fewer than one write buffer.
  options->max_write_buffer_number = static_cast<int>(std::max<size_t>(
      1, total_write_buffer_limit / write_buffer_size));
  options->min_write_buffer_number_to_merge = 1;
}

#ifndef ROCKSDB_LITE
// Applies the fixed settings used when universal compaction was picked:
// the three level-0 file-count triggers and the open-file limit.
void OptimizeForUniversal(Options* options) {
  Options& opts = *options;

  // Level-0 triggers, from most to least severe.
  opts.level0_stop_writes_trigger = 40;
  opts.level0_slowdown_writes_trigger = 30;
  opts.level0_file_num_compaction_trigger = 2;

  // NOTE(review): -1 presumably means "no limit on open files" -- confirm
  // against the RocksDB options documentation.
  opts.max_open_files = -1;
}
#endif

// Optimizes parameters for level-based compaction and stores them in
// *options.
//
//   read_amplification_threshold:  comfortable value of read amplification;
//                                  drives the level-0 triggers.
//   write_amplification_threshold: currently unused by this function.
//   target_db_size:                estimated total DB size, in bytes.
//   options: options->write_buffer_size is read; the level-0 triggers,
//            max_bytes_for_level_base, max_bytes_for_level_multiplier and
//            target_file_size_base are written.
void OptimizeForLevel(int read_amplification_threshold,
                      int write_amplification_threshold,
                      uint64_t target_db_size, Options* options) {
  // Clamp the divisor and the quotient to at least 1: a zero write buffer
  // size would divide by zero, and a zero quotient would feed log(0) == -inf
  // into the int conversion below, which is undefined behavior.
  const uint64_t write_buffer_size =
      std::max<uint64_t>(1, options->write_buffer_size);
  const uint64_t buffers_in_db =
      std::max<uint64_t>(1, target_db_size / write_buffer_size);

  int expected_levels_one_level0_file = static_cast<int>(
      ceil(std::log(buffers_in_db) / std::log(kBytesForLevelMultiplier)));

  int level0_stop_writes_trigger =
      read_amplification_threshold - expected_levels_one_level0_file;

  const size_t kInitialLevel0TotalSize = 128 * kBytesForOneMb;
  const int kMaxFileNumCompactionTrigger = 4;
  const int kMinLevel0StopTrigger = 3;

  int file_num_buffer =
      static_cast<int>(kInitialLevel0TotalSize / write_buffer_size + 1);

  if (level0_stop_writes_trigger > file_num_buffer) {
    // Have sufficient room for multiple level 0 files.
    // Try to enlarge the buffer up to 1GB (a factor of at most 8), if there
    // is still sufficient headroom.
    file_num_buffer *=
        1 << std::max(0, std::min(3, level0_stop_writes_trigger -
                                         file_num_buffer - 2));

    options->level0_stop_writes_trigger = level0_stop_writes_trigger;
    options->level0_slowdown_writes_trigger = level0_stop_writes_trigger - 2;
    options->level0_file_num_compaction_trigger =
        std::min(kMaxFileNumCompactionTrigger, file_num_buffer / 2);
  } else {
    options->level0_stop_writes_trigger =
        std::max(kMinLevel0StopTrigger, file_num_buffer);
    options->level0_slowdown_writes_trigger =
        options->level0_stop_writes_trigger - 1;
    options->level0_file_num_compaction_trigger = 1;
  }

  // This doesn't consider compaction and overheads of mem tables. But usually
  // it is in the same order of magnitude.
  size_t expected_level0_compaction_size =
      options->level0_file_num_compaction_trigger * options->write_buffer_size;
  // Enlarge level1 target file size if level0 compaction size is larger.
  uint64_t max_bytes_for_level_base = 10 * kBytesForOneMb;
  if (expected_level0_compaction_size > max_bytes_for_level_base) {
    max_bytes_for_level_base = expected_level0_compaction_size;
  }
  options->max_bytes_for_level_base = max_bytes_for_level_base;
  // Now always set level multiplier to be 10
  options->max_bytes_for_level_multiplier = kBytesForLevelMultiplier;

  const uint64_t kMinFileSize = 2 * kBytesForOneMb;
  // Allow at least 3-way parallelism for compaction between level 1 and 2.
  uint64_t max_file_size = max_bytes_for_level_base / 3;
  if (max_file_size < kMinFileSize) {
    options->target_file_size_base = kMinFileSize;
  } else {
    // Round up to a multiple of 1MB.
    if (max_file_size % kBytesForOneMb != 0) {
      max_file_size = (max_file_size / kBytesForOneMb + 1) * kBytesForOneMb;
    }
    options->target_file_size_base = max_file_size;
  }

  // TODO: consider to tune num_levels too.
}

} // namespace

// Builds an Options object from coarse tuning hints: first sizes the write
// buffers, then picks a compaction style, then applies the tuning that
// matches the picked style. Builds with ROCKSDB_LITE defined always apply
// the level-style tuning.
Options GetOptions(size_t total_write_buffer_limit,
                   int read_amplification_threshold,
                   int write_amplification_threshold, uint64_t target_db_size) {
  Options result;
  PickWriteBufferSize(total_write_buffer_limit, &result);

  // The style decision is based on the buffer size chosen just above.
  result.compaction_style = PickCompactionStyle(
      result.write_buffer_size, read_amplification_threshold,
      write_amplification_threshold, target_db_size);

#ifndef ROCKSDB_LITE
  if (result.compaction_style == kCompactionStyleUniversal) {
    OptimizeForUniversal(&result);
    return result;
  }
#endif  // !ROCKSDB_LITE

  OptimizeForLevel(read_amplification_threshold, write_amplification_threshold,
                   target_db_size, &result);
  return result;
}

} // namespace rocksdb
30 changes: 30 additions & 0 deletions storage/engine/rocksdb/options_builder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_BUILDER_H_
#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_BUILDER_H_

#include "rocksdb/options.h"

namespace rocksdb {

// Returns Options derived from a few coarse guidelines. Only the parameters
// related to flush and compaction are tuned; every other parameter keeps its
// default value.
//
//   total_write_buffer_limit:      memory budget for mem tables, in bytes.
//   read_amplification_threshold:  comfortable value of read amplification.
//   write_amplification_threshold: comfortable value of write amplification.
//   target_db_size:                estimated total DB size, in bytes.
Options GetOptions(size_t total_write_buffer_limit,
                   int read_amplification_threshold = 8,
                   int write_amplification_threshold = 32,
                   uint64_t target_db_size = 64ULL << 30 /* 64GB */);


} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_BUILDER_H_

0 comments on commit 75921d4

Please sign in to comment.