Skip to content

Commit

Permalink
rocksdb: Bump version to 4.9 and fix broken interface usage
Browse files Browse the repository at this point in the history
They changed a couple of parts of their interfaces, which broke our code.
For now, I've taken the hideous approach of copying the old logic to
avoid modifying the options that we end up using, but will look for a
better solution tomorrow (and have asked their maintainers for a
recommendation as well on https://reviews.facebook.net/D59319).

Rushing this a bit because we think that 4.9 might fix a deadlock in
rocksdb that's regularly affecting us on GCE, and we'd like to verify
that assumption tomorrow morning and get it into this week's beta
release if it helps things.
  • Loading branch information
a-robinson committed Aug 25, 2016
1 parent 514206e commit a5ce7dd
Show file tree
Hide file tree
Showing 4 changed files with 247 additions and 4 deletions.
2 changes: 1 addition & 1 deletion GLOCKFILE
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ github.com/chzyer/test bea8f082b6fd8382588bf6fdc6af9217078af151
github.com/client9/misspell dcb75ecbeec8f85e28b639c3a2f5b4c8a7b8a888
github.com/cockroachdb/c-jemalloc 42e6a32cd7a4dff9c70d80323681d46d046181ef
github.com/cockroachdb/c-protobuf 951f3e665896e7ba939fd1f2db9aeaae6ca988f8
github.com/cockroachdb/c-rocksdb ab57b321a358f4c93fb639331ac3e10911649492
github.com/cockroachdb/c-rocksdb ba8e37dfc825bedc3c0512f2c53dfd7cbbf8076d
github.com/cockroachdb/c-snappy d4e7b428fe7fc09e93573df3448567a62df8c9fa
github.com/cockroachdb/cmux b64f5908f4945f4b11ed4a0a9d3cc1e23350866d
github.com/cockroachdb/cockroach-go 2e4a60d41697eebb308b1def89f0abaf1c056137
Expand Down
5 changes: 2 additions & 3 deletions storage/engine/rocksdb/db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "db.h"
#include "encoding.h"
#include "eventlistener.h"
#include "options_builder.h"

#include <iostream>

Expand Down Expand Up @@ -1984,13 +1985,11 @@ DBStatus DBEngineAddFile(DBEngine* db, DBSlice path) {

// Bundles a rocksdb::SstFileWriter with the rocksdb::Options it is built
// from. The unique_ptr member takes ownership of the Options pointer handed
// to the constructor, so the Options are destroyed together with this object.
struct DBSstFileWriter {
std::unique_ptr<rocksdb::Options> options;
rocksdb::ImmutableCFOptions ioptions;
rocksdb::SstFileWriter rep;

// Takes ownership of `o`. `o` is dereferenced while building the writer, so
// it presumably must be non-null -- confirm against callers.
DBSstFileWriter(rocksdb::Options* o)
: options(o),
ioptions(*o),
rep(rocksdb::EnvOptions(), ioptions, o->comparator) {
rep(rocksdb::EnvOptions(), *o, o->comparator) {
}
virtual ~DBSstFileWriter() { }
};
Expand Down
214 changes: 214 additions & 0 deletions storage/engine/rocksdb/options_builder.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// TODO(a-robinson,pmattis): Clean up or remove this logic. For the sake of
// stability in the very short term, it's just been copied over verbatim
// from the rocksdb code that was removed from their repo in
// https://github.com/facebook/rocksdb/commit/b2973eaaebbaa2b410fd808dc2b1578ca8ab0c07
// I've asked for recommendations in the original code review at
// https://reviews.facebook.net/D59319

#include <math.h>
#include <cmath>
#include <algorithm>
#include "rocksdb/options.h"

namespace rocksdb {

namespace {

// For now, always use 10 as the level bytes multiplier. It is used as the
// base of the logarithm when estimating the number of levels, and is stored
// into max_bytes_for_level_multiplier by OptimizeForLevel().
const int kBytesForLevelMultiplier = 10;
// Number of bytes in one MB (1024 * 1024); the size constants in this file
// are expressed as multiples of it.
const size_t kBytesForOneMb = 1024 * 1024;

// Pick compaction style.
//
// Estimates the read and write amplification of level-style and
// universal-style compaction from the ratio target_db_size /
// write_buffer_size, then picks the style that leaves more headroom under the
// given thresholds.
//
// write_buffer_size:   size of one write buffer, in bytes.
// read_amp_threshold:  comfortable value of read amplification.
// write_amp_threshold: comfortable value of write amplification.
// target_db_size:      estimated total DB size, in bytes.
//
// Returns kCompactionStyleLevel or kCompactionStyleUniversal. Builds with
// ROCKSDB_LITE defined always return kCompactionStyleLevel.
CompactionStyle PickCompactionStyle(size_t write_buffer_size,
                                    int read_amp_threshold,
                                    int write_amp_threshold,
                                    uint64_t target_db_size) {
#ifndef ROCKSDB_LITE
  // A zero write buffer size would divide by zero below. Fall back to level
  // style, which is also what the ROCKSDB_LITE build returns.
  if (write_buffer_size == 0) {
    return kCompactionStyleLevel;
  }

  // How many write buffers fit into the target DB size. Clamped to 1 so that
  // a target smaller than a single write buffer gives log(1) == 0 instead of
  // log(0) == -inf, whose conversion to int is undefined behavior.
  const uint64_t size_ratio =
      std::max<uint64_t>(1, target_db_size / write_buffer_size);

  // Estimate read amplification and write amplification of two compaction
  // styles. If there is a hard limit that forces a choice, make the choice.
  // Otherwise, calculate a score based on threshold and expected value of
  // two styles, weighing reads 4X more important than writes.
  int expected_levels = static_cast<int>(
      std::ceil(std::log(static_cast<double>(size_ratio)) /
                std::log(static_cast<double>(kBytesForLevelMultiplier))));

  int expected_max_files_universal =
      static_cast<int>(std::ceil(std::log2(static_cast<double>(size_ratio))));

  const int kEstimatedLevel0FilesInLevelStyle = 2;
  // Estimate write amplification:
  // (1) 1 for every L0 file
  // (2) 2 for L1
  // (3) kBytesForLevelMultiplier for the last level. It's really hard to
  //     predict.
  // (4) kBytesForLevelMultiplier for other levels.
  int expected_write_amp_level = kEstimatedLevel0FilesInLevelStyle + 2 +
                                 (expected_levels - 2) * kBytesForLevelMultiplier +
                                 kBytesForLevelMultiplier;
  int expected_read_amp_level =
      kEstimatedLevel0FilesInLevelStyle + expected_levels;

  // Hard limits: universal style cannot meet the read threshold, or level
  // style cannot meet the write threshold.
  int max_read_amp_uni = expected_max_files_universal;
  if (read_amp_threshold <= max_read_amp_uni) {
    return kCompactionStyleLevel;
  } else if (write_amp_threshold <= expected_write_amp_level) {
    return kCompactionStyleUniversal;
  }

  const double kReadWriteWeight = 4;

  // Score = threshold / expected amplification, with the read term weighted
  // by kReadWriteWeight. A larger score means more headroom.
  double level_ratio =
      static_cast<double>(read_amp_threshold) / expected_read_amp_level *
          kReadWriteWeight +
      static_cast<double>(write_amp_threshold) / expected_write_amp_level;

  int expected_write_amp_uni = expected_max_files_universal / 2 + 2;
  int expected_read_amp_uni = expected_max_files_universal / 2 + 1;

  double uni_ratio =
      static_cast<double>(read_amp_threshold) / expected_read_amp_uni *
          kReadWriteWeight +
      static_cast<double>(write_amp_threshold) / expected_write_amp_uni;

  if (level_ratio > uni_ratio) {
    return kCompactionStyleLevel;
  } else {
    return kCompactionStyleUniversal;
  }
#else
  return kCompactionStyleLevel;
#endif  // !ROCKSDB_LITE
}

// Pick mem table size.
//
// Derives write_buffer_size and max_write_buffer_number from the total memory
// budget for mem tables and stores them in *options, together with
// min_write_buffer_number_to_merge = 1.
//
// total_write_buffer_limit: budget for memory spent on mem tables, in bytes.
// options:                  receives the three fields above; must not be
//                           null (it is dereferenced unconditionally).
//
// Degenerate budgets (below 1MB, including 0) yield a 1MB write buffer and a
// buffer count of 1 instead of dividing by zero or reporting 0 buffers.
void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) {
  const size_t kMaxWriteBufferSize = 128 * kBytesForOneMb;
  const size_t kMinWriteBufferSize = 4 * kBytesForOneMb;

  // Try to pick up a buffer size between 4MB and 128MB.
  // And try to pick 4 as the total number of write buffers.
  size_t write_buffer_size = total_write_buffer_limit / 4;
  if (write_buffer_size > kMaxWriteBufferSize) {
    write_buffer_size = kMaxWriteBufferSize;
  } else if (write_buffer_size < kMinWriteBufferSize) {
    // Budget too small for four 4MB buffers: settle for two buffers, capped
    // at 4MB each.
    write_buffer_size =
        std::min(kMinWriteBufferSize, total_write_buffer_limit / 2);
  }

  // Round up to a multiple of 1MB.
  if (write_buffer_size % kBytesForOneMb != 0) {
    write_buffer_size =
        (write_buffer_size / kBytesForOneMb + 1) * kBytesForOneMb;
  }

  // A budget of 0 or 1 byte leaves the size at 0 here; use 1MB so the
  // division below is well defined.
  if (write_buffer_size == 0) {
    write_buffer_size = kBytesForOneMb;
  }

  options->write_buffer_size = write_buffer_size;
  // Rounding up can make a single buffer exceed the whole budget, in which
  // case the quotient is 0; always allow at least one write buffer.
  options->max_write_buffer_number = std::max(
      1, static_cast<int>(total_write_buffer_limit / write_buffer_size));
  options->min_write_buffer_number_to_merge = 1;
}

#ifndef ROCKSDB_LITE
// Fills in the fixed level-0 triggers and max_open_files value that
// GetOptions() uses when universal compaction has been picked. Only built
// when ROCKSDB_LITE is not defined.
void OptimizeForUniversal(Options* options) {
  Options& tuned = *options;
  tuned.max_open_files = -1;
  // Level-0 file counts at which compaction starts, writes slow down, and
  // writes stop, respectively.
  tuned.level0_file_num_compaction_trigger = 2;
  tuned.level0_slowdown_writes_trigger = 30;
  tuned.level0_stop_writes_trigger = 40;
}
#endif  // !ROCKSDB_LITE

// Optimize parameters for level-based compaction.
//
// Reads options->write_buffer_size and writes the level-0 triggers,
// max_bytes_for_level_base, max_bytes_for_level_multiplier and
// target_file_size_base into *options.
//
// read_amplification_threshold:  comfortable value of read amplification.
// write_amplification_threshold: currently not used by this function.
// target_db_size:                estimated total DB size, in bytes.
void OptimizeForLevel(int read_amplification_threshold,
                      int write_amplification_threshold,
                      uint64_t target_db_size, Options* options) {
  const size_t kInitialLevel0TotalSize = 128 * kBytesForOneMb;
  const int kMaxFileNumCompactionTrigger = 4;
  const int kMinLevel0StopTrigger = 3;
  const uint64_t kMinFileSize = 2 * kBytesForOneMb;

  // Levels expected below level 0 when level 0 holds a single file.
  const int levels_below_l0 = static_cast<int>(
      std::ceil(std::log(target_db_size / options->write_buffer_size) /
                std::log(kBytesForLevelMultiplier)));

  // Level-0 files the read amplification budget still leaves room for.
  const int l0_budget = read_amplification_threshold - levels_below_l0;

  // Number of write-buffer-sized files covering the initial level-0 size.
  int l0_files =
      static_cast<int>(kInitialLevel0TotalSize / options->write_buffer_size + 1);

  int stop_trigger;
  int slowdown_trigger;
  int compaction_trigger;
  if (l0_budget > l0_files) {
    // Have sufficient room for multiple level 0 files. Try to enlarge the
    // buffer up to 1GB (at most 8x), if there is still sufficient headroom.
    const int headroom = l0_budget - l0_files - 2;
    const int doublings = std::max(0, std::min(3, headroom));
    l0_files *= 1 << doublings;

    stop_trigger = l0_budget;
    slowdown_trigger = l0_budget - 2;
    compaction_trigger = std::min(kMaxFileNumCompactionTrigger, l0_files / 2);
  } else {
    stop_trigger = std::max(kMinLevel0StopTrigger, l0_files);
    slowdown_trigger = stop_trigger - 1;
    compaction_trigger = 1;
  }
  options->level0_stop_writes_trigger = stop_trigger;
  options->level0_slowdown_writes_trigger = slowdown_trigger;
  options->level0_file_num_compaction_trigger = compaction_trigger;

  // This doesn't consider compaction and overheads of mem tables. But usually
  // it is in the same order of magnitude.
  const size_t l0_compaction_bytes =
      compaction_trigger * options->write_buffer_size;
  // Level 1 gets at least 10MB, more if a level 0 compaction is larger.
  const uint64_t level_base_bytes =
      std::max<uint64_t>(10 * kBytesForOneMb, l0_compaction_bytes);
  options->max_bytes_for_level_base = level_base_bytes;
  // Now always set level multiplier to be 10
  options->max_bytes_for_level_multiplier = kBytesForLevelMultiplier;

  // Allow at least 3-way parallelism for compaction between level 1 and 2.
  uint64_t file_bytes = level_base_bytes / 3;
  if (file_bytes < kMinFileSize) {
    file_bytes = kMinFileSize;
  } else {
    // Round up to the next multiple of 1MB.
    const uint64_t remainder = file_bytes % kBytesForOneMb;
    if (remainder != 0) {
      file_bytes += kBytesForOneMb - remainder;
    }
  }
  options->target_file_size_base = file_bytes;

  // TODO: consider to tune num_levels too.
}

} // namespace

// Builds an Options object from a mem table memory budget, read and write
// amplification thresholds and an estimated DB size: picks the write buffer
// size, then the compaction style, then applies the tuning for that style.
// Fields not set by those steps keep the values of a default-constructed
// Options.
Options GetOptions(size_t total_write_buffer_limit,
                   int read_amplification_threshold,
                   int write_amplification_threshold, uint64_t target_db_size) {
  Options tuned;
  PickWriteBufferSize(total_write_buffer_limit, &tuned);
  tuned.compaction_style = PickCompactionStyle(
      tuned.write_buffer_size, read_amplification_threshold,
      write_amplification_threshold, target_db_size);

#ifndef ROCKSDB_LITE
  if (tuned.compaction_style == kCompactionStyleUniversal) {
    OptimizeForUniversal(&tuned);
    return tuned;
  }
#endif  // !ROCKSDB_LITE

  // Level-style tuning; the only path taken when ROCKSDB_LITE is defined.
  OptimizeForLevel(read_amplification_threshold, write_amplification_threshold,
                   target_db_size, &tuned);
  return tuned;
}

} // namespace rocksdb
30 changes: 30 additions & 0 deletions storage/engine/rocksdb/options_builder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_BUILDER_H_
#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_BUILDER_H_

#include "rocksdb/options.h"

namespace rocksdb {

// Get options based on some guidelines. For now this only tunes the
// parameters related to flush and compaction; all other parameters keep
// their default values.
// total_write_buffer_limit: budget for memory spent for mem tables, in bytes.
// read_amplification_threshold: comfortable value of read amplification
// (defaults to 8).
// write_amplification_threshold: comfortable value of write amplification
// (defaults to 32).
// target_db_size: estimated total DB size, in bytes (defaults to 64GB).
// Returns the tuned Options by value.
extern Options GetOptions(size_t total_write_buffer_limit,
int read_amplification_threshold = 8,
int write_amplification_threshold = 32,
uint64_t target_db_size = 68719476736 /* 64GB */);


} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_BUILDER_H_

0 comments on commit a5ce7dd

Please sign in to comment.