diff --git a/GLOCKFILE b/GLOCKFILE
index 034c2b47282c..11e49d5a17d0 100644
--- a/GLOCKFILE
+++ b/GLOCKFILE
@@ -35,7 +35,7 @@ github.com/chzyer/test bea8f082b6fd8382588bf6fdc6af9217078af151
 github.com/client9/misspell dcb75ecbeec8f85e28b639c3a2f5b4c8a7b8a888
 github.com/cockroachdb/c-jemalloc 42e6a32cd7a4dff9c70d80323681d46d046181ef
 github.com/cockroachdb/c-protobuf 951f3e665896e7ba939fd1f2db9aeaae6ca988f8
-github.com/cockroachdb/c-rocksdb ab57b321a358f4c93fb639331ac3e10911649492
+github.com/cockroachdb/c-rocksdb ba8e37dfc825bedc3c0512f2c53dfd7cbbf8076d
 github.com/cockroachdb/c-snappy d4e7b428fe7fc09e93573df3448567a62df8c9fa
 github.com/cockroachdb/cmux b64f5908f4945f4b11ed4a0a9d3cc1e23350866d
 github.com/cockroachdb/cockroach-go 2e4a60d41697eebb308b1def89f0abaf1c056137
diff --git a/storage/engine/rocksdb/db.cc b/storage/engine/rocksdb/db.cc
index 47522d655bff..869d58961d85 100644
--- a/storage/engine/rocksdb/db.cc
+++ b/storage/engine/rocksdb/db.cc
@@ -39,6 +39,7 @@
 #include "db.h"
 #include "encoding.h"
 #include "eventlistener.h"
+#include "options_builder.h"
 
 #include
 
@@ -1984,13 +1985,11 @@ DBStatus DBEngineAddFile(DBEngine* db, DBSlice path) {
 
 struct DBSstFileWriter {
   std::unique_ptr<rocksdb::Options> options;
-  rocksdb::ImmutableCFOptions ioptions;
   rocksdb::SstFileWriter rep;
 
   DBSstFileWriter(rocksdb::Options* o)
       : options(o),
-        ioptions(*o),
-        rep(rocksdb::EnvOptions(), ioptions, o->comparator) {
+        rep(rocksdb::EnvOptions(), *o, o->comparator) {
   }
   virtual ~DBSstFileWriter() { }
 };
diff --git a/storage/engine/rocksdb/options_builder.cc b/storage/engine/rocksdb/options_builder.cc
new file mode 100644
index 000000000000..141cca51e14f
--- /dev/null
+++ b/storage/engine/rocksdb/options_builder.cc
@@ -0,0 +1,254 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// TODO(a-robinson,pmattis): Clean up or remove this logic. For the sake of
+// stability in the very short term, it's just been copied over verbatim
+// from the rocksdb code that was removed from their repo in
+// https://github.com/facebook/rocksdb/commit/b2973eaaebbaa2b410fd808dc2b1578ca8ab0c07
+// I've asked for recommendations in the original code review at
+// https://reviews.facebook.net/D59319
+//
+// Local changes from the upstream copy: degenerate (zero or tiny) sizes are
+// clamped instead of dividing by zero or taking the logarithm of zero.
+
+#include <math.h>
+#include <cmath>
+#include <algorithm>
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+namespace {
+
+// For now, always use 10 as the level bytes multiplier.
+const int kBytesForLevelMultiplier = 10;
+const size_t kBytesForOneMb = 1024 * 1024;
+
+// Returns how many write buffers of write_buffer_size fit into
+// target_db_size (integer division), but never less than 1. The callers feed
+// the result to a logarithm and convert that to int, so a quotient of 0 (or a
+// zero divisor) must not get through.
+uint64_t WriteBuffersPerDb(uint64_t target_db_size, size_t write_buffer_size) {
+  if (write_buffer_size == 0 || target_db_size < write_buffer_size) {
+    return 1;
+  }
+  return target_db_size / write_buffer_size;
+}
+
+// Picks between level and universal compaction.
+//   write_buffer_size: size in bytes of one mem table.
+//   read_amp_threshold / write_amp_threshold: read and write amplification
+//     the caller is comfortable with.
+//   target_db_size: estimated total DB size in bytes.
+// Always returns kCompactionStyleLevel when built with ROCKSDB_LITE.
+CompactionStyle PickCompactionStyle(size_t write_buffer_size,
+                                    int read_amp_threshold,
+                                    int write_amp_threshold,
+                                    uint64_t target_db_size) {
+#ifndef ROCKSDB_LITE
+  // Estimate read amplification and write amplification of two compaction
+  // styles. If there is hard limit to force a choice, make the choice.
+  // Otherwise, calculate a score based on threshold and expected value of
+  // two styles, weighing reads 4X more important than writes.
+  const uint64_t buffers_per_db =
+      WriteBuffersPerDb(target_db_size, write_buffer_size);
+  int expected_levels = static_cast<int>(
+      ceil(std::log(buffers_per_db) / std::log(kBytesForLevelMultiplier)));
+
+  int expected_max_files_universal =
+      static_cast<int>(ceil(log2(buffers_per_db)));
+
+  const int kEstimatedLevel0FilesInLevelStyle = 2;
+  // Estimate write amplification:
+  // (1) 1 for every L0 file
+  // (2) 2 for L1
+  // (3) kBytesForLevelMultiplier for the last level. It's really hard to
+  //     predict.
+  // (4) kBytesForLevelMultiplier for other levels.
+  int expected_write_amp_level = kEstimatedLevel0FilesInLevelStyle + 2 +
+                                 (expected_levels - 2) * kBytesForLevelMultiplier +
+                                 kBytesForLevelMultiplier;
+  int expected_read_amp_level =
+      kEstimatedLevel0FilesInLevelStyle + expected_levels;
+
+  int max_read_amp_uni = expected_max_files_universal;
+  if (read_amp_threshold <= max_read_amp_uni) {
+    return kCompactionStyleLevel;
+  } else if (write_amp_threshold <= expected_write_amp_level) {
+    return kCompactionStyleUniversal;
+  }
+
+  const double kReadWriteWeight = 4;
+
+  double level_ratio =
+      static_cast<double>(read_amp_threshold) / expected_read_amp_level *
+          kReadWriteWeight +
+      static_cast<double>(write_amp_threshold) / expected_write_amp_level;
+
+  int expected_write_amp_uni = expected_max_files_universal / 2 + 2;
+  int expected_read_amp_uni = expected_max_files_universal / 2 + 1;
+
+  double uni_ratio =
+      static_cast<double>(read_amp_threshold) / expected_read_amp_uni *
+          kReadWriteWeight +
+      static_cast<double>(write_amp_threshold) / expected_write_amp_uni;
+
+  if (level_ratio > uni_ratio) {
+    return kCompactionStyleLevel;
+  } else {
+    return kCompactionStyleUniversal;
+  }
+#else
+  return kCompactionStyleLevel;
+#endif  // !ROCKSDB_LITE
+}
+
+// Picks the mem table size and count for the given total memory budget.
+//   total_write_buffer_limit: budget in bytes for all mem tables together.
+//   options: receives write_buffer_size, max_write_buffer_number and
+//     min_write_buffer_number_to_merge.
+void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) {
+  const size_t kMaxWriteBufferSize = 128 * kBytesForOneMb;
+  const size_t kMinWriteBufferSize = 4 * kBytesForOneMb;
+
+  // Try to pick up a buffer size between 4MB and 128MB.
+  // And try to pick 4 as the total number of write buffers.
+  size_t write_buffer_size = total_write_buffer_limit / 4;
+  if (write_buffer_size > kMaxWriteBufferSize) {
+    write_buffer_size = kMaxWriteBufferSize;
+  } else if (write_buffer_size < kMinWriteBufferSize) {
+    write_buffer_size = std::min(static_cast<size_t>(kMinWriteBufferSize),
+                                 total_write_buffer_limit / 2);
+  }
+
+  // Round up to a multiple of 1MB.
+  if (write_buffer_size % kBytesForOneMb != 0) {
+    write_buffer_size =
+        (write_buffer_size / kBytesForOneMb + 1) * kBytesForOneMb;
+  }
+  // A budget below 2 bytes leaves the size at 0, which is already a multiple
+  // of 1MB. Fall back to a single 1MB buffer so the division below is defined.
+  if (write_buffer_size == 0) {
+    write_buffer_size = kBytesForOneMb;
+  }
+
+  options->write_buffer_size = write_buffer_size;
+  // The size was rounded up, so the quotient can be 0 for budgets under 1MB;
+  // always allow at least one write buffer.
+  options->max_write_buffer_number = std::max(
+      1, static_cast<int>(total_write_buffer_limit / write_buffer_size));
+  options->min_write_buffer_number_to_merge = 1;
+}
+
+#ifndef ROCKSDB_LITE
+// Sets the level-0 file-count triggers and max_open_files for universal
+// compaction.
+void OptimizeForUniversal(Options* options) {
+  options->level0_file_num_compaction_trigger = 2;
+  options->level0_slowdown_writes_trigger = 30;
+  options->level0_stop_writes_trigger = 40;
+  options->max_open_files = -1;
+}
+#endif
+
+// Optimize parameters for level-based compaction.
+//   read_amplification_threshold: comfortable value of read amplification;
+//     bounds the level-0 stop trigger.
+//   write_amplification_threshold: currently unused.
+//   target_db_size: estimated total DB size in bytes.
+//   options: write_buffer_size is read; the level-0 triggers, level sizing
+//     and target_file_size_base are written.
+void OptimizeForLevel(int read_amplification_threshold,
+                      int write_amplification_threshold,
+                      uint64_t target_db_size, Options* options) {
+  int expected_levels_one_level0_file = static_cast<int>(
+      ceil(std::log(WriteBuffersPerDb(target_db_size,
+                                      options->write_buffer_size)) /
+           std::log(kBytesForLevelMultiplier)));
+
+  int level0_stop_writes_trigger =
+      read_amplification_threshold - expected_levels_one_level0_file;
+
+  const size_t kInitialLevel0TotalSize = 128 * kBytesForOneMb;
+  const int kMaxFileNumCompactionTrigger = 4;
+  const int kMinLevel0StopTrigger = 3;
+
+  // write_buffer_size is non-zero here: PickWriteBufferSize never picks 0.
+  int file_num_buffer = static_cast<int>(
+      kInitialLevel0TotalSize / options->write_buffer_size + 1);
+
+  if (level0_stop_writes_trigger > file_num_buffer) {
+    // Have sufficient room for multiple level 0 files.
+    // Try to enlarge the buffer up to 1GB, if still have sufficient headroom.
+    file_num_buffer *=
+        1 << std::max(0, std::min(3, level0_stop_writes_trigger -
+                                         file_num_buffer - 2));
+
+    options->level0_stop_writes_trigger = level0_stop_writes_trigger;
+    options->level0_slowdown_writes_trigger = level0_stop_writes_trigger - 2;
+    options->level0_file_num_compaction_trigger =
+        std::min(kMaxFileNumCompactionTrigger, file_num_buffer / 2);
+  } else {
+    options->level0_stop_writes_trigger =
+        std::max(kMinLevel0StopTrigger, file_num_buffer);
+    options->level0_slowdown_writes_trigger =
+        options->level0_stop_writes_trigger - 1;
+    options->level0_file_num_compaction_trigger = 1;
+  }
+
+  // This doesn't consider compaction and overheads of mem tables. But usually
+  // it is in the same order of magnitude.
+  size_t expected_level0_compaction_size =
+      options->level0_file_num_compaction_trigger * options->write_buffer_size;
+  // Enlarge level1 target file size if level0 compaction size is larger.
+  uint64_t max_bytes_for_level_base = 10 * kBytesForOneMb;
+  if (expected_level0_compaction_size > max_bytes_for_level_base) {
+    max_bytes_for_level_base = expected_level0_compaction_size;
+  }
+  options->max_bytes_for_level_base = max_bytes_for_level_base;
+  // Now always set level multiplier to be 10
+  options->max_bytes_for_level_multiplier = kBytesForLevelMultiplier;
+
+  const uint64_t kMinFileSize = 2 * kBytesForOneMb;
+  // Allow at least 3-way parallelism for compaction between level 1 and 2.
+  uint64_t max_file_size = max_bytes_for_level_base / 3;
+  if (max_file_size < kMinFileSize) {
+    options->target_file_size_base = kMinFileSize;
+  } else {
+    if (max_file_size % kBytesForOneMb != 0) {
+      max_file_size = (max_file_size / kBytesForOneMb + 1) * kBytesForOneMb;
+    }
+    options->target_file_size_base = max_file_size;
+  }
+
+  // TODO: consider to tune num_levels too.
+}
+
+}  // namespace
+
+// Builds an Options with mem table and compaction settings tuned from the
+// given budget and thresholds; see options_builder.h for the parameters.
+Options GetOptions(size_t total_write_buffer_limit,
+                   int read_amplification_threshold,
+                   int write_amplification_threshold, uint64_t target_db_size) {
+  Options options;
+  PickWriteBufferSize(total_write_buffer_limit, &options);
+  size_t write_buffer_size = options.write_buffer_size;
+  options.compaction_style =
+      PickCompactionStyle(write_buffer_size, read_amplification_threshold,
+                          write_amplification_threshold, target_db_size);
+#ifndef ROCKSDB_LITE
+  if (options.compaction_style == kCompactionStyleUniversal) {
+    OptimizeForUniversal(&options);
+  } else {
+#else
+  {
+#endif  // !ROCKSDB_LITE
+    OptimizeForLevel(read_amplification_threshold,
+                     write_amplification_threshold, target_db_size, &options);
+  }
+  return options;
+}
+
+}  // namespace rocksdb
diff --git a/storage/engine/rocksdb/options_builder.h b/storage/engine/rocksdb/options_builder.h
new file mode 100644
index 000000000000..5df6eaa55373
--- /dev/null
+++ b/storage/engine/rocksdb/options_builder.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_BUILDER_H_
+#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_BUILDER_H_
+
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+// Get options based on some guidelines. Now only tune parameter based on
+// flush/compaction and fill default parameters for other parameters.
+// total_write_buffer_limit: budget for memory spent for mem tables
+// read_amplification_threshold: comfortable value of read amplification
+// write_amplification_threshold: comfortable value of write amplification.
+// target_db_size: estimated total DB size.
+extern Options GetOptions(size_t total_write_buffer_limit,
+                          int read_amplification_threshold = 8,
+                          int write_amplification_threshold = 32,
+                          uint64_t target_db_size = 68719476736 /* 64GB */);
+
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_BUILDER_H_