From bc43e5e94322da2e82351f2bc552e642011d19aa Mon Sep 17 00:00:00 2001 From: Jermy Li Date: Tue, 7 Jun 2022 11:21:32 +0800 Subject: [PATCH] bump up rocksdb version to 6.22 (#1742) * bump up rocksdb version to 6.22 * comment signed delete-range key test due to rocksdb issue #8239 * bump to rocksdb 7.2.2 * add memtable_whole_key_filtering option * add index_type option Change-Id: I357f0c2ced0c63d1e357b52f7397fb936255b411 --- hugegraph-rocksdb/pom.xml | 2 +- .../backend/store/rocksdb/RocksDBOptions.java | 265 +++++++++++++++++- .../store/rocksdb/RocksDBStdSessions.java | 176 +++++++++--- .../unit/rocksdb/RocksDBSessionTest.java | 12 +- 4 files changed, 404 insertions(+), 51 deletions(-) diff --git a/hugegraph-rocksdb/pom.xml b/hugegraph-rocksdb/pom.xml index 207dd7c34d..dbcfc66c07 100644 --- a/hugegraph-rocksdb/pom.xml +++ b/hugegraph-rocksdb/pom.xml @@ -20,7 +20,7 @@ org.rocksdb rocksdbjni - 6.10.2 + 7.2.2 diff --git a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java index f0a4df24e8..0bd4667ff2 100644 --- a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java +++ b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBOptions.java @@ -27,6 +27,8 @@ import org.rocksdb.CompactionStyle; import org.rocksdb.CompressionType; +import org.rocksdb.DataBlockIndexType; +import org.rocksdb.IndexType; import com.baidu.hugegraph.config.ConfigConvOption; import com.baidu.hugegraph.config.ConfigListConvOption; @@ -217,6 +219,15 @@ public static synchronized RocksDBOptions instance() { false ); + public static final ConfigOption SKIP_CHECK_SIZE_ON_DB_OPEN = + new ConfigOption<>( + "rocksdb.skip_check_sst_size_on_db_open", + "Whether to skip checking sizes of all sst files when " + + "opening the database.", + disallowEmpty(), + false + ); + public static final ConfigOption 
MAX_FILE_OPENING_THREADS = new ConfigOption<>( "rocksdb.max_file_opening_threads", @@ -235,6 +246,39 @@ public static synchronized RocksDBOptions instance() { 0L ); + public static final ConfigOption BYTES_PER_SYNC = + new ConfigOption<>( + "rocksdb.bytes_per_sync", + "Allows OS to incrementally sync SST files to disk while " + + "they are being written, asynchronously in the background. " + + "Issue one request for every bytes_per_sync written. " + + "0 turns it off.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + + public static final ConfigOption WAL_BYTES_PER_SYNC = + new ConfigOption<>( + "rocksdb.wal_bytes_per_sync", + "Allows OS to incrementally sync WAL files to disk while " + + "they are being written, asynchronously in the background. " + + "Issue one request for every bytes_per_sync written. " + + "0 turns it off.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + + public static final ConfigOption STRICT_BYTES_PER_SYNC = + new ConfigOption<>( + "rocksdb.strict_bytes_per_sync", + "When true, guarantees SST/WAL files have at most " + + "bytes_per_sync/wal_bytes_per_sync bytes submitted for " + + "writeback at any given time. This can be used to handle " + + "cases where processing speed exceeds I/O speed.", + disallowEmpty(), + false + ); + public static final ConfigOption DB_MEMTABLE_SIZE = new ConfigOption<>( "rocksdb.db_write_buffer_size", @@ -244,6 +288,35 @@ public static synchronized RocksDBOptions instance() { 0L ); + public static final ConfigOption LOG_READAHEAD_SIZE = + new ConfigOption<>( + "rocksdb.log_readahead_size", + "The number of bytes to prefetch when reading the log. " + + "0 means the prefetching is disabled.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + + public static final ConfigOption COMPACTION_READAHEAD_SIZE = + new ConfigOption<>( + "rocksdb.compaction_readahead_size", + "The number of bytes to perform bigger reads when doing " + + "compaction. If running RocksDB on spinning disks, " + + "you should set this to at least 2MB. 
" + + "0 means the prefetching is disabled.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + + public static final ConfigOption ROW_CACHE_CAPACITY = + new ConfigOption<>( + "rocksdb.row_cache_capacity", + "The capacity in bytes of global cache for table-level rows. " + + "0 means the row_cache is disabled.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + public static final ConfigOption DELETE_OBSOLETE_FILE_PERIOD = new ConfigOption<>( "rocksdb.delete_obsolete_files_period", @@ -280,11 +353,52 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption MAX_MEMTABLES_TO_MAINTAIN = new ConfigOption<>( "rocksdb.max_write_buffer_number_to_maintain", - "The total maximum number of write buffers to maintain in memory.", + "The total maximum number of write buffers to maintain in memory " + + "for conflict checking when transactions are used.", rangeInt(0, Integer.MAX_VALUE), 0 ); + public static final ConfigOption MEMTABLE_BLOOM_SIZE_RATIO = + new ConfigOption<>( + "rocksdb.memtable_bloom_size_ratio", + "If prefix-extractor is set and memtable_bloom_size_ratio " + + "is not 0, or if memtable_whole_key_filtering is set true, " + + "create bloom filter for memtable with the size of " + + "write_buffer_size * memtable_bloom_size_ratio. " + + "If it is larger than 0.25, it is santinized to 0.25.", + rangeDouble(0.0, 1.0), + 0.0 + ); + + public static final ConfigOption MEMTABLE_BLOOM_WHOLE_KEY_FILTERING = + new ConfigOption<>( + "rocksdb.memtable_whole_key_filtering", + "Enable whole key bloom filter in memtable, it can " + + "potentially reduce CPU usage for point-look-ups. Note " + + "this will only take effect if memtable_bloom_size_ratio > 0.", + disallowEmpty(), + false + ); + + public static final ConfigOption MEMTABL_BLOOM_HUGE_PAGE_SIZE = + new ConfigOption<>( + "rocksdb.memtable_huge_page_size", + "The page size for huge page TLB for bloom in memtable. 
" + + "If <= 0, not allocate from huge page TLB but from malloc.", + rangeInt(0L, Long.MAX_VALUE), + 0L + ); + + public static final ConfigOption MEMTABLE_INPLACE_UPDATE_SUPPORT = + new ConfigOption<>( + "rocksdb.inplace_update_support", + "Allows thread-safe inplace updates if a put key exists " + + "in current memtable and sizeof new value is smaller.", + disallowEmpty(), + false + ); + public static final ConfigOption DYNAMIC_LEVEL_BYTES = new ConfigOption<>( "rocksdb.level_compaction_dynamic_level_bytes", @@ -404,6 +518,89 @@ public static synchronized RocksDBOptions instance() { false ); + public static final ConfigOption USE_FSYNC = + new ConfigOption<>( + "rocksdb.use_fsync", + "If true, then every store to stable storage will issue a fsync.", + disallowEmpty(), + false + ); + + public static final ConfigOption ATOMIC_FLUSH = + new ConfigOption<>( + "rocksdb.atomic_flush", + "If true, flushing multiple column families and committing " + + "their results atomically to MANIFEST. Note that it's not " + + "necessary to set atomic_flush=true if WAL is always enabled.", + disallowEmpty(), + false + ); + + public static final ConfigOption TABLE_FORMAT_VERSION = + new ConfigOption<>( + "rocksdb.format_version", + "The format version of BlockBasedTable, allowed values are 0~5.", + rangeInt(0, 5), + 5 + ); + + public static final ConfigConvOption INDEX_TYPE = + new ConfigConvOption<>( + "rocksdb.index_type", + "The index type used to lookup between data blocks " + + "with the sst table, allowed values are [kBinarySearch," + + "kHashSearch,kTwoLevelIndexSearch,kBinarySearchWithFirstKey].", + allowValues("kBinarySearch", "kHashSearch", + "kTwoLevelIndexSearch", "kBinarySearchWithFirstKey"), + IndexType::valueOf, + "kBinarySearch" + ); + + public static final ConfigConvOption DATA_BLOCK_SEARCH_TYPE = + new ConfigConvOption<>( + "rocksdb.data_block_index_type", + "The search type used to point lookup in data block with " + + "the sst table, allowed values are 
[kDataBlockBinarySearch," + + "kDataBlockBinaryAndHash].", + allowValues("kDataBlockBinarySearch", "kDataBlockBinaryAndHash"), + DataBlockIndexType::valueOf, + "kDataBlockBinarySearch" + ); + + public static final ConfigOption DATA_BLOCK_HASH_TABLE_RATIO = + new ConfigOption<>( + "rocksdb.data_block_hash_table_util_ratio", + "The hash table utilization ratio value of entries/buckets. " + + "It is valid only when data_block_index_type=kDataBlockBinaryAndHash.", + rangeDouble(0.0, 1.0), + 0.75 + ); + + public static final ConfigOption BLOCK_SIZE = + new ConfigOption<>( + "rocksdb.block_size", + "Approximate size of user data packed per block, Note that " + + "it corresponds to uncompressed data.", + rangeInt(0L, Long.MAX_VALUE), + 4L * Bytes.KB + ); + + public static final ConfigOption BLOCK_SIZE_DEVIATION = + new ConfigOption<>( + "rocksdb.block_size_deviation", + "The percentage of free space used to close a block.", + rangeInt(0, 100), + 10 + ); + + public static final ConfigOption BLOCK_RESTART_INTERVAL = + new ConfigOption<>( + "rocksdb.block_restart_interval", + "The block restart interval for delta encoding in blocks.", + rangeInt(0, Integer.MAX_VALUE), + 16 + ); + public static final ConfigOption BLOCK_CACHE_CAPACITY = new ConfigOption<>( "rocksdb.block_cache_capacity", @@ -413,28 +610,31 @@ public static synchronized RocksDBOptions instance() { 8L * Bytes.MB ); - public static final ConfigOption PIN_L0_FILTER_AND_INDEX_IN_CACHE = + public static final ConfigOption CACHE_FILTER_AND_INDEX = new ConfigOption<>( - "rocksdb.pin_l0_filter_and_index_blocks_in_cache", - "Indicating if we'd put index/filter blocks to the block cache.", + "rocksdb.cache_index_and_filter_blocks", + "Set this option true if we'd put index/filter blocks to " + + "the block cache.", disallowEmpty(), - false + true ); - public static final ConfigOption PUT_FILTER_AND_INDEX_IN_CACHE = + public static final ConfigOption PIN_L0_INDEX_AND_FILTER = new ConfigOption<>( - 
"rocksdb.cache_index_and_filter_blocks", - "Indicating if we'd put index/filter blocks to the block cache.", + "rocksdb.pin_l0_filter_and_index_blocks_in_cache", + "Set this option true if we'd pin L0 index/filter blocks to " + + "the block cache.", disallowEmpty(), - false + true ); public static final ConfigOption BLOOM_FILTER_BITS_PER_KEY = new ConfigOption<>( "rocksdb.bloom_filter_bits_per_key", "The bits per key in bloom filter, a good value is 10, " + - "which yields a filter with ~ 1% false positive rate, " + - "-1 means no bloom filter.", + "which yields a filter with ~ 1% false positive rate. " + + "Set bloom_filter_bits_per_key > 0 to enable bloom filter, " + + "-1 means no bloom filter (0~0.5 round down to no filter).", rangeInt(-1, Integer.MAX_VALUE), -1 ); @@ -442,7 +642,8 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption BLOOM_FILTER_MODE = new ConfigOption<>( "rocksdb.bloom_filter_block_based_mode", - "Use block based filter rather than full filter.", + "If bloom filter is enabled, set this option true to " + + "use block based filter rather than full filter.", disallowEmpty(), false ); @@ -450,8 +651,9 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption BLOOM_FILTER_WHOLE_KEY = new ConfigOption<>( "rocksdb.bloom_filter_whole_key_filtering", - "True if place whole keys in the bloom filter, " + - "else place the prefix of keys.", + "If bloom filter is enabled, set this option true to " + + "place whole keys in the bloom filter, else place the " + + "prefix of keys when prefix-extractor is set.", disallowEmpty(), true ); @@ -459,8 +661,41 @@ public static synchronized RocksDBOptions instance() { public static final ConfigOption BLOOM_FILTERS_SKIP_LAST_LEVEL = new ConfigOption<>( "rocksdb.optimize_filters_for_hits", - "This flag allows us to not store filters for the last level.", + "If bloom filter is enabled, this flag allows us to not " + + "store filters for the last 
level. set this option true to " + + "optimize the filters mainly for cases where keys are found " + + "rather than also optimize for keys missed.", + disallowEmpty(), + true + ); + + public static final ConfigOption PARTITION_FILTERS_INDEXES = + new ConfigOption<>( + "rocksdb.partition_filters_and_indexes", + "If bloom filter is enabled, set this option true to use " + + "partitioned full filters and indexes for each sst file. " + + "This option is incompatible with block-based filters.", disallowEmpty(), false ); + + public static final ConfigOption PIN_TOP_INDEX_AND_FILTER = + new ConfigOption<>( + "rocksdb.pin_top_level_index_and_filter", + "If partition_filters_and_indexes is set true, set this " + + "option true if we'd pin top-level index of partitioned " + + "filter and index blocks to the block cache.", + disallowEmpty(), + true + ); + + public static final ConfigOption PREFIX_EXTRACTOR_CAPPED = + new ConfigOption<>( + "rocksdb.prefix_extractor_n_bytes", + "The prefix-extractor uses the first N bytes of a key as its prefix, " + + "it will use the full key when a key is shorter than the N. 
" + + "0 means unset prefix-extractor.", + rangeInt(0, Integer.MAX_VALUE), + 0 + ); } diff --git a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java index 75766e5daf..06652152cd 100644 --- a/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java +++ b/hugegraph-rocksdb/src/main/java/com/baidu/hugegraph/backend/store/rocksdb/RocksDBStdSessions.java @@ -44,6 +44,7 @@ import org.rocksdb.DBOptions; import org.rocksdb.DBOptionsInterface; import org.rocksdb.Env; +import org.rocksdb.IndexType; import org.rocksdb.InfoLogLevel; import org.rocksdb.LRUCache; import org.rocksdb.MutableColumnFamilyOptionsInterface; @@ -53,6 +54,7 @@ import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; import org.rocksdb.SstFileManager; +import org.rocksdb.TableFormatConfig; import org.rocksdb.WriteBatch; import org.rocksdb.WriteOptions; import org.slf4j.Logger; @@ -475,6 +477,8 @@ public static void initOptions(HugeConfig conf, * should we use options.setCreateMissingColumnFamilies()? 
*/ db.setCreateIfMissing(true); + db.setWriteDbidToManifest(true); + db.setAvoidUnnecessaryBlockingIO(true); // Optimize RocksDB if (optimize) { @@ -500,16 +504,30 @@ public static void initOptions(HugeConfig conf, db.setUseDirectIoForFlushAndCompaction( conf.get(RocksDBOptions.USE_DIRECT_READS_WRITES_FC)); + db.setUseFsync(conf.get(RocksDBOptions.USE_FSYNC)); + + db.setAtomicFlush(conf.get(RocksDBOptions.ATOMIC_FLUSH)); + db.setMaxManifestFileSize( conf.get(RocksDBOptions.MAX_MANIFEST_FILE_SIZE)); db.setSkipStatsUpdateOnDbOpen( conf.get(RocksDBOptions.SKIP_STATS_UPDATE_ON_DB_OPEN)); + db.setSkipCheckingSstFileSizesOnDbOpen( + conf.get(RocksDBOptions.SKIP_CHECK_SIZE_ON_DB_OPEN)); db.setMaxFileOpeningThreads( conf.get(RocksDBOptions.MAX_FILE_OPENING_THREADS)); db.setDbWriteBufferSize(conf.get(RocksDBOptions.DB_MEMTABLE_SIZE)); + + db.setLogReadaheadSize(conf.get(RocksDBOptions.LOG_READAHEAD_SIZE)); + + // A global cache for table-level rows + long cacheCapacity = conf.get(RocksDBOptions.ROW_CACHE_CAPACITY); + if (cacheCapacity > 0) { + db.setRowCache(new LRUCache(cacheCapacity)); + } } if (mdb != null) { @@ -527,6 +545,14 @@ public static void initOptions(HugeConfig conf, mdb.setMaxTotalWalSize(conf.get(RocksDBOptions.MAX_TOTAL_WAL_SIZE)); + mdb.setBytesPerSync(conf.get(RocksDBOptions.BYTES_PER_SYNC)); + mdb.setWalBytesPerSync(conf.get(RocksDBOptions.WAL_BYTES_PER_SYNC)); + mdb.setStrictBytesPerSync( + conf.get(RocksDBOptions.STRICT_BYTES_PER_SYNC)); + + mdb.setCompactionReadaheadSize( + conf.get(RocksDBOptions.COMPACTION_READAHEAD_SIZE)); + mdb.setDeleteObsoleteFilesPeriodMicros(1000000 * conf.get(RocksDBOptions.DELETE_OBSOLETE_FILE_PERIOD)); } @@ -563,37 +589,23 @@ public static void initOptions(HugeConfig conf, cf.setMaxWriteBufferNumberToMaintain( conf.get(RocksDBOptions.MAX_MEMTABLES_TO_MAINTAIN)); + cf.setInplaceUpdateSupport( + conf.get(RocksDBOptions.MEMTABLE_INPLACE_UPDATE_SUPPORT)); + cf.setLevelCompactionDynamicLevelBytes( 
conf.get(RocksDBOptions.DYNAMIC_LEVEL_BYTES)); - // https://github.com/facebook/rocksdb/wiki/Block-Cache - BlockBasedTableConfig tableConfig = new BlockBasedTableConfig(); - long cacheCapacity = conf.get(RocksDBOptions.BLOCK_CACHE_CAPACITY); - if (cacheCapacity <= 0L) { - // Bypassing bug https://github.com/facebook/rocksdb/pull/5465 - tableConfig.setNoBlockCache(true); - } else { - tableConfig.setBlockCache(new LRUCache(cacheCapacity)); - } - tableConfig.setPinL0FilterAndIndexBlocksInCache( - conf.get(RocksDBOptions.PIN_L0_FILTER_AND_INDEX_IN_CACHE)); - tableConfig.setCacheIndexAndFilterBlocks( - conf.get(RocksDBOptions.PUT_FILTER_AND_INDEX_IN_CACHE)); - - // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter - int bitsPerKey = conf.get(RocksDBOptions.BLOOM_FILTER_BITS_PER_KEY); - if (bitsPerKey >= 0) { - boolean blockBased = conf.get(RocksDBOptions.BLOOM_FILTER_MODE); - tableConfig.setFilterPolicy(new BloomFilter(bitsPerKey, - blockBased)); - } - tableConfig.setWholeKeyFiltering( - conf.get(RocksDBOptions.BLOOM_FILTER_WHOLE_KEY)); - cf.setTableFormatConfig(tableConfig); - cf.setOptimizeFiltersForHits( conf.get(RocksDBOptions.BLOOM_FILTERS_SKIP_LAST_LEVEL)); + cf.setTableFormatConfig(initTableConfig(conf)); + + // CappedPrefixExtractor uses the first N bytes + int prefixLength = conf.get(RocksDBOptions.PREFIX_EXTRACTOR_CAPPED); + if (prefixLength > 0) { + cf.useCappedPrefixExtractor(prefixLength); + } + // https://github.com/facebook/rocksdb/tree/master/utilities/merge_operators cf.setMergeOperatorName("uint64add"); // uint64add/stringappend } @@ -626,6 +638,20 @@ public static void initOptions(HugeConfig conf, mcf.setHardPendingCompactionBytesLimit( conf.get(RocksDBOptions.HARD_PENDING_COMPACTION_LIMIT)); + /* + * TODO: also set memtable options: + * memtable_insert_with_hint_prefix_extractor + * The reason why use option name `memtable_bloom_size_ratio`: + * https://github.com/facebook/rocksdb/pull/9453/files + * 
#diff-cde52d1fcbcce2bc6aae27838f1d3e7e9e469ccad8aaf8f2695f939e279d7501R369 + */ + mcf.setMemtablePrefixBloomSizeRatio( + conf.get(RocksDBOptions.MEMTABLE_BLOOM_SIZE_RATIO)); + mcf.setMemtableWholeKeyFiltering( + conf.get(RocksDBOptions.MEMTABLE_BLOOM_WHOLE_KEY_FILTERING)); + mcf.setMemtableHugePageSize( + conf.get(RocksDBOptions.MEMTABL_BLOOM_HUGE_PAGE_SIZE)); + boolean bulkload = conf.get(RocksDBOptions.BULKLOAD_MODE); if (bulkload) { // Disable automatic compaction @@ -645,6 +671,79 @@ public static void initOptions(HugeConfig conf, } } + public static TableFormatConfig initTableConfig(HugeConfig conf) { + BlockBasedTableConfig tableConfig = new BlockBasedTableConfig(); + + tableConfig.setFormatVersion( + conf.get(RocksDBOptions.TABLE_FORMAT_VERSION)); + + /* + * The index type used to lookup between data blocks: + * https://github.com/facebook/rocksdb/wiki/Index-Block-Format + * + * TODO: support more index options: + * tableConfig.setIndexShortening(IndexShorteningMode.kShortenSeparators); + * tableConfig.setEnableIndexCompression(true); + * tableConfig.setIndexBlockRestartInterval(1); + */ + tableConfig.setIndexType(conf.get(RocksDBOptions.INDEX_TYPE)); + + /* + * The search type of point lookup can be BinarySearch or HashSearch: + * https://github.com/facebook/rocksdb/wiki/Data-Block-Hash-Index + */ + tableConfig.setDataBlockIndexType( + conf.get(RocksDBOptions.DATA_BLOCK_SEARCH_TYPE)); + tableConfig.setDataBlockHashTableUtilRatio( + conf.get(RocksDBOptions.DATA_BLOCK_HASH_TABLE_RATIO)); + + long blockSize = conf.get(RocksDBOptions.BLOCK_SIZE); + tableConfig.setBlockSize(blockSize); + tableConfig.setBlockSizeDeviation( + conf.get(RocksDBOptions.BLOCK_SIZE_DEVIATION)); + tableConfig.setBlockRestartInterval( + conf.get(RocksDBOptions.BLOCK_RESTART_INTERVAL)); + + // https://github.com/facebook/rocksdb/wiki/Block-Cache + long cacheCapacity = conf.get(RocksDBOptions.BLOCK_CACHE_CAPACITY); + if (cacheCapacity <= 0L) { + // Bypassing bug 
https://github.com/facebook/rocksdb/pull/5465 + tableConfig.setNoBlockCache(true); + } else { + tableConfig.setBlockCache(new LRUCache(cacheCapacity)); + } + + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter + int bitsPerKey = conf.get(RocksDBOptions.BLOOM_FILTER_BITS_PER_KEY); + if (bitsPerKey >= 0) { + // TODO: use space-saving RibbonFilterPolicy + boolean blockBased = conf.get(RocksDBOptions.BLOOM_FILTER_MODE); + tableConfig.setFilterPolicy(new BloomFilter(bitsPerKey, + blockBased)); + + tableConfig.setWholeKeyFiltering( + conf.get(RocksDBOptions.BLOOM_FILTER_WHOLE_KEY)); + + tableConfig.setCacheIndexAndFilterBlocks( + conf.get(RocksDBOptions.CACHE_FILTER_AND_INDEX)); + tableConfig.setPinL0FilterAndIndexBlocksInCache( + conf.get(RocksDBOptions.PIN_L0_INDEX_AND_FILTER)); + + // https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters + if (conf.get(RocksDBOptions.PARTITION_FILTERS_INDEXES)) { + // Enable partitioned indexes and partitioned filters + tableConfig.setPartitionFilters(true) + .setIndexType(IndexType.kTwoLevelIndexSearch) + .setMetadataBlockSize(blockSize) + .setCacheIndexAndFilterBlocksWithHighPriority(true); + tableConfig.setPinTopLevelIndexAndFilter( + conf.get(RocksDBOptions.PIN_TOP_INDEX_AND_FILTER)); + } + } + + return tableConfig; + } + public static final byte[] encode(String string) { return StringEncoding.encode(string); } @@ -662,13 +761,13 @@ private final class StdSession extends RocksDBSessions.Session { private WriteOptions writeOptions; public StdSession(HugeConfig conf) { - boolean raftMode = conf.get(CoreOptions.RAFT_MODE); this.batch = new WriteBatch(); this.writeOptions = new WriteOptions(); /* * When work under raft mode. 
if store crashed, the state-machine * can restore by snapshot + raft log, doesn't need wal and sync */ + boolean raftMode = conf.get(CoreOptions.RAFT_MODE); if (raftMode) { this.writeOptions.setDisableWAL(true); this.writeOptions.setSync(false); @@ -938,9 +1037,13 @@ public BackendColumnIterator scan(String table, byte[] prefix) { assert !this.hasChanges(); /* * NOTE: Options.prefix_extractor is a prerequisite for - * Options.setPrefixSameAsStart(true): - * ReadOptions options = new ReadOptions(); - * options.setPrefixSameAsStart(true); + * optimized prefix seek, if Options.prefix_extractor is enabled, + * can setPrefixSameAsStart(true) or setAutoPrefixMode(true): + * ReadOptions options = new ReadOptions(); + * options.setPrefixSameAsStart(true); + * or + * options.setAutoPrefixMode(true); + * options.setIterateUpperBound(prefix + 1); */ try (CFHandle cf = cf(table)) { ReusedRocksIterator iter = cf.newIterator(); @@ -957,9 +1060,14 @@ public BackendColumnIterator scan(String table, byte[] keyFrom, byte[] keyTo, int scanType) { assert !this.hasChanges(); /* - * Not sure if setTotalOrderSeek(true) must be set: - * ReadOptions options = new ReadOptions(); - * options.setTotalOrderSeek(true); + * NOTE: if Options.prefix_extractor is enabled, need to + * setTotalOrderSeek(true) or setAutoPrefixMode(true) to make + * page-seek or shard-scan return right results: + * ReadOptions options = new ReadOptions(); + * options.setTotalOrderSeek(true); + * or + * options.setAutoPrefixMode(true); + * options.setIterateUpperBound(keyTo); */ try (CFHandle cf = cf(table)) { ReusedRocksIterator iter = cf.newIterator(); @@ -1059,7 +1167,7 @@ private boolean match(int expected) { @SuppressWarnings("unused") private void dump() { this.seek(); - LOG.info(">>>> scan from {}: {}{}", + LOG.info(">>>> scan from {}: {}{}", this.table, this.keyBegin == null ? "*" : StringEncoding.format(this.keyBegin), this.iter.isValid() ? 
"" : " - No data"); @@ -1097,7 +1205,7 @@ public boolean hasNext() { } private void seek() { - if (this.keyBegin == null) { + if (this.keyBegin == null || this.keyBegin.length <= 0) { // Seek to the first if no `keyBegin` this.iter.seekToFirst(); } else { diff --git a/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/rocksdb/RocksDBSessionTest.java b/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/rocksdb/RocksDBSessionTest.java index 2e870d858e..3abc3c936e 100644 --- a/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/rocksdb/RocksDBSessionTest.java +++ b/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/rocksdb/RocksDBSessionTest.java @@ -519,9 +519,19 @@ public void testDeleteByRangeWithSignedBytes() throws RocksDBException { byte[] value21 = getBytes("value-2-1"); session.put(TABLE, key21, value21); - session.deleteRange(TABLE, new byte[]{1, -3}, new byte[]{1, 3}); this.commit(); + // TODO: enable after fixed rocksdb issue #8239 + /* + session.deleteRange(TABLE, new byte[]{1, -3}, new byte[]{1, 3}); + Assert.assertThrows(BackendException.class, () -> { + this.commit(); + }, e -> { + Assert.assertContains("end key comes before start key", + e.getCause().getMessage()); + }); + */ + Assert.assertArrayEquals(value11, session.get(TABLE, key11)); Assert.assertArrayEquals(value12, session.get(TABLE, key12)); Assert.assertArrayEquals(value21, session.get(TABLE, key21));