Skip to content

Commit

Permalink
change(state): Set iterator read bounds where possible in DiskDb (#7731)
Browse files Browse the repository at this point in the history
* Adds zs_iter_opts method

* uses checked_add

* uses zs_range_iter for other read methods

* fixes bug

* Fixes bug in zs_iter_opts

* Adds test & updates method docs

* updates docs

* Update zebra-state/src/service/finalized_state/disk_db/tests.rs

* Corrects code comment

* adds support for variable-sized keys

* adds test case

* Updates docs

* Applies suggestions from code review

* Add extra checks

* Fix test code and rustfmt

* fixes test

* fixes test

* adds reverse arg to new zs_range_iter calls

---------

Co-authored-by: teor <[email protected]>
  • Loading branch information
arya2 and teor2345 authored Oct 20, 2023
1 parent ab3ce9a commit 56e6305
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 52 deletions.
125 changes: 82 additions & 43 deletions zebra-state/src/service/finalized_state/disk_db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use std::{
use itertools::Itertools;
use rlimit::increase_nofile_limit;

use rocksdb::ReadOptions;
use zebra_chain::parameters::Network;

use crate::{
Expand Down Expand Up @@ -194,7 +195,7 @@ pub trait ReadDisk {
fn zs_first_key_value<C, K, V>(&self, cf: &C) -> Option<(K, V)>
where
C: rocksdb::AsColumnFamilyRef,
K: FromDisk,
K: IntoDisk + FromDisk,
V: FromDisk;

/// Returns the highest key in `cf`, and the corresponding value.
Expand All @@ -203,7 +204,7 @@ pub trait ReadDisk {
fn zs_last_key_value<C, K, V>(&self, cf: &C) -> Option<(K, V)>
where
C: rocksdb::AsColumnFamilyRef,
K: FromDisk,
K: IntoDisk + FromDisk,
V: FromDisk;

/// Returns the first key greater than or equal to `lower_bound` in `cf`,
Expand Down Expand Up @@ -321,34 +322,22 @@ impl ReadDisk for DiskDb {
fn zs_first_key_value<C, K, V>(&self, cf: &C) -> Option<(K, V)>
where
C: rocksdb::AsColumnFamilyRef,
K: FromDisk,
K: IntoDisk + FromDisk,
V: FromDisk,
{
// Reading individual values from iterators does not seem to cause database hangs.
self.db
.iterator_cf(cf, rocksdb::IteratorMode::Start)
.next()?
.map(|(key_bytes, value_bytes)| {
Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes)))
})
.expect("unexpected database failure")
self.zs_range_iter(cf, .., false).next()
}

#[allow(clippy::unwrap_in_result)]
fn zs_last_key_value<C, K, V>(&self, cf: &C) -> Option<(K, V)>
where
C: rocksdb::AsColumnFamilyRef,
K: FromDisk,
K: IntoDisk + FromDisk,
V: FromDisk,
{
// Reading individual values from iterators does not seem to cause database hangs.
self.db
.iterator_cf(cf, rocksdb::IteratorMode::End)
.next()?
.map(|(key_bytes, value_bytes)| {
Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes)))
})
.expect("unexpected database failure")
self.zs_range_iter(cf, .., true).next()
}

#[allow(clippy::unwrap_in_result)]
Expand All @@ -358,17 +347,8 @@ impl ReadDisk for DiskDb {
K: IntoDisk + FromDisk,
V: FromDisk,
{
let lower_bound = lower_bound.as_bytes();
let from = rocksdb::IteratorMode::From(lower_bound.as_ref(), rocksdb::Direction::Forward);

// Reading individual values from iterators does not seem to cause database hangs.
self.db
.iterator_cf(cf, from)
.next()?
.map(|(key_bytes, value_bytes)| {
Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes)))
})
.expect("unexpected database failure")
self.zs_range_iter(cf, lower_bound.., false).next()
}

#[allow(clippy::unwrap_in_result)]
Expand All @@ -378,17 +358,8 @@ impl ReadDisk for DiskDb {
K: IntoDisk + FromDisk,
V: FromDisk,
{
let upper_bound = upper_bound.as_bytes();
let from = rocksdb::IteratorMode::From(upper_bound.as_ref(), rocksdb::Direction::Reverse);

// Reading individual values from iterators does not seem to cause database hangs.
self.db
.iterator_cf(cf, from)
.next()?
.map(|(key_bytes, value_bytes)| {
Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes)))
})
.expect("unexpected database failure")
self.zs_range_iter(cf, ..=upper_bound, true).next()
}

fn zs_items_in_range_ordered<C, K, V, R>(&self, cf: &C, range: R) -> BTreeMap<K, V>
Expand All @@ -398,7 +369,7 @@ impl ReadDisk for DiskDb {
V: FromDisk,
R: RangeBounds<K>,
{
self.zs_range_iter(cf, range).collect()
self.zs_range_iter(cf, range, false).collect()
}

fn zs_items_in_range_unordered<C, K, V, R>(&self, cf: &C, range: R) -> HashMap<K, V>
Expand All @@ -408,7 +379,7 @@ impl ReadDisk for DiskDb {
V: FromDisk,
R: RangeBounds<K>,
{
self.zs_range_iter(cf, range).collect()
self.zs_range_iter(cf, range, false).collect()
}
}

Expand All @@ -430,15 +401,24 @@ impl DiskWriteBatch {
impl DiskDb {
/// Returns an iterator over the items in `cf` in `range`.
///
/// Accepts a `reverse` argument. If it is `true`, creates the iterator with an
/// [`IteratorMode`](rocksdb::IteratorMode) of [`End`](rocksdb::IteratorMode::End), or
/// [`From`](rocksdb::IteratorMode::From) with [`Direction::Reverse`](rocksdb::Direction::Reverse).
///
/// Holding this iterator open might delay block commit transactions.
pub fn zs_range_iter<C, K, V, R>(&self, cf: &C, range: R) -> impl Iterator<Item = (K, V)> + '_
pub fn zs_range_iter<C, K, V, R>(
&self,
cf: &C,
range: R,
reverse: bool,
) -> impl Iterator<Item = (K, V)> + '_
where
C: rocksdb::AsColumnFamilyRef,
K: IntoDisk + FromDisk,
V: FromDisk,
R: RangeBounds<K>,
{
self.zs_range_iter_with_direction(cf, range, false)
self.zs_range_iter_with_direction(cf, range, reverse)
}

/// Returns a reverse iterator over the items in `cf` in `range`.
Expand Down Expand Up @@ -495,11 +475,12 @@ impl DiskDb {
let range = (start_bound, end_bound);

let mode = Self::zs_iter_mode(&range, reverse);
let opts = Self::zs_iter_opts(&range);

// Reading multiple items from iterators has caused database hangs,
// in previous RocksDB versions
self.db
.iterator_cf(cf, mode)
.iterator_cf_opt(cf, opts, mode)
.map(|result| result.expect("unexpected database failure"))
.map(|(key, value)| (key.to_vec(), value))
// Skip excluded "from" bound and empty ranges. The `mode` already skips keys
Expand All @@ -514,6 +495,64 @@ impl DiskDb {
.map(|(key, value)| (K::from_bytes(key), V::from_bytes(value)))
}

/// Returns the RocksDB ReadOptions with a lower and upper bound for a range.
fn zs_iter_opts<R>(range: &R) -> ReadOptions
where
R: RangeBounds<Vec<u8>>,
{
let mut opts = ReadOptions::default();
let (lower_bound, upper_bound) = Self::zs_iter_bounds(range);

if let Some(bound) = lower_bound {
opts.set_iterate_lower_bound(bound);
};

if let Some(bound) = upper_bound {
opts.set_iterate_upper_bound(bound);
};

opts
}

/// Returns a lower and upper iterate bounds for a range.
///
/// Note: Since upper iterate bounds are always exclusive in RocksDB, this method
/// will increment the upper bound by 1 if the end bound of the provided range
/// is inclusive.
fn zs_iter_bounds<R>(range: &R) -> (Option<Vec<u8>>, Option<Vec<u8>>)
where
R: RangeBounds<Vec<u8>>,
{
use std::ops::Bound::*;

let lower_bound = match range.start_bound() {
Included(bound) | Excluded(bound) => Some(bound.clone()),
Unbounded => None,
};

let upper_bound = match range.end_bound().cloned() {
Included(mut bound) => {
// Increment the last byte in the upper bound that is less than u8::MAX, and
// clear any bytes after it to increment the next key in lexicographic order
// (next big-endian number) this Vec represents to RocksDB.
let is_wrapped_overflow = bound.iter_mut().rev().all(|v| {
*v = v.wrapping_add(1);
v == &0
});

if is_wrapped_overflow {
bound.insert(0, 0x01)
}

Some(bound)
}
Excluded(bound) => Some(bound),
Unbounded => None,
};

(lower_bound, upper_bound)
}

/// Returns the RocksDB iterator "from" mode for `range`.
///
/// RocksDB iterators are ordered by increasing key bytes by default.
Expand Down
47 changes: 47 additions & 0 deletions zebra-state/src/service/finalized_state/disk_db/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,50 @@ impl DiskDb {
rocksdb::DB::list_cf(&opts, path)
}
}

/// Check that zs_iter_opts returns an upper bound one greater than provided inclusive end bounds.
#[test]
fn zs_iter_opts_increments_key_by_one() {
let _init_guard = zebra_test::init();

// TODO: add an empty key (`()` type or `[]` when serialized) test case
let keys: [u32; 14] = [
0,
1,
200,
255,
256,
257,
65535,
65536,
65537,
16777215,
16777216,
16777217,
16777218,
u32::MAX,
];

for key in keys {
let (_, bytes) = DiskDb::zs_iter_bounds(&..=key.to_be_bytes().to_vec());
let mut extra_bytes = bytes.expect("there should be an upper bound");
let bytes = extra_bytes.split_off(extra_bytes.len() - 4);
let upper_bound = u32::from_be_bytes(bytes.clone().try_into().expect("should be 4 bytes"));
let expected_upper_bound = key.wrapping_add(1);

assert_eq!(
expected_upper_bound, upper_bound,
"the upper bound should be 1 greater than the original key"
);

if expected_upper_bound == 0 {
assert_eq!(
extra_bytes,
vec![1],
"there should be an extra byte with a value of 1"
);
} else {
assert_eq!(extra_bytes.len(), 0, "there should be no extra bytes");
}
}
}
2 changes: 1 addition & 1 deletion zebra-state/src/service/finalized_state/zebra_db/chain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ impl ZebraDb {
) -> impl Iterator<Item = (RawBytes, Arc<HistoryTree>)> + '_ {
let history_tree_cf = self.db.cf_handle("history_tree").unwrap();

self.db.zs_range_iter(&history_tree_cf, ..)
self.db.zs_range_iter(&history_tree_cf, .., false)
}

// Value pool methods
Expand Down
14 changes: 7 additions & 7 deletions zebra-state/src/service/finalized_state/zebra_db/shielded.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ impl ZebraDb {
&self,
) -> impl Iterator<Item = (RawBytes, Arc<sprout::tree::NoteCommitmentTree>)> + '_ {
let sprout_trees = self.db.cf_handle("sprout_note_commitment_tree").unwrap();
self.db.zs_range_iter(&sprout_trees, ..)
self.db.zs_range_iter(&sprout_trees, .., false)
}

// # Sapling trees
Expand Down Expand Up @@ -207,7 +207,7 @@ impl ZebraDb {
R: std::ops::RangeBounds<Height>,
{
let sapling_trees = self.db.cf_handle("sapling_note_commitment_tree").unwrap();
self.db.zs_range_iter(&sapling_trees, range)
self.db.zs_range_iter(&sapling_trees, range, false)
}

/// Returns the Sapling note commitment trees in the reversed range, in decreasing height order.
Expand Down Expand Up @@ -278,7 +278,7 @@ impl ZebraDb {
if let Some(exclusive_end_bound) = exclusive_end_bound {
list = self
.db
.zs_range_iter(&sapling_subtrees, start_index..exclusive_end_bound)
.zs_range_iter(&sapling_subtrees, start_index..exclusive_end_bound, false)
.collect();
} else {
// If there is no end bound, just return all the trees.
Expand All @@ -287,7 +287,7 @@ impl ZebraDb {
// the trees run out.)
list = self
.db
.zs_range_iter(&sapling_subtrees, start_index..)
.zs_range_iter(&sapling_subtrees, start_index.., false)
.collect();
}

Expand Down Expand Up @@ -377,7 +377,7 @@ impl ZebraDb {
R: std::ops::RangeBounds<Height>,
{
let orchard_trees = self.db.cf_handle("orchard_note_commitment_tree").unwrap();
self.db.zs_range_iter(&orchard_trees, range)
self.db.zs_range_iter(&orchard_trees, range, false)
}

/// Returns the Orchard note commitment trees in the reversed range, in decreasing height order.
Expand Down Expand Up @@ -448,7 +448,7 @@ impl ZebraDb {
if let Some(exclusive_end_bound) = exclusive_end_bound {
list = self
.db
.zs_range_iter(&orchard_subtrees, start_index..exclusive_end_bound)
.zs_range_iter(&orchard_subtrees, start_index..exclusive_end_bound, false)
.collect();
} else {
// If there is no end bound, just return all the trees.
Expand All @@ -457,7 +457,7 @@ impl ZebraDb {
// the trees run out.)
list = self
.db
.zs_range_iter(&orchard_subtrees, start_index..)
.zs_range_iter(&orchard_subtrees, start_index.., false)
.collect();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,11 @@ impl ZebraDb {
AddressTransaction::address_iterator_range(address_location, query_height_range);

self.db
.zs_range_iter(&tx_loc_by_transparent_addr_loc, transaction_location_range)
.zs_range_iter(
&tx_loc_by_transparent_addr_loc,
transaction_location_range,
false,
)
.map(|(tx_loc, ())| tx_loc)
.collect()
}
Expand Down

0 comments on commit 56e6305

Please sign in to comment.