Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(drive): uncommitted state if db transaction fails #2305

Merged
merged 16 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion packages/rs-drive-abci/src/abci/handler/finalize_block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use crate::execution::types::block_execution_context::v0::BlockExecutionContextV
use crate::platform_types::cleaned_abci_messages::finalized_block_cleaned_request::v0::FinalizeBlockCleanedRequest;
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::Network;
use std::sync::atomic::Ordering;
use tenderdash_abci::proto::abci as proto;

Expand Down Expand Up @@ -66,7 +67,20 @@ where
));
}

app.commit_transaction(platform_version)?;
// We had a chain halt on mainnet on block 32326. Compaction happened
// and transaction.commit() returned an error. Due to a bug in tenderdash,
// validators just proceeded to the next block without committing data while keeping
// the updated cache. To stay consistent with the mainnet chain we have to skip
// the commit of this block from now on.
// TODO: verify that chain id is evo1
if !(app.platform().config.network == Network::Dash && block_height == 32326) {
// This is a simplified solution until we have a better way to handle a failed commit.
// We still have caches in memory that correspond to the data that
// we weren't able to commit. The solution is to restart Drive, so all caches
// are restored from disk, and then try to process this block again.
app.commit_transaction(platform_version)
.expect("commit transaction");
shumkov marked this conversation as resolved.
Show resolved Hide resolved
}

app.platform()
.committed_block_height_guard
Expand Down
48 changes: 42 additions & 6 deletions packages/rs-drive-abci/src/abci/handler/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use crate::abci::AbciError;
use crate::error::Error;
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::Network;
use dpp::version::DESIRED_PLATFORM_VERSION;
use tenderdash_abci::proto::abci as proto;

Expand All @@ -21,28 +22,63 @@ where

let platform_state = app.platform().state.load();

let state_app_hash = platform_state
let last_block_height = platform_state.last_committed_block_height() as i64;

// Verify that Platform State corresponds to Drive committed state
let drive_storage_root_hash = platform_state
.last_committed_block_app_hash()
.map(|app_hash| app_hash.to_vec())
.unwrap_or_default();

let platform_state_app_hash = app
.platform()
.drive
.grove
.root_hash(
None,
&platform_state
.current_platform_version()?
.drive
.grove_version,
)
.unwrap()?;
shumkov marked this conversation as resolved.
Show resolved Hide resolved

// We had a chain halt on mainnet on block 32326. Compaction happened
// and transaction.commit() returned an error. Due to a bug in tenderdash,
// validators just proceeded to the next block without committing data while keeping
// the updated cache. To stay consistent with the mainnet chain we allow app hashes to be
// different for this block.
// TODO: verify that chain id is evo1
#[allow(clippy::collapsible_if)]
if !(app.platform().config.network == Network::Dash && last_block_height == 32326) {
// App hash in memory must be equal to app hash on disk
if drive_storage_root_hash != platform_state_app_hash {
// We panic because we can't recover from this situation.
// Better to restart Drive, so the node might self-heal by
// reloading state from the disk
panic!(
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
drive_storage_root_hash, platform_state_app_hash
);
}
}

let desired_protocol_version = DESIRED_PLATFORM_VERSION.protocol_version;

let response = proto::ResponseInfo {
data: "".to_string(),
app_version: desired_protocol_version as u64,
last_block_height: platform_state.last_committed_block_height() as i64,
last_block_height,
version: env!("CARGO_PKG_VERSION").to_string(),
last_block_app_hash: state_app_hash.clone(),
last_block_app_hash: platform_state_app_hash.to_vec(),
};

tracing::debug!(
desired_protocol_version,
software_version = env!("CARGO_PKG_VERSION"),
block_version = request.block_version,
p2p_version = request.p2p_version,
app_hash = hex::encode(state_app_hash),
height = platform_state.last_committed_block_height(),
app_hash = hex::encode(platform_state_app_hash),
last_block_height,
"Handshake with consensus engine",
);

Expand Down
39 changes: 39 additions & 0 deletions packages/rs-drive-abci/src/abci/handler/prepare_proposal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::platform_types::state_transitions_processing_result::StateTransitionExecutionResult;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::hashes::Hash;
use dpp::dashcore::Network;
use dpp::version::TryIntoPlatformVersioned;
use drive::grovedb_storage::Error::RocksDBError;
use tenderdash_abci::proto::abci as proto;
Expand All @@ -35,6 +36,44 @@ where

let platform_state = app.platform().state.load();

// Verify that Platform State corresponds to Drive committed state
let drive_storage_root_hash = platform_state
.last_committed_block_app_hash()
.unwrap_or_default();

let platform_state_app_hash = app
.platform()
.drive
.grove
.root_hash(
None,
&platform_state
.current_platform_version()?
.drive
.grove_version,
)
.unwrap()?;

// We had a chain halt on mainnet on block 32326. Compaction happened
// and transaction.commit() returned an error. Due to a bug in tenderdash,
// validators just proceeded to the next block without committing data while keeping
// the updated cache. To stay consistent with the mainnet chain we allow app hashes to be
// different for this block.
// TODO: verify that chain id is evo1
shumkov marked this conversation as resolved.
Show resolved Hide resolved
#[allow(clippy::collapsible_if)]
if !(app.platform().config.network == Network::Dash && request.height == 32327) {
// App hash in memory must be equal to app hash on disk
if drive_storage_root_hash != platform_state_app_hash {
// We panic because we can't recover from this situation.
// Better to restart Drive, so the node might self-heal by
// reloading state from the disk
panic!(
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
drive_storage_root_hash, platform_state_app_hash
);
}
}

let last_committed_core_height = platform_state.last_committed_core_height();

let starting_platform_version = platform_state.current_platform_version()?;
Expand Down
39 changes: 39 additions & 0 deletions packages/rs-drive-abci/src/abci/handler/process_proposal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use crate::platform_types::block_execution_outcome;
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::platform_types::state_transitions_processing_result::StateTransitionExecutionResult;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::Network;
use dpp::version::TryIntoPlatformVersioned;
use drive::grovedb_storage::Error::RocksDBError;
use tenderdash_abci::proto::abci as proto;
Expand Down Expand Up @@ -179,6 +180,44 @@ where

let platform_state = app.platform().state.load();

// Verify that Platform State corresponds to Drive committed state
let drive_storage_root_hash = platform_state
.last_committed_block_app_hash()
.unwrap_or_default();

let platform_state_app_hash = app
.platform()
.drive
.grove
.root_hash(
None,
&platform_state
.current_platform_version()?
.drive
.grove_version,
)
.unwrap()?;
shumkov marked this conversation as resolved.
Show resolved Hide resolved

// We had a chain halt on mainnet on block 32326. Compaction happened
// and transaction.commit() returned an error. Due to a bug in tenderdash,
// validators just proceeded to the next block without committing data while keeping
// the updated cache. To stay consistent with the mainnet chain we allow app hashes to be
// different for this block.
// TODO: verify that chain id is evo1
shumkov marked this conversation as resolved.
Show resolved Hide resolved
#[allow(clippy::collapsible_if)]
if !(app.platform().config.network == Network::Dash && request.height == 32327) {
// App hash in memory must be equal to app hash on disk
if drive_storage_root_hash != platform_state_app_hash {
// We panic because we can't recover from this situation.
// Better to restart Drive, so the node might self-heal by
// reloading state from the disk
panic!(
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
drive_storage_root_hash, platform_state_app_hash
);
}
shumkov marked this conversation as resolved.
Show resolved Hide resolved
}

let starting_platform_version = platform_state.current_platform_version()?;

// Running the proposal executes all the state transitions for the block
Expand Down
6 changes: 0 additions & 6 deletions packages/rs-drive-abci/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,8 +233,6 @@ struct PlatformConfigIntermediate {
#[serde(flatten)]
pub instant_lock: InstantLockConfig,
pub block_spacing_ms: u64,
#[serde(default = "PlatformConfig::default_initial_protocol_version")]
pub initial_protocol_version: ProtocolVersion,
pub db_path: PathBuf,
#[serde(default)]
pub rejections_path: Option<PathBuf>,
Expand Down Expand Up @@ -624,10 +622,6 @@ impl ExecutionConfig {
}

impl PlatformConfig {
fn default_initial_protocol_version() -> ProtocolVersion {
INITIAL_PROTOCOL_VERSION
}

fn default_network() -> Network {
Network::Dash
}
Expand Down
Loading