Skip to content

Commit

Permalink
blockstore: update next_slots during failure even if missing parent
Browse files Browse the repository at this point in the history
`clear_unconfirmed_slot` can leave the blockstore in an irrecoverable state
if it panics. Defer all panics until `next_slots` has been updated.

Additionally, relax the constraint that the parent slot meta must exist,
as it could have been cleaned up if outdated.
  • Loading branch information
AshwinSekar committed Feb 7, 2024
1 parent 070a5a3 commit e088ed3
Showing 1 changed file with 43 additions and 21 deletions.
64 changes: 43 additions & 21 deletions ledger/src/blockstore.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1159,6 +1159,26 @@ impl Blockstore {
self.completed_slots_senders.lock().unwrap().clear();
}

/// Remove `child_slot` from the `slot_meta` of `parent_slot`.
/// Logs error if `slot_meta` of `parent_slot` is not present.
fn clear_parent_next_slot(&self, parent_slot: Slot, child_slot: Slot) -> Result<()> {
let parent_slot_meta = self.meta(parent_slot)?;
if let Some(mut parent_slot_meta) = parent_slot_meta {
// .retain() is a linear scan; however, next_slots should
// only contain several elements so this isn't so bad
parent_slot_meta
.next_slots
.retain(|&next_slot| next_slot != child_slot);
self.meta_cf.put(parent_slot, &parent_slot_meta)
} else {
error!("Parent slot meta {} for child {} is missing. In the absence of a duplicate block this
likely means a cluster restart was performed and your node contains invalid shreds generated
with the wrong shred version, whose ancestors have been cleaned up.
Falling back to duplicate block handling to remedy the situation", parent_slot, child_slot);
Ok(())
}
}

/// Range-delete all entries whose prefix matches the specified `slot`,
/// remove `slot` from its parent's SlotMeta `next_slots` list, and
/// clear `slot`'s SlotMeta (except for `next_slots`).
Expand All @@ -1174,30 +1194,32 @@ impl Blockstore {
.expect("Couldn't fetch from SlotMeta column family")
{
// Clear all slot related information
self.run_purge(slot, slot, PurgeType::Exact)
.expect("Purge database operations failed");
let run_purge_result = self.run_purge(slot, slot, PurgeType::Exact);

// Clear this slot as a next slot from parent
if let Some(parent_slot) = slot_meta.parent_slot {
let mut parent_slot_meta = self
.meta(parent_slot)
.expect("Couldn't fetch from SlotMeta column family")
.expect("Unconfirmed slot should have had parent slot set");
// .retain() is a linear scan; however, next_slots should
// only contain several elements so this isn't so bad
parent_slot_meta
.next_slots
.retain(|&next_slot| next_slot != slot);
self.meta_cf
.put(parent_slot, &parent_slot_meta)
.expect("Couldn't insert into SlotMeta column family");
}
// Reinsert parts of `slot_meta` that are important to retain, like the `next_slots`
// field.
let parent_purge_result = if let Some(parent_slot) = slot_meta.parent_slot {
self.clear_parent_next_slot(parent_slot, slot)
} else {
Ok(())
};

// Reinsert parts of `slot_meta` that are important to retain, like the `next_slots` field.
//
// It is important we do this even if any of the above panics as an invalid `next_slots`
// is usually unrecoverable.
slot_meta.clear_unconfirmed_slot();
self.meta_cf
.put(slot, &slot_meta)
.expect("Couldn't insert into SlotMeta column family");
self.meta_cf.put(slot, &slot_meta).expect(
"Couldn't reinsert `next_slots` after purging.
Blockstore might be in an inconsistent state requiring manual intervention",
);
run_purge_result.expect(
"Purge database operations failed.
Blockstore might be in an inconsistent state requiring manual intervention",
);
parent_purge_result.expect(
"Clearing parent next slot failed.
Blockstore might be in an inconsistent state requiring manual intervention",
)
} else {
error!(
"clear_unconfirmed_slot() called on slot {} with no SlotMeta",
Expand Down

0 comments on commit e088ed3

Please sign in to comment.