From fd8ae4161dacc0ca8d0a92485a30bcba5889d2d2 Mon Sep 17 00:00:00 2001 From: Zekun Li Date: Mon, 26 Feb 2024 13:46:24 -0800 Subject: [PATCH] [consensus] fix the race condition with the order of operations We send out the epoch change notification before spawning the commit task, so there's race condition that epoch manager shuts down the buffer manager before the request created and we lose the final commit task. --- consensus/src/pipeline/buffer_manager.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/consensus/src/pipeline/buffer_manager.rs b/consensus/src/pipeline/buffer_manager.rs index f7b48d134477c..d5fe9bae4447e 100644 --- a/consensus/src/pipeline/buffer_manager.rs +++ b/consensus/src/pipeline/buffer_manager.rs @@ -357,17 +357,7 @@ impl BufferManager { self.commit_proof_rb_handle .replace(self.do_reliable_broadcast(commit_decision)); } - if aggregated_item.commit_proof.ledger_info().ends_epoch() { - self.commit_msg_tx - .send_epoch_change(EpochChangeProof::new( - vec![aggregated_item.commit_proof.clone()], - false, - )) - .await; - // the epoch ends, reset to avoid executing more blocks, execute after - // this persisting request will result in BlockNotFound - self.reset().await; - } + let commit_proof = aggregated_item.commit_proof.clone(); self.persisting_phase_tx .send(self.create_new_request(PersistingRequest { blocks: blocks_to_persist, @@ -382,6 +372,15 @@ impl BufferManager { })) .await .expect("Failed to send persist request"); + // this needs to be done after creating the persisting request to avoid it being lost + if commit_proof.ledger_info().ends_epoch() { + self.commit_msg_tx + .send_epoch_change(EpochChangeProof::new(vec![commit_proof], false)) + .await; + // the epoch ends, reset to avoid executing more blocks, execute after + // this persisting request will result in BlockNotFound + self.reset().await; + } info!("Advance head to {:?}", self.buffer.head_cursor()); self.previous_commit_time = Instant::now(); return;