Skip to content
This repository has been archived by the owner on Aug 2, 2022. It is now read-only.

flakiness fix for forked chain test #10356

Merged
merged 1 commit into from
May 13, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions tests/nodeos_forked_chain_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ def getBlock(self, blockNum):
# block number to start expecting node killed after
preKillBlockNum=nonProdNode.getBlockNum()
preKillBlockProducer=nonProdNode.getBlockProducerByNum(preKillBlockNum)
Print("preKillBlockProducer = {}".format(preKillBlockProducer))
# kill at last block before defproducerl, since the block it is killed on will get propagated
killAtProducer="defproducerk"
nonProdNode.killNodeOnProducer(producer=killAtProducer, whereInSequence=(inRowCountPerProducer-1))
Expand Down Expand Up @@ -407,8 +408,13 @@ def getBlock(self, blockNum):
(headBlockNum, libNumAroundDivergence)=getMinHeadAndLib(prodNodes)

# track the block number and producer from each producing node
blockProducer0=prodNodes[0].getBlockProducerByNum(blockNum)
blockProducer1=prodNodes[1].getBlockProducerByNum(blockNum)
# Use a timeout of 70 seconds here: when the chain breaks, the call to getBlockProducerByNum
# and the call to producer_plugin::schedule_delayed_production_loop can happen nearly simultaneously.
# For 10 producers the wait cycle is 10 * (12 * 0.5) = 60 seconds.
# For 11 producers the wait cycle is 11 * (12 * 0.5) = 66 seconds.
blockProducer0=prodNodes[0].getBlockProducerByNum(blockNum, timeout=70)
blockProducer1=prodNodes[1].getBlockProducerByNum(blockNum, timeout=70)
Print("blockNum = {} blockProducer0 = {} blockProducer1 = {}".format(blockNum, blockProducer0, blockProducer1))
blockProducers0.append({"blockNum":blockNum, "prod":blockProducer0})
blockProducers1.append({"blockNum":blockNum, "prod":blockProducer1})

Expand All @@ -417,19 +423,23 @@ def getBlock(self, blockNum):
if not prodChanged:
if preKillBlockProducer!=blockProducer0:
prodChanged=True
Print("prodChanged = True")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we keeping these print statements for diagnosing more failures over time?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. It is hard to understand what is happening without them.


# since the node is killed on the last block of killAtProducer, we look for the next producer change
if not nextProdChange and prodChanged and blockProducer1==killAtProducer:
nextProdChange=True
Print("nextProdChange = True")
elif nextProdChange and blockProducer1!=killAtProducer:
nextProdChange=False
Print("nextProdChange = False")
if blockProducer0!=blockProducer1:
Print("Divergence identified at block %s, node_00 producer: %s, node_01 producer: %s" % (blockNum, blockProducer0, blockProducer1))
actualLastBlockNum=blockNum
break
else:
missedTransitionBlock=blockNum
transitionCount+=1
# Diagnostic trace: report the block at which the expected transition was missed and how many
# transitions have been counted so far (both values need their own placeholder).
Print("missedTransitionBlock = {} transitionCount = {}".format(missedTransitionBlock, transitionCount))
# allow this to transition twice, in case the script identified a transition that happened before the bridge node received the kill command
if transitionCount>1:
Print("At block %d and have passed producer: %s %d times and we have not diverged, stopping looking and letting errors report" % (blockNum, killAtProducer, transitionCount))
Expand Down