Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove bootstrapping retry config #2301

Merged
merged 12 commits into from
Nov 17, 2023
58 changes: 26 additions & 32 deletions chains/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,34 +174,32 @@ type ManagerConfig struct {
StakingBLSKey *bls.SecretKey
TracingEnabled bool
// Must not be used unless [TracingEnabled] is true as this may be nil.
Tracer trace.Tracer
Log logging.Logger
LogFactory logging.Factory
VMManager vms.Manager // Manage mappings from vm ID --> vm
BlockAcceptorGroup snow.AcceptorGroup
TxAcceptorGroup snow.AcceptorGroup
VertexAcceptorGroup snow.AcceptorGroup
DB database.Database
MsgCreator message.OutboundMsgBuilder // message creator, shared with network
Router router.Router // Routes incoming messages to the appropriate chain
Net network.Network // Sends consensus messages to other validators
Validators validators.Manager // Validators validating on this chain
NodeID ids.NodeID // The ID of this node
NetworkID uint32 // ID of the network this node is connected to
PartialSyncPrimaryNetwork bool
Server server.Server // Handles HTTP API calls
Keystore keystore.Keystore
AtomicMemory *atomic.Memory
AVAXAssetID ids.ID
XChainID ids.ID // ID of the X-Chain,
CChainID ids.ID // ID of the C-Chain,
CriticalChains set.Set[ids.ID] // Chains that can't exit gracefully
TimeoutManager timeout.Manager // Manages request timeouts when sending messages to other validators
Health health.Registerer
RetryBootstrap bool // Should Bootstrap be retried
RetryBootstrapWarnFrequency int // Max number of times to retry bootstrap before warning the node operator
SubnetConfigs map[ids.ID]subnets.Config // ID -> SubnetConfig
ChainConfigs map[string]ChainConfig // alias -> ChainConfig
Tracer trace.Tracer
Log logging.Logger
LogFactory logging.Factory
VMManager vms.Manager // Manage mappings from vm ID --> vm
BlockAcceptorGroup snow.AcceptorGroup
TxAcceptorGroup snow.AcceptorGroup
VertexAcceptorGroup snow.AcceptorGroup
DB database.Database
MsgCreator message.OutboundMsgBuilder // message creator, shared with network
Router router.Router // Routes incoming messages to the appropriate chain
Net network.Network // Sends consensus messages to other validators
Validators validators.Manager // Validators validating on this chain
NodeID ids.NodeID // The ID of this node
NetworkID uint32 // ID of the network this node is connected to
PartialSyncPrimaryNetwork bool
Server server.Server // Handles HTTP API calls
Keystore keystore.Keystore
AtomicMemory *atomic.Memory
AVAXAssetID ids.ID
XChainID ids.ID // ID of the X-Chain,
CChainID ids.ID // ID of the C-Chain,
CriticalChains set.Set[ids.ID] // Chains that can't exit gracefully
TimeoutManager timeout.Manager // Manages request timeouts when sending messages to other validators
Health health.Registerer
SubnetConfigs map[ids.ID]subnets.Config // ID -> SubnetConfig
ChainConfigs map[string]ChainConfig // alias -> ChainConfig
// ShutdownNodeFunc allows the chain manager to issue a request to shutdown the node
ShutdownNodeFunc func(exitCode int)
MeterVMEnabled bool // Should each VM be wrapped with a MeterVM
Expand Down Expand Up @@ -889,8 +887,6 @@ func (m *manager) createAvalancheChain(
Sender: snowmanMessageSender,
BootstrapTracker: sb,
Timer: h,
RetryBootstrap: m.RetryBootstrap,
RetryBootstrapWarnFrequency: m.RetryBootstrapWarnFrequency,
AncestorsMaxContainersReceived: m.BootstrapAncestorsMaxContainersReceived,
SharedCfg: &common.SharedConfig{},
},
Expand Down Expand Up @@ -1235,8 +1231,6 @@ func (m *manager) createSnowmanChain(
Sender: messageSender,
BootstrapTracker: sb,
Timer: h,
RetryBootstrap: m.RetryBootstrap,
RetryBootstrapWarnFrequency: m.RetryBootstrapWarnFrequency,
AncestorsMaxContainersReceived: m.BootstrapAncestorsMaxContainersReceived,
SharedCfg: &common.SharedConfig{},
}
Expand Down
2 changes: 0 additions & 2 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -521,8 +521,6 @@ func getStateSyncConfig(v *viper.Viper) (node.StateSyncConfig, error) {

func getBootstrapConfig(v *viper.Viper, networkID uint32) (node.BootstrapConfig, error) {
config := node.BootstrapConfig{
RetryBootstrap: v.GetBool(RetryBootstrapKey),
RetryBootstrapWarnFrequency: v.GetInt(RetryBootstrapWarnFrequencyKey),
BootstrapBeaconConnectionTimeout: v.GetDuration(BootstrapBeaconConnectionTimeoutKey),
BootstrapMaxTimeGetAncestors: v.GetDuration(BootstrapMaxTimeGetAncestorsKey),
BootstrapAncestorsMaxContainersSent: int(v.GetUint(BootstrapAncestorsMaxContainersSentKey)),
Expand Down
2 changes: 0 additions & 2 deletions config/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,8 +292,6 @@ func addNodeFlags(fs *pflag.FlagSet) {
// TODO: combine "BootstrapIPsKey" and "BootstrapIDsKey" into one flag
fs.String(BootstrapIPsKey, "", "Comma separated list of bootstrap peer ips to connect to. Example: 127.0.0.1:9630,127.0.0.1:9631")
fs.String(BootstrapIDsKey, "", "Comma separated list of bootstrap peer ids to connect to. Example: NodeID-JR4dVmy6ffUGAKCBDkyCbeZbyHQBeDsET,NodeID-8CrVPQZ4VSqgL8zTdvL14G8HqAfrBr4z")
fs.Bool(RetryBootstrapKey, true, "Specifies whether bootstrap should be retried")
fs.Int(RetryBootstrapWarnFrequencyKey, 50, "Specifies how many times bootstrap should be retried before warning the operator")
fs.Duration(BootstrapBeaconConnectionTimeoutKey, time.Minute, "Timeout before emitting a warn log when connecting to bootstrapping beacons")
fs.Duration(BootstrapMaxTimeGetAncestorsKey, 50*time.Millisecond, "Max Time to spend fetching a container and its ancestors when responding to a GetAncestors")
fs.Uint(BootstrapAncestorsMaxContainersSentKey, 2000, "Max number of containers in an Ancestors message sent by this node")
Expand Down
2 changes: 0 additions & 2 deletions config/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,6 @@ const (
RouterHealthMaxOutstandingRequestsKey = "router-health-max-outstanding-requests"
HealthCheckFreqKey = "health-check-frequency"
HealthCheckAveragerHalflifeKey = "health-check-averager-halflife"
RetryBootstrapKey = "bootstrap-retry-enabled"
RetryBootstrapWarnFrequencyKey = "bootstrap-retry-warn-frequency"
abi87 marked this conversation as resolved.
Show resolved Hide resolved
PluginDirKey = "plugin-dir"
BootstrapBeaconConnectionTimeoutKey = "bootstrap-beacon-connection-timeout"
BootstrapMaxTimeGetAncestorsKey = "bootstrap-max-time-get-ancestors"
Expand Down
6 changes: 0 additions & 6 deletions node/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,6 @@ type StateSyncConfig struct {
}

type BootstrapConfig struct {
// Should Bootstrap be retried
RetryBootstrap bool `json:"retryBootstrap"`

// Max number of times to retry bootstrap before warning the node operator
RetryBootstrapWarnFrequency int `json:"retryBootstrapWarnFrequency"`

// Timeout before emitting a warn log when connecting to bootstrapping beacons
BootstrapBeaconConnectionTimeout time.Duration `json:"bootstrapBeaconConnectionTimeout"`

Expand Down
2 changes: 0 additions & 2 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -1009,8 +1009,6 @@ func (n *Node) initChainManager(avaxAssetID ids.ID) error {
CriticalChains: criticalChains,
TimeoutManager: n.timeoutManager,
Health: n.health,
RetryBootstrap: n.Config.RetryBootstrap,
RetryBootstrapWarnFrequency: n.Config.RetryBootstrapWarnFrequency,
ShutdownNodeFunc: n.Shutdown,
MeterVMEnabled: n.Config.MeterVMEnabled,
Metrics: n.MetricsGatherer,
Expand Down
28 changes: 5 additions & 23 deletions snow/engine/common/bootstrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type Bootstrapper interface {
AcceptedHandler
Haltable
Startup(context.Context) error
Restart(ctx context.Context, reset bool) error
Restart(ctx context.Context) error
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this all common bootstrapper is really only used by snowman chains (not avalanche ones anymore).
Should we just embed it into the snowman bootstrapped and be done with the common thing?
Maybe outside the scope of this PR but we'd prolly simplify code more aggressively

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's the goal!

}

// It collects mechanisms common to both snowman and avalanche bootstrappers
Expand All @@ -41,9 +41,6 @@ type bootstrapper struct {

minority smbootstrapper.Poll
majority smbootstrapper.Poll

// number of times the bootstrap has been attempted
bootstrapAttempts int
}

func NewCommonBootstrapper(config Config) Bootstrapper {
Expand Down Expand Up @@ -146,7 +143,6 @@ func (b *bootstrapper) Startup(ctx context.Context) error {
MaxOutstandingBroadcastRequests,
)

b.bootstrapAttempts++
if accepted, finalized := b.majority.Result(ctx); finalized {
b.Ctx.Log.Info("bootstrapping skipped",
zap.String("reason", "no provided bootstraps"),
Expand All @@ -158,22 +154,9 @@ func (b *bootstrapper) Startup(ctx context.Context) error {
return b.sendMessagesOrFinish(ctx)
}

func (b *bootstrapper) Restart(ctx context.Context, reset bool) error {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Restart(ctx, false) is replaced with Startup(ctx) and Restart(ctx, false) is replaced with Restart(ctx)

// resets the attempts when we're pulling blocks/vertices we don't want to
// fail the bootstrap at that stage
if reset {
b.Ctx.Log.Debug("Checking for new frontiers")

b.Config.SharedCfg.Restarted = true
b.bootstrapAttempts = 0
}

if b.bootstrapAttempts > 0 && b.bootstrapAttempts%b.RetryBootstrapWarnFrequency == 0 {
b.Ctx.Log.Debug("check internet connection",
zap.Int("numBootstrapAttempts", b.bootstrapAttempts),
)
}

func (b *bootstrapper) Restart(ctx context.Context) error {
b.Ctx.Log.Debug("Checking for new frontiers")
b.Config.SharedCfg.Restarted = true
return b.Startup(ctx)
}

Expand Down Expand Up @@ -207,9 +190,8 @@ func (b *bootstrapper) sendMessagesOrFinish(ctx context.Context) error {
b.Ctx.Log.Debug("restarting bootstrap",
zap.String("reason", "no blocks accepted"),
zap.Int("numBeacons", b.Beacons.Count(b.Ctx.SubnetID)),
zap.Int("numBootstrapAttempts", b.bootstrapAttempts),
)
return b.Restart(ctx, false /*=reset*/)
return b.Startup(ctx)
}

if !b.Config.SharedCfg.Restarted {
Expand Down
6 changes: 0 additions & 6 deletions snow/engine/common/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@ type Config struct {
BootstrapTracker BootstrapTracker
Timer Timer

// Should Bootstrap be retried
RetryBootstrap bool

// Max number of times to retry bootstrap before warning the node operator
RetryBootstrapWarnFrequency int

// This node will only consider the first [AncestorsMaxContainersReceived]
// containers in an ancestors message it receives.
AncestorsMaxContainersReceived int
Expand Down
6 changes: 3 additions & 3 deletions snow/engine/snowman/bootstrap/bootstrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ func (b *bootstrapper) Timeout(ctx context.Context) error {
b.awaitingTimeout = false

if !b.Config.BootstrapTracker.IsBootstrapped() {
return b.Restart(ctx, true)
return b.Restart(ctx)
}
b.fetchETA.Set(0)
return b.OnFinished(ctx, b.Config.SharedCfg.RequestID)
Expand Down Expand Up @@ -591,8 +591,8 @@ func (b *bootstrapper) checkFinish(ctx context.Context) error {
// Note that executedBlocks < c*previouslyExecuted ( 0 <= c < 1 ) is enforced
// so that the bootstrapping process will terminate even as new blocks are
// being issued.
if b.Config.RetryBootstrap && executedBlocks > 0 && executedBlocks < previouslyExecuted/2 {
StephenButtolph marked this conversation as resolved.
Show resolved Hide resolved
return b.Restart(ctx, true)
if executedBlocks > 0 && executedBlocks < previouslyExecuted/2 {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is why test tests needed to be updated (because b.Config.RetryBootstrap was false in a number of tests previously)

return b.Restart(ctx)
}

// If there is an additional callback, notify them that this chain has been
Expand Down
48 changes: 28 additions & 20 deletions snow/engine/snowman/bootstrap/bootstrapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,8 +356,6 @@ func TestBootstrapperUnknownByzantineResponse(t *testing.T) {

require.NoError(bs.Start(context.Background(), 0))

acceptedIDs := []ids.ID{blkID2}

parsedBlk1 := false
vm.GetBlockF = func(_ context.Context, blkID ids.ID) (snowman.Block, error) {
switch blkID {
Expand Down Expand Up @@ -390,31 +388,29 @@ func TestBootstrapperUnknownByzantineResponse(t *testing.T) {
return nil, errUnknownBlock
}

requestID := new(uint32)
var requestID uint32
sender.SendGetAncestorsF = func(_ context.Context, vdr ids.NodeID, reqID uint32, blkID ids.ID) {
require.Equal(peerID, vdr)
require.Equal(blkID1, blkID)
*requestID = reqID
requestID = reqID
}

vm.CantSetState = false
require.NoError(bs.ForceAccepted(context.Background(), acceptedIDs)) // should request blk1

oldReqID := *requestID
require.NoError(bs.Ancestors(context.Background(), peerID, *requestID+1, [][]byte{blkBytes1})) // respond with wrong request ID
require.Equal(oldReqID, *requestID)
require.NoError(bs.ForceAccepted(context.Background(), []ids.ID{blkID2})) // should request blk1

require.NoError(bs.Ancestors(context.Background(), ids.BuildTestNodeID([]byte{1, 2, 3}), *requestID, [][]byte{blkBytes1})) // respond from wrong peer
require.Equal(oldReqID, *requestID)
oldReqID := requestID
require.NoError(bs.Ancestors(context.Background(), peerID, requestID, [][]byte{blkBytes0})) // respond with wrong block
require.NotEqual(oldReqID, requestID)

require.NoError(bs.Ancestors(context.Background(), peerID, *requestID, [][]byte{blkBytes0})) // respond with wrong block
require.NotEqual(oldReqID, *requestID)
require.NoError(bs.Ancestors(context.Background(), peerID, requestID, [][]byte{blkBytes1}))

require.NoError(bs.Ancestors(context.Background(), peerID, *requestID, [][]byte{blkBytes1}))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())

require.NoError(bs.ForceAccepted(context.Background(), []ids.ID{blkID2}))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
}

// There are multiple needed blocks and Ancestors returns one at a time
Expand Down Expand Up @@ -554,10 +550,13 @@ func TestBootstrapperPartialFetch(t *testing.T) {
require.NoError(bs.Ancestors(context.Background(), peerID, *requestID, [][]byte{blkBytes1})) // respond with blk1
require.Equal(blkID1, requested)

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())

require.NoError(bs.ForceAccepted(context.Background(), acceptedIDs))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
}

// There are multiple needed blocks and some validators do not have all the blocks
Expand Down Expand Up @@ -714,7 +713,7 @@ func TestBootstrapperEmptyResponse(t *testing.T) {

require.NoError(bs.Ancestors(context.Background(), requestedVdr, requestID, [][]byte{blkBytes1})) // respond with blk1

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())
Expand Down Expand Up @@ -856,10 +855,13 @@ func TestBootstrapperAncestors(t *testing.T) {
require.NoError(bs.Ancestors(context.Background(), peerID, *requestID, [][]byte{blkBytes2, blkBytes1})) // respond with blk2 and blk1
require.Equal(blkID2, requested)

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())

require.NoError(bs.ForceAccepted(context.Background(), acceptedIDs))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
}

func TestBootstrapperFinalized(t *testing.T) {
Expand Down Expand Up @@ -976,10 +978,13 @@ func TestBootstrapperFinalized(t *testing.T) {

require.NoError(bs.Ancestors(context.Background(), peerID, reqIDBlk2, [][]byte{blkBytes2, blkBytes1}))

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())

require.NoError(bs.ForceAccepted(context.Background(), []ids.ID{blkID2}))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
}

func TestRestartBootstrapping(t *testing.T) {
Expand Down Expand Up @@ -1156,12 +1161,15 @@ func TestRestartBootstrapping(t *testing.T) {

require.NoError(bs.Ancestors(context.Background(), peerID, blk4RequestID, [][]byte{blkBytes4}))

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())
require.Equal(choices.Accepted, blk3.Status())
require.Equal(choices.Accepted, blk4.Status())

require.NoError(bs.ForceAccepted(context.Background(), []ids.ID{blkID4}))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe a small comment to easy up UTs maintenance?
Something like that we confirm the latest downlaoded block twice to make sure bootstrap is marked as done.
It could even be made once at the top of this test file

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the comment to the actual bootstrapper struct

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
}

func TestBootstrapOldBlockAfterStateSync(t *testing.T) {
Expand Down
Loading
Loading